Parrot's Note Book: Process Scheduling

Linux 核心使用 time-shared 的排程方法，並且由 Timer Interrupt 觸發。
每當計時器中斷發生時，核心會呼叫 scheduler_tick() 來重新分配 CPU 的使用權，
因此所有的行程是以分時多工的方式被執行，計時器的中斷時間間隔稱為時間片段(time-slice)。

執行程序在 Linux 中被分成兩類：

Active - 尚未用完時間片段的行程
Expired - 已經用完時間片段的行程

可執行的行程會被存放在這兩種佇列之中，而核心的排程器是從 Active 佇列的最高優先等級的程序
選出一個行程來執行，此動作為呼叫 schedule()。當行程的時間片段用完時，便會被移至 Expired 佇列。

執行佇列的優先權有 140 個級別，數字愈小表示其優先次序愈高，
0 到 99 為 real-time 行程的等級，100 到 139 為一般行程的等級(即 User mode 程式)。

上圖是一個核心排程的例子，每次 scheduler_tick() 被呼叫時，核心會取得 CPU 的使用權，
並將目前正在執行的行程的 time-slice 減一，當行程的 time-slice 用完時，核心呼叫 schedule() 挑選另一個行程來執行。
若是執行中的行程因等待某些事件而主動交出 CPU 使用權時，也會經由呼叫 schedule() 切換執行權至另一個行程。

定義於 kernel/sched.c

/*
* This function gets called by the timer code, with HZ frequency.
* We call it with interrupts disabled.
*
* It also gets called by the fork code, when changing the parent's
* timeslices.
*/
void scheduler_tick(void)
{
　　int cpu = smp_processor_id();
　　runqueue_t *rq = this_rq();
　　task_t *p = current;

　　rq->timestamp_last_tick = sched_clock();

　　if (p == rq->idle) {
　　　　if (wake_priority_sleeper(rq))
　　　　　　goto out;
　　　　rebalance_tick(cpu, rq, SCHED_IDLE);
　　　　return;
　　}

　　/* Task might have expired already, but not scheduled off yet */
　　if (p->array != rq->active) {
　　　　set_tsk_need_resched(p);
　　　　goto out;
　　}
　　spin_lock(&rq->lock);
　　/*
　　 * The task was running during this tick - update the
　　 * time slice counter. Note: we do not update a thread's
　　 * priority until it either goes to sleep or uses up its
　　 * timeslice. This makes it possible for interactive tasks
　　 * to use up their timeslices at their highest priority levels.
　　 */
　　if (rt_task(p)) {
　　　　/*
　　　　 * RR tasks need a special form of timeslice management.
　　　　 * FIFO tasks have no timeslices.
　　　　 */
　　　　if ((p->policy == SCHED_RR) && !--p->time_slice) {
　　　　　　p->time_slice = task_timeslice(p);
　　　　　　p->first_time_slice = 0;
　　　　　　set_tsk_need_resched(p);

　　　　　　/* put it at the end of the queue: */
　　　　　　requeue_task(p, rq->active);
　　　　}
　　　　goto out_unlock;
　　}
　　if (!--p->time_slice) {
　　　　dequeue_task(p, rq->active);
　　　　set_tsk_need_resched(p);
　　　　p->prio = effective_prio(p);
　　　　p->time_slice = task_timeslice(p);
　　　　p->first_time_slice = 0;

　　　　if (!rq->expired_timestamp)
　　　　　　rq->expired_timestamp = jiffies;
　　　　if (!TASK_INTERACTIVE(p) EXPIRED_STARVING(rq)) {
　　　　　　enqueue_task(p, rq->expired);
　　　　　　if (p->static_prio < rq->best_expired_prio)
　　　　　　　　rq->best_expired_prio = p->static_prio;
　　　　} else
　　　　　　enqueue_task(p, rq->active);
　　} else {
　　　　/*
　　　　 * Prevent a too long timeslice allowing a task to monopolize
　　　　 * the CPU. We do this by splitting up the timeslice into
　　　　 * smaller pieces.
　　　　 *
　　　　 * Note: this does not mean the task's timeslices expire or
　　　　 * get lost in any way, they just might be preempted by
　　　　 * another task of equal priority. (one with higher
　　　　 * priority would have preempted this task already.) We
　　　　 * requeue this task to the end of the list on this priority
　　　　 * level, which is in essence a round-robin of tasks with
　　　　 * equal priority.
　　　　 *
　　　　 * This only applies to tasks in the interactive
　　　　 * delta range with at least TIMESLICE_GRANULARITY to requeue.
　　　　 */
　　　　if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
　　　　　　p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
　　　　　　(p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
　　　　　　(p->array == rq->active)) {

　　　　　　requeue_task(p, rq->active);
　　　　　　set_tsk_need_resched(p);
　　　　}
　　}
out_unlock:
　　spin_unlock(&rq->lock);
out:
　　rebalance_tick(cpu, rq, NOT_IDLE);
}

/*
* schedule() is the main scheduler function.
*/
asmlinkage void __sched schedule(void)
{
　　long *switch_count;
　　task_t *prev, *next;
　　runqueue_t *rq;
　　prio_array_t *array;
　　struct list_head *queue;
　　unsigned long long now;
　　unsigned long run_time;
　　int cpu, idx;

　　/*
　　 * Test if we are atomic. Since do_exit() needs to call into
　　 * schedule() atomically, we ignore that path for now.
　　 * Otherwise, whine if we are scheduling when we should not be.
　　 */
　　if (likely(!current->exit_state)) {
　　　　if (unlikely(in_atomic())) {
　　　　　　printk(KERN_ERR "scheduling while atomic: "
　　　　　　　　"%s/0x%08x/%d\n",
　　　　　　　　current->comm, preempt_count(), current->pid);
　　　　　　dump_stack();
　　　　}
　　}
　　profile_hit(SCHED_PROFILING, __builtin_return_address(0));

need_resched:
　　preempt_disable();
　　prev = current;
　　release_kernel_lock(prev);
need_resched_nonpreemptible:
　　rq = this_rq();

　　/*
　　 * The idle thread is not allowed to schedule!
　　 * Remove this check after it has been exercised a bit.
　　 */
　　if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) {
　　　　printk(KERN_ERR "bad: scheduling from the idle thread!\n");
　　　　dump_stack();
　　}

　　schedstat_inc(rq, sched_cnt);
　　now = sched_clock();
　　if (likely(now - prev->timestamp < NS_MAX_SLEEP_AVG)) 　　　　run_time = now - prev->timestamp;
　　else
　　　　run_time = NS_MAX_SLEEP_AVG;

　　/*
　　 * Tasks charged proportionately less run_time at high sleep_avg to
　　 * delay them losing their interactive status
　　 */
　　run_time /= (CURRENT_BONUS(prev) ? : 1);

　　spin_lock_irq(&rq->lock);

　　if (unlikely(prev->flags & PF_DEAD))
　　　　prev->state = EXIT_DEAD;

　　switch_count = &prev->nivcsw;
　　if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
　　　　switch_count = &prev->nvcsw;
　　　　if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
　　　　　　　　unlikely(signal_pending(prev))))
　　　　　　prev->state = TASK_RUNNING;
　　　　else {
　　　　　　if (prev->state == TASK_UNINTERRUPTIBLE)
　　　　　　　　rq->nr_uninterruptible++;
　　　　　　deactivate_task(prev, rq);
　　　　}
　　}

　　cpu = smp_processor_id();
　　if (unlikely(!rq->nr_running)) {
go_idle:
　　　　idle_balance(cpu, rq);
　　　　if (!rq->nr_running) {
　　　　　　next = rq->idle;
　　　　　　rq->expired_timestamp = 0;
　　　　　　wake_sleeping_dependent(cpu, rq);
　　　　　　/*
　　　　　　 * wake_sleeping_dependent() might have released
　　　　　　 * the runqueue, so break out if we got new
　　　　　　 * tasks meanwhile:
　　　　　　 */
　　　　　　if (!rq->nr_running)
　　　　　　　　goto switch_tasks;
　　　　}
　　} else {
　　　　if (dependent_sleeper(cpu, rq)) {
　　　　　　next = rq->idle;
　　　　　　goto switch_tasks;
　　　　}
　　　　/*
　　　　 * dependent_sleeper() releases and reacquires the runqueue
　　　　 * lock, hence go into the idle loop if the rq went
　　　　 * empty meanwhile:
　　　　 */
　　　　if (unlikely(!rq->nr_running))
　　　　　　goto go_idle;
　　}

　　array = rq->active;
　　if (unlikely(!array->nr_active)) {
　　　　/*
　　　　 * Switch the active and expired arrays.
　　　　 */
　　　　schedstat_inc(rq, sched_switch);
　　　　rq->active = rq->expired;
　　　　rq->expired = array;
　　　　array = rq->active;
　　　　rq->expired_timestamp = 0;
　　　　rq->best_expired_prio = MAX_PRIO;
　　} else
　　　　schedstat_inc(rq, sched_noswitch);

　　idx = sched_find_first_bit(array->bitmap);
　　queue = array->queue + idx;
　　next = list_entry(queue->next, task_t, run_list);

　　if (!rt_task(next) && next->activated > 0) {
　　　　unsigned long long delta = now - next->timestamp;

　　　　if (next->activated == 1)
　　　　　　delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;

　　　　array = next->array;
　　　　dequeue_task(next, array);
　　　　recalc_task_prio(next, next->timestamp + delta);
　　　　enqueue_task(next, array);
　　}
　　next->activated = 0;
switch_tasks:
　　if (next == rq->idle)
　　　　schedstat_inc(rq, sched_goidle);
　　prefetch(next);
　　clear_tsk_need_resched(prev);
　　rcu_qsctr_inc(task_cpu(prev));

　　prev->sleep_avg -= run_time;
　　if ((long)prev->sleep_avg <= 0) 　　　　prev->sleep_avg = 0;
　　prev->timestamp = prev->last_ran = now;

　　sched_info_switch(prev, next);
　　if (likely(prev != next)) {
　　　　next->timestamp = now;
　　　　rq->nr_switches++;
　　　　rq->curr = next;
　　　　++*switch_count;

　　　　prepare_arch_switch(rq, next);
　　　　prev = context_switch(rq, prev, next);
　　　　barrier();

　　　　finish_task_switch(prev);
　　} else
　　　　spin_unlock_irq(&rq->lock);

　　prev = current;
　　if (unlikely(reacquire_kernel_lock(prev) < 0))
　　　　goto need_resched_nonpreemptible;
　　preempt_enable_no_resched();
　　if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
　　　　goto need_resched;
}

Parrot's Note Book

2011年5月5日星期四

Process Scheduling - 1

沒有留言:

關於我自己

標籤

網誌存檔