Linux 核心使用 time-shared 的排程方法,並且由 Timer Interrupt 觸發。
每當計時器中斷發生時,核心會呼叫 scheduler_tick() 來重新分配 CPU 的使用權,
因此所有的行程是以分時多工的方式被執行,計時器的中斷時間間隔稱為時間片段(time-slice)。
執行程序在 Linux 中被分成兩類:
Active - 尚未用完時間片段的行程
Expired - 已經用完時間片段的行程
可執行的行程會被存放在這兩種佇列之中,而核心的排程器是從 Active 佇列的最高優先等級的程序
選出一個行程來執行,此動作為呼叫 schedule()。當行程的時間片段用完時,便會被移至 Expired 佇列。
執行佇列的優先權有 140 個級別,數字愈小表示其優先次序愈高,
0 到 99 為 real-time 行程的等級,100 到 139 為一般行程的等級(即 User mode 程式)。
上圖是一個核心排程的例子,每次 scheduler_tick() 被呼叫時,核心會取得 CPU 的使用權,
並將目前正在執行的行程的 time-slice 減一,當行程的 time-slice 用完時,核心呼叫 schedule() 挑選另一個行程來執行。
若是執行中的行程因等待某些事件而主動交出 CPU 使用權時,也會經由呼叫 schedule() 切換執行權至另一個行程。
定義於 kernel/sched.c
/*
* This function gets called by the timer code, with HZ frequency.
* We call it with interrupts disabled.
*
* It also gets called by the fork code, when changing the parent's
* timeslices.
*/
void scheduler_tick(void)
{
int cpu = smp_processor_id();
runqueue_t *rq = this_rq();
task_t *p = current;
rq->timestamp_last_tick = sched_clock();
if (p == rq->idle) {
if (wake_priority_sleeper(rq))
goto out;
rebalance_tick(cpu, rq, SCHED_IDLE);
return;
}
/* Task might have expired already, but not scheduled off yet */
if (p->array != rq->active) {
set_tsk_need_resched(p);
goto out;
}
spin_lock(&rq->lock);
/*
* The task was running during this tick - update the
* time slice counter. Note: we do not update a thread's
* priority until it either goes to sleep or uses up its
* timeslice. This makes it possible for interactive tasks
* to use up their timeslices at their highest priority levels.
*/
if (rt_task(p)) {
/*
* RR tasks need a special form of timeslice management.
* FIFO tasks have no timeslices.
*/
if ((p->policy == SCHED_RR) && !--p->time_slice) {
p->time_slice = task_timeslice(p);
p->first_time_slice = 0;
set_tsk_need_resched(p);
/* put it at the end of the queue: */
requeue_task(p, rq->active);
}
goto out_unlock;
}
if (!--p->time_slice) {
dequeue_task(p, rq->active);
set_tsk_need_resched(p);
p->prio = effective_prio(p);
p->time_slice = task_timeslice(p);
p->first_time_slice = 0;
if (!rq->expired_timestamp)
rq->expired_timestamp = jiffies;
if (!TASK_INTERACTIVE(p) EXPIRED_STARVING(rq)) {
enqueue_task(p, rq->expired);
if (p->static_prio < rq->best_expired_prio)
rq->best_expired_prio = p->static_prio;
} else
enqueue_task(p, rq->active);
} else {
/*
* Prevent a too long timeslice allowing a task to monopolize
* the CPU. We do this by splitting up the timeslice into
* smaller pieces.
*
* Note: this does not mean the task's timeslices expire or
* get lost in any way, they just might be preempted by
* another task of equal priority. (one with higher
* priority would have preempted this task already.) We
* requeue this task to the end of the list on this priority
* level, which is in essence a round-robin of tasks with
* equal priority.
*
* This only applies to tasks in the interactive
* delta range with at least TIMESLICE_GRANULARITY to requeue.
*/
if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
(p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
(p->array == rq->active)) {
requeue_task(p, rq->active);
set_tsk_need_resched(p);
}
}
out_unlock:
spin_unlock(&rq->lock);
out:
rebalance_tick(cpu, rq, NOT_IDLE);
}
/*
* schedule() is the main scheduler function.
*/
asmlinkage void __sched schedule(void)
{
long *switch_count;
task_t *prev, *next;
runqueue_t *rq;
prio_array_t *array;
struct list_head *queue;
unsigned long long now;
unsigned long run_time;
int cpu, idx;
/*
* Test if we are atomic. Since do_exit() needs to call into
* schedule() atomically, we ignore that path for now.
* Otherwise, whine if we are scheduling when we should not be.
*/
if (likely(!current->exit_state)) {
if (unlikely(in_atomic())) {
printk(KERN_ERR "scheduling while atomic: "
"%s/0x%08x/%d\n",
current->comm, preempt_count(), current->pid);
dump_stack();
}
}
profile_hit(SCHED_PROFILING, __builtin_return_address(0));
need_resched:
preempt_disable();
prev = current;
release_kernel_lock(prev);
need_resched_nonpreemptible:
rq = this_rq();
/*
* The idle thread is not allowed to schedule!
* Remove this check after it has been exercised a bit.
*/
if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) {
printk(KERN_ERR "bad: scheduling from the idle thread!\n");
dump_stack();
}
schedstat_inc(rq, sched_cnt);
now = sched_clock();
if (likely(now - prev->timestamp < NS_MAX_SLEEP_AVG)) run_time = now - prev->timestamp;
else
run_time = NS_MAX_SLEEP_AVG;
/*
* Tasks charged proportionately less run_time at high sleep_avg to
* delay them losing their interactive status
*/
run_time /= (CURRENT_BONUS(prev) ? : 1);
spin_lock_irq(&rq->lock);
if (unlikely(prev->flags & PF_DEAD))
prev->state = EXIT_DEAD;
switch_count = &prev->nivcsw;
if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
switch_count = &prev->nvcsw;
if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
unlikely(signal_pending(prev))))
prev->state = TASK_RUNNING;
else {
if (prev->state == TASK_UNINTERRUPTIBLE)
rq->nr_uninterruptible++;
deactivate_task(prev, rq);
}
}
cpu = smp_processor_id();
if (unlikely(!rq->nr_running)) {
go_idle:
idle_balance(cpu, rq);
if (!rq->nr_running) {
next = rq->idle;
rq->expired_timestamp = 0;
wake_sleeping_dependent(cpu, rq);
/*
* wake_sleeping_dependent() might have released
* the runqueue, so break out if we got new
* tasks meanwhile:
*/
if (!rq->nr_running)
goto switch_tasks;
}
} else {
if (dependent_sleeper(cpu, rq)) {
next = rq->idle;
goto switch_tasks;
}
/*
* dependent_sleeper() releases and reacquires the runqueue
* lock, hence go into the idle loop if the rq went
* empty meanwhile:
*/
if (unlikely(!rq->nr_running))
goto go_idle;
}
array = rq->active;
if (unlikely(!array->nr_active)) {
/*
* Switch the active and expired arrays.
*/
schedstat_inc(rq, sched_switch);
rq->active = rq->expired;
rq->expired = array;
array = rq->active;
rq->expired_timestamp = 0;
rq->best_expired_prio = MAX_PRIO;
} else
schedstat_inc(rq, sched_noswitch);
idx = sched_find_first_bit(array->bitmap);
queue = array->queue + idx;
next = list_entry(queue->next, task_t, run_list);
if (!rt_task(next) && next->activated > 0) {
unsigned long long delta = now - next->timestamp;
if (next->activated == 1)
delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
array = next->array;
dequeue_task(next, array);
recalc_task_prio(next, next->timestamp + delta);
enqueue_task(next, array);
}
next->activated = 0;
switch_tasks:
if (next == rq->idle)
schedstat_inc(rq, sched_goidle);
prefetch(next);
clear_tsk_need_resched(prev);
rcu_qsctr_inc(task_cpu(prev));
prev->sleep_avg -= run_time;
if ((long)prev->sleep_avg <= 0) prev->sleep_avg = 0;
prev->timestamp = prev->last_ran = now;
sched_info_switch(prev, next);
if (likely(prev != next)) {
next->timestamp = now;
rq->nr_switches++;
rq->curr = next;
++*switch_count;
prepare_arch_switch(rq, next);
prev = context_switch(rq, prev, next);
barrier();
finish_task_switch(prev);
} else
spin_unlock_irq(&rq->lock);
prev = current;
if (unlikely(reacquire_kernel_lock(prev) < 0))
goto need_resched_nonpreemptible;
preempt_enable_no_resched();
if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
goto need_resched;
}
2011年5月5日 星期四
訂閱:
張貼留言 (Atom)
沒有留言:
張貼留言