Sunday, August 7, 2011

Process Scheduling - 4

Normal processes have priority values from 100 (highest priority) to 139 (lowest priority). A process is assigned an initial priority when it is created, called its static priority; during execution the scheduler then raises or lowers this value dynamically, giving the process its dynamic priority.

Real-time processes use the priority range 0 (highest priority) to 99 (lowest priority); these values are also referred to as real-time priorities.

Static priority
The static priority determines a process's base time quantum, i.e. how much CPU time it receives: the higher the priority (the smaller the number), the longer the time slice.
A process inherits its static priority from its parent, and it can be changed by setting the nice value.
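As a quick user-space illustration of how the static priority is inherited and adjusted with nice (a minimal sketch using the standard fork(), nice() and getpriority() calls; the printed values are nice values, which map onto static priorities 100-139 through the NICE_TO_PRIO macro shown below):

/* Sketch: a child inherits the parent's nice value (and hence its static
 * priority) across fork(), and can lower its own priority with nice(). */
#include <stdio.h>
#include <unistd.h>
#include <sys/resource.h>
#include <sys/wait.h>

int main(void)
{
  printf("parent nice = %d\n", getpriority(PRIO_PROCESS, 0));

  if (fork() == 0) {
    /* inherited from the parent at fork() */
    printf("child  nice = %d\n", getpriority(PRIO_PROCESS, 0));
    nice(5);  /* e.g. static priority 120 -> 125 if the parent ran at nice 0 */
    printf("child  nice = %d after nice(5)\n", getpriority(PRIO_PROCESS, 0));
    _exit(0);
  }
  wait(NULL);
  return 0;
}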


Dynamic priority
The dynamic priority is what the scheduler actually uses when choosing which process to run; it is adjusted during execution according to a bonus value.
The bonus is a number from 0 to 10: a value below 5 is a penalty that lowers the process's priority, while a value above 5 is a premium that raises it.
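The adjustment can be written out as a small helper mirroring effective_prio() further down (a sketch only; the bonus argument corresponds to CURRENT_BONUS(p), which the kernel derives from the task's sleep_avg and is not reproduced here):

/* Sketch of the dynamic-priority calculation done by effective_prio():
 * bonus is in the range 0..10, so (bonus - 5) is a -5..+5 adjustment,
 * and the result is clamped to the normal-process range 100..139. */
static int dynamic_prio(int static_prio, int bonus)
{
  int prio = static_prio - (bonus - 5);  /* bonus > 5 raises the priority */

  if (prio < 100)      /* MAX_RT_PRIO */
    prio = 100;
  if (prio > 139)      /* MAX_PRIO - 1 */
    prio = 139;
  return prio;
}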

The following definitions list the parameters related to process priority:


Defined in include/linux/sched.h

/*
* Priority of a process goes from 0..MAX_PRIO-1, valid RT
* priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL tasks are
* in the range MAX_RT_PRIO..MAX_PRIO-1. Priority values
* are inverted: lower p->prio value means higher priority.
*
* The MAX_USER_RT_PRIO value allows the actual maximum
* RT priority to be separate from the value exported to
* user-space. This allows kernel threads to set their
* priority to a value higher than any user task. Note:
* MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO.
*/

#define MAX_USER_RT_PRIO  100
#define MAX_RT_PRIO      MAX_USER_RT_PRIO

#define MAX_PRIO       (MAX_RT_PRIO + 40)

#define rt_task(p)        (unlikely((p)->prio < MAX_RT_PRIO))



Defined in kernel/sched.c

/*
* Convert user-nice values [ -20 ... 0 ... 19 ]
* to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
* and back.
*/
#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
#define TASK_NICE(p)    PRIO_TO_NICE((p)->static_prio)

/*
* 'User priority' is the nice value converted to something we
* can work with better when scaling various scheduler parameters,
* it's a [ 0 ... 39 ] range.
*/
#define USER_PRIO(p)    ((p)-MAX_RT_PRIO)
#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
#define MAX_USER_PRIO  (USER_PRIO(MAX_PRIO))

/*
* task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
* to time slice values: [800ms ... 100ms ... 5ms]
*
* The higher a thread's priority, the bigger timeslices
* it gets during one round of execution. But even the lowest
* priority thread gets MIN_TIMESLICE worth of execution time.
*/

#define SCALE_PRIO(x, prio) \
 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE)

static unsigned int task_timeslice(task_t *p)
{
 if (p->static_prio < NICE_TO_PRIO(0))
  return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio);
 else
  return SCALE_PRIO(DEF_TIMESLICE, p->static_prio);
}
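To see the resulting numbers, the same formula can be replayed in user space (a sketch that assumes HZ = 1000, so DEF_TIMESLICE is 100 ticks and MIN_TIMESLICE is 5 ticks, i.e. milliseconds here):

/* Sketch: reproduce task_timeslice() for a given nice value, assuming
 * HZ = 1000 so that one tick equals one millisecond. */
#include <stdio.h>

#define MAX_RT_PRIO     100
#define MAX_PRIO        (MAX_RT_PRIO + 40)
#define MAX_USER_PRIO   (MAX_PRIO - MAX_RT_PRIO)   /* 40 */
#define MIN_TIMESLICE   5                          /* ms, assuming HZ = 1000 */
#define DEF_TIMESLICE   100                        /* ms, assuming HZ = 1000 */
#define NICE_TO_PRIO(n) (MAX_RT_PRIO + (n) + 20)

static int scale_prio(int x, int prio)
{
  int t = x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2);
  return t > MIN_TIMESLICE ? t : MIN_TIMESLICE;
}

static int timeslice(int nice)
{
  int prio = NICE_TO_PRIO(nice);
  if (prio < NICE_TO_PRIO(0))
    return scale_prio(DEF_TIMESLICE * 4, prio);
  return scale_prio(DEF_TIMESLICE, prio);
}

int main(void)
{
  /* prints 800, 100 and 5 ms, matching the comment above task_timeslice() */
  printf("nice -20: %d ms\n", timeslice(-20));
  printf("nice   0: %d ms\n", timeslice(0));
  printf("nice  19: %d ms\n", timeslice(19));
  return 0;
}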

/*
* effective_prio - return the priority that is based on the static
* priority but is modified by bonuses/penalties.
*
* We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
* into the -5 ... 0 ... +5 bonus/penalty range.
*
* We use 25% of the full 0...39 priority range so that:
*
* 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
* 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
*
* Both properties are important to certain workloads.
*/
static int effective_prio(task_t *p)
{
 int bonus, prio;

 if (rt_task(p))
  return p->prio;

 bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;

 prio = p->static_prio - bonus;
 if (prio < MAX_RT_PRIO)
  prio = MAX_RT_PRIO;
 if (prio > MAX_PRIO-1)
  prio = MAX_PRIO-1;
 return prio;
}

/**
* task_prio - return the priority value of a given task.
* @p: the task in question.
*
* This is the priority value as seen by users in /proc.
* RT tasks are offset by -200. Normal tasks are centered
* around 0, value goes from -16 to +15.
*/
int task_prio(const task_t *p)
{
 return p->prio - MAX_RT_PRIO;
}

/**
* task_nice - return the nice value of a given task.
* @p: the task in question.
*/
int task_nice(const task_t *p)
{
 return TASK_NICE(p);
}


Saturday, June 11, 2011

Process Scheduling - 3

• wait queue

The counterpart of the run queue is the wait queue, a queue that holds sleeping processes.
It is made up of a doubly linked list of wait_queue_t elements, with a list head of type wait_queue_head_t.


When the flags field of a wait_queue_t is 1 (WQ_FLAG_EXCLUSIVE), the sleeping process is an exclusive process: it is waiting for a resource that only one waiter can use at a time, so the kernel wakes exclusive processes selectively, one per wakeup.
When flags is 0, the process is nonexclusive, and the kernel wakes all nonexclusive processes whenever the event they are waiting for occurs.

The func field is set to default_wake_function() at initialization time; this is the default function used to wake up the process sitting on the wait queue.
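At a lower level, a process can add itself to a wait queue as an exclusive waiter with the following kind of code (a minimal sketch against the 2.6-era API quoted in this post; demo_wq and demo_ready are hypothetical names, and init_waitqueue_head(&demo_wq) is assumed to have been called):

#include <linux/wait.h>
#include <linux/sched.h>

static wait_queue_head_t demo_wq;   /* assumed initialized elsewhere */
static int demo_ready;              /* the condition being waited for */

/* Sketch: sleep as an exclusive waiter, so a plain wake_up() wakes at
 * most one such waiter at a time. */
static void demo_wait_exclusive(void)
{
  DECLARE_WAITQUEUE(wait, current);           /* flags = 0, func = default_wake_function */

  add_wait_queue_exclusive(&demo_wq, &wait);  /* sets WQ_FLAG_EXCLUSIVE, queues at the tail */
  for (;;) {
    set_current_state(TASK_INTERRUPTIBLE);
    if (demo_ready || signal_pending(current))
      break;
    schedule();                               /* give up the CPU until woken */
  }
  set_current_state(TASK_RUNNING);
  remove_wait_queue(&demo_wq, &wait);
}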


A wait queue head can be declared and initialized statically:

DECLARE_WAIT_QUEUE_HEAD( wait_head );

or initialized dynamically:

wait_queue_head_t wait_head;

init_waitqueue_head( &wait_head );

To put a process to sleep, call one of the following macros; they add the process to a wait queue and relinquish the CPU.

wait_event(wait_head, condition)

wait_event_interruptible(wait_head, condition)

wait_event_timeout(wait_head, condition, timeout)

wait_event_interruptible_timeout(wait_head, condition, timeout)

To wake up sleeping processes, call one of the following macros; the woken processes are put back on the run queue.

wake_up( wait_head_pt )

wake_up_interruptible( wait_head_pt )
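
A typical driver-style use of these macros might look like the following sketch (demo_queue, demo_flag and the two functions are hypothetical names):

#include <linux/wait.h>
#include <linux/sched.h>

static DECLARE_WAIT_QUEUE_HEAD(demo_queue);
static int demo_flag;

/* Sketch: the consumer sleeps until demo_flag becomes non-zero. Returns
 * -ERESTARTSYS if interrupted by a signal, 0 otherwise. */
static int demo_consumer(void)
{
  return wait_event_interruptible(demo_queue, demo_flag != 0);
}

/* Sketch: the producer makes the condition true first, then puts any
 * sleepers back on the run queue. */
static void demo_producer(void)
{
  demo_flag = 1;
  wake_up_interruptible(&demo_queue);
}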


Defined in include/linux/wait.h

typedef struct __wait_queue wait_queue_t;
typedef int (*wait_queue_func_t)(wait_queue_t *wait, unsigned mode, int sync, void *key);
int default_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key);

struct __wait_queue {
  unsigned int flags;
#define WQ_FLAG_EXCLUSIVE 0x01
  struct task_struct * task;
  wait_queue_func_t func;
  struct list_head task_list;
};

struct wait_bit_key {
  void *flags;
  int bit_nr;
};

struct wait_bit_queue {
  struct wait_bit_key key;
  wait_queue_t wait;
};

struct __wait_queue_head {
  spinlock_t lock;
  struct list_head task_list;
};
typedef struct __wait_queue_head wait_queue_head_t;

#define __wait_event(wq, condition)               \
do {                              \
  DEFINE_WAIT(__wait);                   \
                                \
  for (;;) {                          \
    prepare_to_wait(&wq, &__wait, TASK_UNINTERRUPTIBLE); \
    if (condition)                       \
      break;                       \
    schedule();                       \
  }                             \
  finish_wait(&wq, &__wait);                  \
} while (0)

#define wait_event(wq, condition)                 \
do {                              \
  if (condition)                        \
    break;                         \
  __wait_event(wq, condition);                 \
} while (0)


void FASTCALL(__wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key));

#define wake_up(x)          __wake_up(x, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1, NULL)
#define wake_up_interruptible(x)  __wake_up(x, TASK_INTERRUPTIBLE, 1, NULL)


Defined in kernel/wait.c

/*
* Note: we use "set_current_state()" _after_ the wait-queue add,
* because we need a memory barrier there on SMP, so that any
* wake-function that tests for the wait-queue being active
* will be guaranteed to see waitqueue addition _or_ subsequent
* tests in this thread will see the wakeup having taken place.
*
* The spin_unlock() itself is semi-permeable and only protects
* one way (it only protects stuff inside the critical region and
* stops them from bleeding out - it would still allow subsequent
* loads to move into the critical region).
*/
void fastcall
prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state)
{
  unsigned long flags;

  wait->flags &= ~WQ_FLAG_EXCLUSIVE;
  spin_lock_irqsave(&q->lock, flags);
  if (list_empty(&wait->task_list))
    __add_wait_queue(q, wait);
  /*
   * don't alter the task state if this is just going to
   * queue an async wait queue callback
   */
  if (is_sync_wait(wait))
    set_current_state(state);
  spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL(prepare_to_wait);


Defined in kernel/sched.c

/*
* The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
* wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
* number) then we wake all the non-exclusive tasks and one exclusive task.
*
* There are circumstances in which we can try to wake a task which has already
* started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
* zero in this (rare) case, and we handle it by continuing to scan the queue.
*/
static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
               int nr_exclusive, int sync, void *key)
{
  struct list_head *tmp, *next;

  list_for_each_safe(tmp, next, &q->task_list) {
    wait_queue_t *curr;
    unsigned flags;
    curr = list_entry(tmp, wait_queue_t, task_list);
    flags = curr->flags;
    if (curr->func(curr, mode, sync, key) &&
     (flags & WQ_FLAG_EXCLUSIVE) &&
     !--nr_exclusive)
      break;
  }
}

/**
* __wake_up - wake up threads blocked on a waitqueue.
* @q: the waitqueue
* @mode: which threads
* @nr_exclusive: how many wake-one or wake-many threads to wake up
*/
void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
            int nr_exclusive, void *key)
{
  unsigned long flags;

  spin_lock_irqsave(&q->lock, flags);
  __wake_up_common(q, mode, nr_exclusive, 0, key);
  spin_unlock_irqrestore(&q->lock, flags);
}

EXPORT_SYMBOL(__wake_up);

/***
* try_to_wake_up - wake up a thread
* @p: the to-be-woken-up thread
* @state: the mask of task states that can be woken
* @sync: do a synchronous wakeup?
*
* Put it on the run-queue if it's not already there. The "current"
* thread is always on the run-queue (except when the actual
* re-schedule is in progress), and as such you're allowed to do
* the simpler "current->state = TASK_RUNNING" to mark yourself
* runnable without the overhead of this.
*
* returns failure only if the task is already active.
*/
static int try_to_wake_up(task_t *p, unsigned int state, int sync)
{
  int cpu, this_cpu, success = 0;
  unsigned long flags;
  long old_state;
  runqueue_t *rq;
#ifdef CONFIG_SMP
  unsigned long load, this_load;
  struct sched_domain *sd;
  int new_cpu;
#endif

  rq = task_rq_lock(p, &flags);
  schedstat_inc(rq, ttwu_cnt);
  old_state = p->state;
  if (!(old_state & state))
    goto out;

  if (p->array)
    goto out_running;

  cpu = task_cpu(p);
  this_cpu = smp_processor_id();

#ifdef CONFIG_SMP
  if (unlikely(task_running(rq, p)))
    goto out_activate;

  new_cpu = cpu;

  if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
    goto out_set_cpu;

  load = source_load(cpu);
  this_load = target_load(this_cpu);

  /*
   * If sync wakeup then subtract the (maximum possible) effect of
   * the currently running task from the load of the current CPU:
   */
  if (sync)
    this_load -= SCHED_LOAD_SCALE;

  /* Don't pull the task off an idle CPU to a busy one */
  if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2)
    goto out_set_cpu;

  new_cpu = this_cpu; /* Wake to this CPU if we can */

  /*
   * Scan domains for affine wakeup and passive balancing
   * possibilities.
   */
  for_each_domain(this_cpu, sd) {
    unsigned int imbalance;
    /*
     * Start passive balancing when half the imbalance_pct
     * limit is reached.
     */
    imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2;

    if ((sd->flags & SD_WAKE_AFFINE) &&
        !task_hot(p, rq->timestamp_last_tick, sd)) {
      /*
       * This domain has SD_WAKE_AFFINE and p is cache cold
       * in this domain.
       */
      if (cpu_isset(cpu, sd->span)) {
        schedstat_inc(sd, ttwu_wake_affine);
        goto out_set_cpu;
      }
    } else if ((sd->flags & SD_WAKE_BALANCE) &&
        imbalance*this_load <= 100*load) {
      /*
       * This domain has SD_WAKE_BALANCE and there is
       * an imbalance.
       */
      if (cpu_isset(cpu, sd->span)) {
        schedstat_inc(sd, ttwu_wake_balance);
        goto out_set_cpu;
      }
    }
  }

  new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
out_set_cpu:
  schedstat_inc(rq, ttwu_attempts);
  new_cpu = wake_idle(new_cpu, p);
  if (new_cpu != cpu) {
    schedstat_inc(rq, ttwu_moved);
    set_task_cpu(p, new_cpu);
    task_rq_unlock(rq, &flags);
    /* might preempt at this point */
    rq = task_rq_lock(p, &flags);
    old_state = p->state;
    if (!(old_state & state))
      goto out;
    if (p->array)
      goto out_running;

    this_cpu = smp_processor_id();
    cpu = task_cpu(p);
  }

out_activate:
#endif /* CONFIG_SMP */
  if (old_state == TASK_UNINTERRUPTIBLE) {
    rq->nr_uninterruptible--;
    /*
     * Tasks on involuntary sleep don't earn
     * sleep_avg beyond just interactive state.
     */
    p->activated = -1;
  }

  /*
   * Sync wakeups (i.e. those types of wakeups where the waker
   * has indicated that it will leave the CPU in short order)
   * don't trigger a preemption, if the woken up task will run on
   * this cpu. (in this case the 'I will reschedule' promise of
   * the waker guarantees that the freshly woken up task is going
   * to be considered on this CPU.)
   */
  activate_task(p, rq, cpu == this_cpu);
  if (!sync || cpu != this_cpu) {
    if (TASK_PREEMPTS_CURR(p, rq))
      resched_task(rq->curr);
  }
  success = 1;

out_running:
  p->state = TASK_RUNNING;
out:
  task_rq_unlock(rq, &flags);

  return success;
}

int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key)
{
  task_t *p = curr->task;
  return try_to_wake_up(p, mode, sync);
}

EXPORT_SYMBOL(default_wake_function);

Sunday, May 15, 2011

Process Scheduling - 2

• run queue

The Linux kernel keeps every runnable process (state TASK_RUNNING) on a run queue, with the exception of the swapper process.
On a multiprocessor system, each CPU has its own run queue.

Defined in kernel/sched.c

struct prio_array {
  unsigned int nr_active;
  unsigned long bitmap[BITMAP_SIZE];
  struct list_head queue[MAX_PRIO];
};


struct runqueue {
  spinlock_t lock;
  
  /*
   * nr_running and cpu_load should be in the same cacheline because
   * remote CPUs use both these fields when doing load calculation.
   */
  unsigned long nr_running;
#ifdef CONFIG_SMP
  unsigned long cpu_load;
#endif
  unsigned long long nr_switches;


  /*
   * This is part of a global counter where only the total sum
   * over all CPUs matters. A task can increase this counter on
   * one CPU and if it got migrated afterwards it may decrease
   * it on another CPU. Always updated under the runqueue lock:
   */
  unsigned long nr_uninterruptible;
  
  unsigned long expired_timestamp;
  unsigned long long timestamp_last_tick;
  task_t *curr, *idle;
  struct mm_struct *prev_mm;
  prio_array_t *active, *expired, arrays[2];
  int best_expired_prio;
  atomic_t nr_iowait;


#ifdef CONFIG_SMP
  struct sched_domain *sd;
  
  /* For active balancing */
  int active_balance;
  int push_cpu;
  
  task_t *migration_thread;
  struct list_head migration_queue;
#endif



#ifdef CONFIG_SCHEDSTATS
  /* latency stats */
  struct sched_info rq_sched_info;
  
  /* sys_sched_yield() stats */
  unsigned long yld_exp_empty;
  unsigned long yld_act_empty;
  unsigned long yld_both_empty;
  unsigned long yld_cnt;
  
  /* schedule() stats */
  unsigned long sched_noswitch;
  unsigned long sched_switch;
  unsigned long sched_cnt;
  unsigned long sched_goidle;
  
  /* pull_task() stats */
  unsigned long pt_gained[MAX_IDLE_TYPES];
  unsigned long pt_lost[MAX_IDLE_TYPES];
  
  /* active_load_balance() stats */
  unsigned long alb_cnt;
  unsigned long alb_lost;
  unsigned long alb_gained;
  unsigned long alb_failed;
  
  /* try_to_wake_up() stats */
  unsigned long ttwu_cnt;
  unsigned long ttwu_attempts;
  unsigned long ttwu_moved;
  
  /* wake_up_new_task() stats */
  unsigned long wunt_cnt;
  unsigned long wunt_moved;
  
  /* sched_migrate_task() stats */
  unsigned long smt_cnt;
  
  /* sched_balance_exec() stats */
  unsigned long sbe_cnt;
#endif
};


The arrays field of the runqueue structure is an array of two prio_array_t elements (i.e. struct prio_array). Both arrays[0] and arrays[1] contain 140 doubly linked list heads, recording the runnable processes at priority levels 0 through 139.
Two pointers, active and expired, point to arrays[0] and arrays[1] respectively and represent the active and expired sets of processes.

As the system runs, an active process becomes an expired process once it has used up its time slice. When the active queue becomes empty, the expired queue is swapped with the active queue; this is done simply by making the two pointers point to the other prio_array.
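
For reference, linking a task into one of these priority arrays amounts to the following (a sketch modeled on the 2.6-era enqueue_task() that scheduler_tick() and schedule() call; bookkeeping such as the schedstats hook is omitted):

/* Sketch of enqueue_task(): add the task to the list for its priority
 * level, mark that level as non-empty in the bitmap, and bump the count. */
static void enqueue_task(struct task_struct *p, prio_array_t *array)
{
  list_add_tail(&p->run_list, array->queue + p->prio);
  __set_bit(p->prio, array->bitmap);   /* bit p->prio means "this level has tasks" */
  array->nr_active++;
  p->array = array;
}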

Thursday, May 5, 2011

Process Scheduling - 1

The Linux kernel uses time-shared scheduling driven by the timer interrupt.
On every timer interrupt the kernel calls scheduler_tick() to redistribute the CPU, so all processes are executed in a time-multiplexed fashion;
the amount of CPU time allotted to a process, counted in timer ticks, is called its time slice.

Runnable processes in Linux fall into two classes:

Active - processes that have not yet used up their time slice
Expired - processes that have used up their time slice

Runnable processes are kept in these two kinds of queues. The kernel's scheduler picks the next process to run from the highest non-empty priority level of the active queue;
this is done by calling schedule(). When a process uses up its time slice, it is moved to the expired queue.
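
The scheduler locates that "highest non-empty priority level of the active queue" with a bitmap scan over the run queue's priority array, roughly like this sketch (the real code appears in schedule() below, built around sched_find_first_bit()):

/* Sketch: pick the highest-priority runnable task from the active array.
 * The lowest set bit in the 140-bit bitmap is the highest priority level
 * that still has queued tasks. */
static task_t *pick_next(runqueue_t *rq)
{
  prio_array_t *array = rq->active;
  int idx = sched_find_first_bit(array->bitmap);

  return list_entry(array->queue[idx].next, task_t, run_list);
}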

The run queue has 140 priority levels, and a smaller number means a higher priority:
levels 0 to 99 are for real-time processes, and levels 100 to 139 are for normal processes (i.e. user-mode programs).


As an example of how the kernel schedules: each time scheduler_tick() is called, the kernel regains control of the CPU
and decrements the time slice of the currently running process. When the process's time slice runs out, the kernel calls schedule() to pick another process to run.
If the running process voluntarily gives up the CPU while waiting for some event, the switch to another process likewise goes through schedule().


Defined in kernel/sched.c

/*
* This function gets called by the timer code, with HZ frequency.
* We call it with interrupts disabled.
*
* It also gets called by the fork code, when changing the parent's
* timeslices.
*/
void scheduler_tick(void)
{
  int cpu = smp_processor_id();
  runqueue_t *rq = this_rq();
  task_t *p = current;

  rq->timestamp_last_tick = sched_clock();

  if (p == rq->idle) {
    if (wake_priority_sleeper(rq))
      goto out;
    rebalance_tick(cpu, rq, SCHED_IDLE);
    return;
  }

  /* Task might have expired already, but not scheduled off yet */
  if (p->array != rq->active) {
    set_tsk_need_resched(p);
    goto out;
  }
  spin_lock(&rq->lock);
  /*
   * The task was running during this tick - update the
   * time slice counter. Note: we do not update a thread's
   * priority until it either goes to sleep or uses up its
   * timeslice. This makes it possible for interactive tasks
   * to use up their timeslices at their highest priority levels.
   */
  if (rt_task(p)) {
    /*
     * RR tasks need a special form of timeslice management.
     * FIFO tasks have no timeslices.
     */
    if ((p->policy == SCHED_RR) && !--p->time_slice) {
      p->time_slice = task_timeslice(p);
      p->first_time_slice = 0;
      set_tsk_need_resched(p);

      /* put it at the end of the queue: */
      requeue_task(p, rq->active);
    }
    goto out_unlock;
  }
  if (!--p->time_slice) {
    dequeue_task(p, rq->active);
    set_tsk_need_resched(p);
    p->prio = effective_prio(p);
    p->time_slice = task_timeslice(p);
    p->first_time_slice = 0;

    if (!rq->expired_timestamp)
      rq->expired_timestamp = jiffies;
    if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
      enqueue_task(p, rq->expired);
      if (p->static_prio < rq->best_expired_prio)
        rq->best_expired_prio = p->static_prio;
    } else
      enqueue_task(p, rq->active);
  } else {
    /*
     * Prevent a too long timeslice allowing a task to monopolize
     * the CPU. We do this by splitting up the timeslice into
     * smaller pieces.
     *
     * Note: this does not mean the task's timeslices expire or
     * get lost in any way, they just might be preempted by
     * another task of equal priority. (one with higher
     * priority would have preempted this task already.) We
     * requeue this task to the end of the list on this priority
     * level, which is in essence a round-robin of tasks with
     * equal priority.
     *
     * This only applies to tasks in the interactive
     * delta range with at least TIMESLICE_GRANULARITY to requeue.
     */
    if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
      p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
      (p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
      (p->array == rq->active)) {

      requeue_task(p, rq->active);
      set_tsk_need_resched(p);
    }
  }
out_unlock:
  spin_unlock(&rq->lock);
out:
  rebalance_tick(cpu, rq, NOT_IDLE);
}

/*
* schedule() is the main scheduler function.
*/
asmlinkage void __sched schedule(void)
{
  long *switch_count;
  task_t *prev, *next;
  runqueue_t *rq;
  prio_array_t *array;
  struct list_head *queue;
  unsigned long long now;
  unsigned long run_time;
  int cpu, idx;

  /*
   * Test if we are atomic. Since do_exit() needs to call into
   * schedule() atomically, we ignore that path for now.
   * Otherwise, whine if we are scheduling when we should not be.
   */
  if (likely(!current->exit_state)) {
    if (unlikely(in_atomic())) {
      printk(KERN_ERR "scheduling while atomic: "
        "%s/0x%08x/%d\n",
        current->comm, preempt_count(), current->pid);
      dump_stack();
    }
  }
  profile_hit(SCHED_PROFILING, __builtin_return_address(0));

need_resched:
  preempt_disable();
  prev = current;
  release_kernel_lock(prev);
need_resched_nonpreemptible:
  rq = this_rq();

  /*
   * The idle thread is not allowed to schedule!
   * Remove this check after it has been exercised a bit.
   */
  if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) {
    printk(KERN_ERR "bad: scheduling from the idle thread!\n");
    dump_stack();
  }

  schedstat_inc(rq, sched_cnt);
  now = sched_clock();
  if (likely(now - prev->timestamp < NS_MAX_SLEEP_AVG))
    run_time = now - prev->timestamp;
  else
    run_time = NS_MAX_SLEEP_AVG;

  /*
   * Tasks charged proportionately less run_time at high sleep_avg to
   * delay them losing their interactive status
   */
  run_time /= (CURRENT_BONUS(prev) ? : 1);

  spin_lock_irq(&rq->lock);

  if (unlikely(prev->flags & PF_DEAD))
    prev->state = EXIT_DEAD;

  switch_count = &prev->nivcsw;
  if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
    switch_count = &prev->nvcsw;
    if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
        unlikely(signal_pending(prev))))
      prev->state = TASK_RUNNING;
    else {
      if (prev->state == TASK_UNINTERRUPTIBLE)
        rq->nr_uninterruptible++;
      deactivate_task(prev, rq);
    }
  }

  cpu = smp_processor_id();
  if (unlikely(!rq->nr_running)) {
go_idle:
    idle_balance(cpu, rq);
    if (!rq->nr_running) {
      next = rq->idle;
      rq->expired_timestamp = 0;
      wake_sleeping_dependent(cpu, rq);
      /*
       * wake_sleeping_dependent() might have released
       * the runqueue, so break out if we got new
       * tasks meanwhile:
       */
      if (!rq->nr_running)
        goto switch_tasks;
    }
  } else {
    if (dependent_sleeper(cpu, rq)) {
      next = rq->idle;
      goto switch_tasks;
    }
    /*
     * dependent_sleeper() releases and reacquires the runqueue
     * lock, hence go into the idle loop if the rq went
     * empty meanwhile:
     */
    if (unlikely(!rq->nr_running))
      goto go_idle;
  }

  array = rq->active;
  if (unlikely(!array->nr_active)) {
    /*
     * Switch the active and expired arrays.
     */
    schedstat_inc(rq, sched_switch);
    rq->active = rq->expired;
    rq->expired = array;
    array = rq->active;
    rq->expired_timestamp = 0;
    rq->best_expired_prio = MAX_PRIO;
  } else
    schedstat_inc(rq, sched_noswitch);

  idx = sched_find_first_bit(array->bitmap);
  queue = array->queue + idx;
  next = list_entry(queue->next, task_t, run_list);

  if (!rt_task(next) && next->activated > 0) {
    unsigned long long delta = now - next->timestamp;

    if (next->activated == 1)
      delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;

    array = next->array;
    dequeue_task(next, array);
    recalc_task_prio(next, next->timestamp + delta);
    enqueue_task(next, array);
  }
  next->activated = 0;
switch_tasks:
  if (next == rq->idle)
    schedstat_inc(rq, sched_goidle);
  prefetch(next);
  clear_tsk_need_resched(prev);
  rcu_qsctr_inc(task_cpu(prev));

  prev->sleep_avg -= run_time;
  if ((long)prev->sleep_avg <= 0)
    prev->sleep_avg = 0;
  prev->timestamp = prev->last_ran = now;

  sched_info_switch(prev, next);
  if (likely(prev != next)) {
    next->timestamp = now;
    rq->nr_switches++;
    rq->curr = next;
    ++*switch_count;

    prepare_arch_switch(rq, next);
    prev = context_switch(rq, prev, next);
    barrier();

    finish_task_switch(prev);
  } else
    spin_unlock_irq(&rq->lock);

  prev = current;
  if (unlikely(reacquire_kernel_lock(prev) < 0))
    goto need_resched_nonpreemptible;
  preempt_enable_no_resched();
  if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
    goto need_resched;
}