Sunday, May 15, 2011

Process Scheduling - 2

• run queue

The Linux kernel keeps every runnable process (state TASK_RUNNING) on a run queue, with the exception of the swapper (idle) process.
On a multiprocessor system, each CPU has its own run queue variable (see the sketch after the structure definitions below).

Defined in kernel/sched.c:

struct prio_array {
  unsigned int nr_active;            /* number of runnable tasks in this array */
  unsigned long bitmap[BITMAP_SIZE]; /* one bit per priority level, set when queue[prio] is non-empty */
  struct list_head queue[MAX_PRIO];  /* one doubly linked list head per priority level (0..139) */
};


struct runqueue {
  spinlock_t lock;
  
  /*
   * nr_running and cpu_load should be in the same cacheline because
   * remote CPUs use both these fields when doing load calculation.
   */
  unsigned long nr_running;
#ifdef CONFIG_SMP
  unsigned long cpu_load;
#endif
  unsigned long long nr_switches;


  /*
   * This is part of a global counter where only the total sum
   * over all CPUs matters. A task can increase this counter on
   * one CPU and if it got migrated afterwards it may decrease
   * it on another CPU. Always updated under the runqueue lock:
   */
  unsigned long nr_uninterruptible;
  
  unsigned long expired_timestamp;
  unsigned long long timestamp_last_tick;
  task_t *curr, *idle;
  struct mm_struct *prev_mm;
  prio_array_t *active, *expired, arrays[2];
  int best_expired_prio;
  atomic_t nr_iowait;


#ifdef CONFIG_SMP
  struct sched_domain *sd;
  
  /* For active balancing */
  int active_balance;
  int push_cpu;
  
  task_t *migration_thread;
  struct list_head migration_queue;
#endif



#ifdef CONFIG_SCHEDSTATS
  /* latency stats */
  struct sched_info rq_sched_info;
  
  /* sys_sched_yield() stats */
  unsigned long yld_exp_empty;
  unsigned long yld_act_empty;
  unsigned long yld_both_empty;
  unsigned long yld_cnt;
  
  /* schedule() stats */
  unsigned long sched_noswitch;
  unsigned long sched_switch;
  unsigned long sched_cnt;
  unsigned long sched_goidle;
  
  /* pull_task() stats */
  unsigned long pt_gained[MAX_IDLE_TYPES];
  unsigned long pt_lost[MAX_IDLE_TYPES];
  
  /* active_load_balance() stats */
  unsigned long alb_cnt;
  unsigned long alb_lost;
  unsigned long alb_gained;
  unsigned long alb_failed;
  
  /* try_to_wake_up() stats */
  unsigned long ttwu_cnt;
  unsigned long ttwu_attempts;
  unsigned long ttwu_moved;
  
  /* wake_up_new_task() stats */
  unsigned long wunt_cnt;
  unsigned long wunt_moved;
  
  /* sched_migrate_task() stats */
  unsigned long smt_cnt;
  
  /* sched_balance_exec() stats */
  unsigned long sbe_cnt;
#endif
};
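
The per-CPU run queue variables mentioned above live in per-CPU storage. Below is a minimal sketch of how they are declared and accessed, modeled on kernel/sched.c; this_rq() and cpu_rq() are the accessors used throughout the scheduler code shown later.

/* One runqueue per CPU, kept in per-CPU storage (sketch). */
static DEFINE_PER_CPU(struct runqueue, runqueues);

#define cpu_rq(cpu)   (&per_cpu(runqueues, (cpu)))   /* run queue of a given CPU */
#define this_rq()     (&__get_cpu_var(runqueues))    /* run queue of the current CPU */
#define task_rq(p)    cpu_rq(task_cpu(p))            /* run queue the task p belongs to */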


The arrays field of the runqueue structure is an array of two elements of type prio_array_t (i.e. struct prio_array).
Both arrays[0] and arrays[1] contain 140 doubly linked list heads, recording the runnable processes of priority
levels 0 through 139. Two pointers, active and expired, point to arrays[0] and arrays[1] respectively and represent
the sets of active and expired processes, as shown in the figure below.

While the system runs, an active process that uses up its time-slice becomes an expired process. When the active
queue becomes empty, the expired queue is swapped with the active queue; this is done simply by pointing the two
pointers at the other element of arrays.
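
The bitmap field is what keeps these operations O(1): inserting a task appends it to the list for its priority level and sets the corresponding bit, so the scheduler can later locate the highest non-empty priority with a single sched_find_first_bit() call (as seen in schedule() below). A simplified sketch of the insert and remove helpers, modeled on enqueue_task()/dequeue_task() in kernel/sched.c with the statistics hooks omitted:

static void enqueue_task(task_t *p, prio_array_t *array)
{
  /* Append the task to the list for its priority level. */
  list_add_tail(&p->run_list, array->queue + p->prio);
  /* Mark this priority level as non-empty. */
  __set_bit(p->prio, array->bitmap);
  array->nr_active++;
  p->array = array;
}

static void dequeue_task(task_t *p, prio_array_t *array)
{
  array->nr_active--;
  list_del(&p->run_list);
  /* Clear the bit only when no task of this priority remains. */
  if (list_empty(array->queue + p->prio))
    __clear_bit(p->prio, array->bitmap);
}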

Thursday, May 5, 2011

Process Scheduling - 1

The Linux kernel uses a time-shared scheduling scheme driven by the timer interrupt.
Each time the timer interrupt fires, the kernel calls scheduler_tick() to update the running process's accounting and,
when necessary, hand the CPU to another process, so all processes execute in a time-multiplexed fashion. The amount of
CPU time a process may run before being preempted, measured in timer ticks, is called its time-slice.
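
How many ticks a process gets per time-slice depends on its static priority: higher-priority (lower-numbered) normal tasks receive longer slices, but even the lowest priority gets at least MIN_TIMESLICE. A simplified sketch of the calculation, modeled on task_timeslice() in kernel/sched.c (DEF_TIMESLICE, MIN_TIMESLICE and MAX_USER_PRIO are the kernel's constants):

#define SCALE_PRIO(x, prio) \
  max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE)

static unsigned int task_timeslice(task_t *p)
{
  /* NICE_TO_PRIO(0) is 120, i.e. a nice value of 0. */
  if (p->static_prio < NICE_TO_PRIO(0))
    return SCALE_PRIO(DEF_TIMESLICE * 4, p->static_prio);  /* negative nice: larger slice */
  else
    return SCALE_PRIO(DEF_TIMESLICE, p->static_prio);
}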

Runnable processes in Linux are divided into two categories:

Active - processes that have not yet used up their time-slice
Expired - processes that have already used up their time-slice

Runnable processes are kept on these two queues. The scheduler picks the next process to run from the highest-priority
non-empty list of the Active queue; this is done by calling schedule(). When a process uses up its time-slice, it is
moved to the Expired queue.

The run queue has 140 priority levels; the smaller the number, the higher the priority.
Levels 0 to 99 are used for real-time processes, and levels 100 to 139 for normal processes (i.e. user-mode programs).
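
For normal tasks the priority number is derived from the nice value. A short sketch of the mapping, based on the MAX_RT_PRIO/MAX_PRIO constants and the NICE_TO_PRIO()/PRIO_TO_NICE() macros used by the kernel:

#define MAX_RT_PRIO     100                  /* levels 0..99 are real-time priorities */
#define MAX_PRIO        (MAX_RT_PRIO + 40)   /* levels 100..139 are normal priorities */

/* nice -20..19 maps onto static priority 100..139; nice 0 becomes 120. */
#define NICE_TO_PRIO(nice)  (MAX_RT_PRIO + (nice) + 20)
#define PRIO_TO_NICE(prio)  ((prio) - MAX_RT_PRIO - 20)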


The figure above shows an example of kernel scheduling. Each time scheduler_tick() is invoked, the kernel takes control
of the CPU and decrements the time-slice of the currently running process. When a process's time-slice reaches zero,
the kernel calls schedule() to pick another process to run. If the running process voluntarily yields the CPU because
it has to wait for some event, it likewise hands execution to another process by calling schedule().


Defined in kernel/sched.c:

/*
 * This function gets called by the timer code, with HZ frequency.
 * We call it with interrupts disabled.
 *
 * It also gets called by the fork code, when changing the parent's
 * timeslices.
 */
void scheduler_tick(void)
{
  int cpu = smp_processor_id();
  runqueue_t *rq = this_rq();
  task_t *p = current;

  rq->timestamp_last_tick = sched_clock();

  if (p == rq->idle) {
    if (wake_priority_sleeper(rq))
      goto out;
    rebalance_tick(cpu, rq, SCHED_IDLE);
    return;
  }

  /* Task might have expired already, but not scheduled off yet */
  if (p->array != rq->active) {
    set_tsk_need_resched(p);
    goto out;
  }
  spin_lock(&rq->lock);
  /*
   * The task was running during this tick - update the
   * time slice counter. Note: we do not update a thread's
   * priority until it either goes to sleep or uses up its
   * timeslice. This makes it possible for interactive tasks
   * to use up their timeslices at their highest priority levels.
   */
  if (rt_task(p)) {
    /*
     * RR tasks need a special form of timeslice management.
     * FIFO tasks have no timeslices.
     */
    if ((p->policy == SCHED_RR) && !--p->time_slice) {
      p->time_slice = task_timeslice(p);
      p->first_time_slice = 0;
      set_tsk_need_resched(p);

      /* put it at the end of the queue: */
      requeue_task(p, rq->active);
    }
    goto out_unlock;
  }
  if (!--p->time_slice) {
    dequeue_task(p, rq->active);
    set_tsk_need_resched(p);
    p->prio = effective_prio(p);
    p->time_slice = task_timeslice(p);
    p->first_time_slice = 0;

    if (!rq->expired_timestamp)
      rq->expired_timestamp = jiffies;
    if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
      enqueue_task(p, rq->expired);
      if (p->static_prio < rq->best_expired_prio)
        rq->best_expired_prio = p->static_prio;
    } else
      enqueue_task(p, rq->active);
  } else {
    /*
     * Prevent a too long timeslice allowing a task to monopolize
     * the CPU. We do this by splitting up the timeslice into
     * smaller pieces.
     *
     * Note: this does not mean the task's timeslices expire or
     * get lost in any way, they just might be preempted by
     * another task of equal priority. (one with higher
     * priority would have preempted this task already.) We
     * requeue this task to the end of the list on this priority
     * level, which is in essence a round-robin of tasks with
     * equal priority.
     *
     * This only applies to tasks in the interactive
     * delta range with at least TIMESLICE_GRANULARITY to requeue.
     */
    if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
      p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
      (p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
      (p->array == rq->active)) {

      requeue_task(p, rq->active);
      set_tsk_need_resched(p);
    }
  }
out_unlock:
  spin_unlock(&rq->lock);
out:
  rebalance_tick(cpu, rq, NOT_IDLE);
}

/*
 * schedule() is the main scheduler function.
 */
asmlinkage void __sched schedule(void)
{
  long *switch_count;
  task_t *prev, *next;
  runqueue_t *rq;
  prio_array_t *array;
  struct list_head *queue;
  unsigned long long now;
  unsigned long run_time;
  int cpu, idx;

  /*
   * Test if we are atomic. Since do_exit() needs to call into
   * schedule() atomically, we ignore that path for now.
   * Otherwise, whine if we are scheduling when we should not be.
   */
  if (likely(!current->exit_state)) {
    if (unlikely(in_atomic())) {
      printk(KERN_ERR "scheduling while atomic: "
        "%s/0x%08x/%d\n",
        current->comm, preempt_count(), current->pid);
      dump_stack();
    }
  }
  profile_hit(SCHED_PROFILING, __builtin_return_address(0));

need_resched:
  preempt_disable();
  prev = current;
  release_kernel_lock(prev);
need_resched_nonpreemptible:
  rq = this_rq();

  /*
   * The idle thread is not allowed to schedule!
   * Remove this check after it has been exercised a bit.
   */
  if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) {
    printk(KERN_ERR "bad: scheduling from the idle thread!\n");
    dump_stack();
  }

  schedstat_inc(rq, sched_cnt);
  now = sched_clock();
  if (likely(now - prev->timestamp < NS_MAX_SLEEP_AVG))
    run_time = now - prev->timestamp;
  else
    run_time = NS_MAX_SLEEP_AVG;

  /*
   * Tasks charged proportionately less run_time at high sleep_avg to
   * delay them losing their interactive status
   */
  run_time /= (CURRENT_BONUS(prev) ? : 1);

  spin_lock_irq(&rq->lock);

  if (unlikely(prev->flags & PF_DEAD))
    prev->state = EXIT_DEAD;

  switch_count = &prev->nivcsw;
  if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
    switch_count = &prev->nvcsw;
    if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
        unlikely(signal_pending(prev))))
      prev->state = TASK_RUNNING;
    else {
      if (prev->state == TASK_UNINTERRUPTIBLE)
        rq->nr_uninterruptible++;
      deactivate_task(prev, rq);
    }
  }

  cpu = smp_processor_id();
  if (unlikely(!rq->nr_running)) {
go_idle:
    idle_balance(cpu, rq);
    if (!rq->nr_running) {
      next = rq->idle;
      rq->expired_timestamp = 0;
      wake_sleeping_dependent(cpu, rq);
      /*
       * wake_sleeping_dependent() might have released
       * the runqueue, so break out if we got new
       * tasks meanwhile:
       */
      if (!rq->nr_running)
        goto switch_tasks;
    }
  } else {
    if (dependent_sleeper(cpu, rq)) {
      next = rq->idle;
      goto switch_tasks;
    }
    /*
     * dependent_sleeper() releases and reacquires the runqueue
     * lock, hence go into the idle loop if the rq went
     * empty meanwhile:
     */
    if (unlikely(!rq->nr_running))
      goto go_idle;
  }

  array = rq->active;
  if (unlikely(!array->nr_active)) {
    /*
     * Switch the active and expired arrays.
     */
    schedstat_inc(rq, sched_switch);
    rq->active = rq->expired;
    rq->expired = array;
    array = rq->active;
    rq->expired_timestamp = 0;
    rq->best_expired_prio = MAX_PRIO;
  } else
    schedstat_inc(rq, sched_noswitch);

  idx = sched_find_first_bit(array->bitmap);
  queue = array->queue + idx;
  next = list_entry(queue->next, task_t, run_list);

  if (!rt_task(next) && next->activated > 0) {
    unsigned long long delta = now - next->timestamp;

    if (next->activated == 1)
      delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;

    array = next->array;
    dequeue_task(next, array);
    recalc_task_prio(next, next->timestamp + delta);
    enqueue_task(next, array);
  }
  next->activated = 0;
switch_tasks:
  if (next == rq->idle)
    schedstat_inc(rq, sched_goidle);
  prefetch(next);
  clear_tsk_need_resched(prev);
  rcu_qsctr_inc(task_cpu(prev));

  prev->sleep_avg -= run_time;
  if ((long)prev->sleep_avg <= 0)
    prev->sleep_avg = 0;
  prev->timestamp = prev->last_ran = now;

  sched_info_switch(prev, next);
  if (likely(prev != next)) {
    next->timestamp = now;
    rq->nr_switches++;
    rq->curr = next;
    ++*switch_count;

    prepare_arch_switch(rq, next);
    prev = context_switch(rq, prev, next);
    barrier();

    finish_task_switch(prev);
  } else
    spin_unlock_irq(&rq->lock);

  prev = current;
  if (unlikely(reacquire_kernel_lock(prev) < 0))
    goto need_resched_nonpreemptible;
  preempt_enable_no_resched();
  if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
    goto need_resched;
}