Saturday, June 11, 2011

Process Scheduling - 3

• wait queue

The counterpart of the run queue is the wait queue, a queue that holds sleeping processes.
It consists of a doubly linked list of wait_queue_t entries, with a list head of type wait_queue_head_t.


When the flags field of a wait_queue_t is 1 (WQ_FLAG_EXCLUSIVE), the sleeping process is an exclusive process: the resource it waits for can be granted to only one waiter at a time, so the kernel wakes at most one exclusive waiter per wake-up.
When flags is 0, the process is a nonexclusive process: several processes may be waiting for the same event, and all of them are woken when it occurs.

The func field is set to default_wake_function() when the entry is initialized; this is the default function used to wake the sleeping process.
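
A minimal sketch (not from the original post; demo_head and demo_enqueue are hypothetical names) showing how these fields are filled in when an entry is built by hand: init_waitqueue_entry() sets task and func, and the choice of add_wait_queue() versus add_wait_queue_exclusive() decides the flags value.

#include <linux/wait.h>
#include <linux/sched.h>

static DECLARE_WAIT_QUEUE_HEAD(demo_head);

static void demo_enqueue(void)
{
  wait_queue_t entry;

  /* task = current, func = default_wake_function, flags = 0 */
  init_waitqueue_entry(&entry, current);

  /* queue as a nonexclusive waiter (flags stays 0) ... */
  add_wait_queue(&demo_head, &entry);
  /* ... or instead, as an exclusive waiter:
   * add_wait_queue_exclusive(&demo_head, &entry);  which sets WQ_FLAG_EXCLUSIVE */

  remove_wait_queue(&demo_head, &entry);
}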


A wait queue head can be declared statically:

DECLARE_WAIT_QUEUE_HEAD( wait_head );

or declared and initialized dynamically:

wait_queue_head_t wait_head;

init_waitqueue_head( &wait_head );

To put a process to sleep, call one of the following macros; they add the process to a wait queue and give up the CPU (a short usage sketch follows the list).

wait_event(wait_head, condition)

wait_event_interruptible(wait_head, condition)

wait_event_timeout(wait_head, condition, timeout)

wait_event_interruptible_timeout(wait_head, condition, timeout)
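
A minimal sketch of the sleeping side, assuming a hypothetical queue my_wq and a simple flag data_ready as the condition (names not from the original post):

#include <linux/wait.h>
#include <linux/sched.h>
#include <linux/errno.h>

static DECLARE_WAIT_QUEUE_HEAD(my_wq);
static int data_ready;        /* the condition tested by wait_event_*() */

static int consumer(void)
{
  /* Sleep until data_ready becomes non-zero; returns non-zero
   * if a signal interrupts the sleep before the condition holds. */
  if (wait_event_interruptible(my_wq, data_ready != 0))
    return -ERESTARTSYS;

  data_ready = 0;             /* consume the event */
  return 0;
}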

To wake up sleeping processes, call one of the following macros; the woken processes are put back on the run queue (the matching waking side is sketched after the list).

wake_up( wait_head_pt )

wake_up_interruptible( wait_head_pt )
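
The corresponding waking side for the hypothetical my_wq/data_ready pair above: make the condition true first, then wake the queue so the woken process sees it.

static void producer(void)
{
  data_ready = 1;                  /* condition becomes true ...          */
  wake_up_interruptible(&my_wq);   /* ... then wake any sleeping waiters */
}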


Defined in include/linux/wait.h

typedef struct __wait_queue wait_queue_t;
typedef int (*wait_queue_func_t)(wait_queue_t *wait, unsigned mode, int sync, void *key);
int default_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key);

struct __wait_queue {
  unsigned int flags;
#define WQ_FLAG_EXCLUSIVE 0x01
  struct task_struct * task;
  wait_queue_func_t func;
  struct list_head task_list;
};

struct wait_bit_key {
  void *flags;
  int bit_nr;
};

struct wait_bit_queue {
  struct wait_bit_key key;
  wait_queue_t wait;
};

struct __wait_queue_head {
  spinlock_t lock;
  struct list_head task_list;
};
typedef struct __wait_queue_head wait_queue_head_t;

#define __wait_event(wq, condition)               \
do {                              \
  DEFINE_WAIT(__wait);                   \
                                \
  for (;;) {                          \
    prepare_to_wait(&wq, &__wait, TASK_UNINTERRUPTIBLE); \
    if (condition)                       \
      break;                       \
    schedule();                       \
  }                             \
  finish_wait(&wq, &__wait);                  \
} while (0)

#define wait_event(wq, condition)                 \
do {                              \
  if (condition)                        \
    break;                         \
  __wait_event(wq, condition);                 \
} while (0)
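
For reference, a driver can open-code the same loop that __wait_event() expands to, using DEFINE_WAIT()/prepare_to_wait()/finish_wait() directly. A sketch with a hypothetical queue my_wq and condition flag:

static DECLARE_WAIT_QUEUE_HEAD(my_wq);
static int condition;

static void sleep_on_condition(void)
{
  DEFINE_WAIT(wait);          /* wait_queue_t for current, func = autoremove_wake_function */

  for (;;) {
    prepare_to_wait(&my_wq, &wait, TASK_UNINTERRUPTIBLE);
    if (condition)            /* re-check only after we are on the queue */
      break;
    schedule();               /* give up the CPU until woken */
  }
  finish_wait(&my_wq, &wait); /* back to TASK_RUNNING, removed from the queue */
}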


void FASTCALL(__wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key));

#define wake_up(x)          __wake_up(x, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1, NULL)
#define wake_up_interruptible(x)  __wake_up(x, TASK_INTERRUPTIBLE, 1, NULL)


Defined in kernel/wait.c

/*
* Note: we use "set_current_state()" _after_ the wait-queue add,
* because we need a memory barrier there on SMP, so that any
* wake-function that tests for the wait-queue being active
* will be guaranteed to see waitqueue addition _or_ subsequent
* tests in this thread will see the wakeup having taken place.
*
* The spin_unlock() itself is semi-permeable and only protects
* one way (it only protects stuff inside the critical region and
* stops them from bleeding out - it would still allow subsequent
* loads to move into the critical region).
*/
void fastcall
prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state)
{
  unsigned long flags;

  wait->flags &= ~WQ_FLAG_EXCLUSIVE;
  spin_lock_irqsave(&q->lock, flags);
  if (list_empty(&wait->task_list))
    __add_wait_queue(q, wait);
  /*
   * don't alter the task state if this is just going to
   * queue an async wait queue callback
   */
  if (is_sync_wait(wait))
    set_current_state(state);
  spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL(prepare_to_wait);


Defined in kernel/sched.c

/*
* The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
* wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
* number) then we wake all the non-exclusive tasks and one exclusive task.
*
* There are circumstances in which we can try to wake a task which has already
* started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
* zero in this (rare) case, and we handle it by continuing to scan the queue.
*/
static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
               int nr_exclusive, int sync, void *key)
{
  struct list_head *tmp, *next;

  list_for_each_safe(tmp, next, &q->task_list) {
    wait_queue_t *curr;
    unsigned flags;
    curr = list_entry(tmp, wait_queue_t, task_list);
    flags = curr->flags;
    if (curr->func(curr, mode, sync, key) &&
     (flags & WQ_FLAG_EXCLUSIVE) &&
     !--nr_exclusive)
      break;
  }
}

/**
* __wake_up - wake up threads blocked on a waitqueue.
* @q: the waitqueue
* @mode: which threads
* @nr_exclusive: how many wake-one or wake-many threads to wake up
*/
void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
            int nr_exclusive, void *key)
{
  unsigned long flags;

  spin_lock_irqsave(&q->lock, flags);
  __wake_up_common(q, mode, nr_exclusive, 0, key);
  spin_unlock_irqrestore(&q->lock, flags);
}

EXPORT_SYMBOL(__wake_up);

/***
* try_to_wake_up - wake up a thread
* @p: the to-be-woken-up thread
* @state: the mask of task states that can be woken
* @sync: do a synchronous wakeup?
*
* Put it on the run-queue if it's not already there. The "current"
* thread is always on the run-queue (except when the actual
* re-schedule is in progress), and as such you're allowed to do
* the simpler "current->state = TASK_RUNNING" to mark yourself
* runnable without the overhead of this.
*
* returns failure only if the task is already active.
*/
static int try_to_wake_up(task_t *p, unsigned int state, int sync)
{
  int cpu, this_cpu, success = 0;
  unsigned long flags;
  long old_state;
  runqueue_t *rq;
#ifdef CONFIG_SMP
  unsigned long load, this_load;
  struct sched_domain *sd;
  int new_cpu;
#endif

  rq = task_rq_lock(p, &flags);
  schedstat_inc(rq, ttwu_cnt);
  old_state = p->state;
  if (!(old_state & state))
    goto out;

  if (p->array)
    goto out_running;

  cpu = task_cpu(p);
  this_cpu = smp_processor_id();

#ifdef CONFIG_SMP
  if (unlikely(task_running(rq, p)))
    goto out_activate;

  new_cpu = cpu;

  if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
    goto out_set_cpu;

  load = source_load(cpu);
  this_load = target_load(this_cpu);

  /*
   * If sync wakeup then subtract the (maximum possible) effect of
   * the currently running task from the load of the current CPU:
   */
  if (sync)
    this_load -= SCHED_LOAD_SCALE;

  /* Don't pull the task off an idle CPU to a busy one */
  if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2)
    goto out_set_cpu;

  new_cpu = this_cpu; /* Wake to this CPU if we can */

  /*
   * Scan domains for affine wakeup and passive balancing
   * possibilities.
   */
  for_each_domain(this_cpu, sd) {
    unsigned int imbalance;
    /*
     * Start passive balancing when half the imbalance_pct
     * limit is reached.
     */
    imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2;

    if ((sd->flags & SD_WAKE_AFFINE) &&
        !task_hot(p, rq->timestamp_last_tick, sd)) {
      /*
       * This domain has SD_WAKE_AFFINE and p is cache cold
       * in this domain.
       */
      if (cpu_isset(cpu, sd->span)) {
        schedstat_inc(sd, ttwu_wake_affine);
        goto out_set_cpu;
      }
    } else if ((sd->flags & SD_WAKE_BALANCE) &&
        imbalance*this_load <= 100*load) {
      /*
       * This domain has SD_WAKE_BALANCE and there is
       * an imbalance.
       */
      if (cpu_isset(cpu, sd->span)) {
        schedstat_inc(sd, ttwu_wake_balance);
        goto out_set_cpu;
      }
    }
  }

  new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
out_set_cpu:
  schedstat_inc(rq, ttwu_attempts);
  new_cpu = wake_idle(new_cpu, p);
  if (new_cpu != cpu) {
    schedstat_inc(rq, ttwu_moved);
    set_task_cpu(p, new_cpu);
    task_rq_unlock(rq, &flags);
    /* might preempt at this point */
    rq = task_rq_lock(p, &flags);
    old_state = p->state;
    if (!(old_state & state))
      goto out;
    if (p->array)
      goto out_running;

    this_cpu = smp_processor_id();
    cpu = task_cpu(p);
  }

out_activate:
#endif /* CONFIG_SMP */
  if (old_state == TASK_UNINTERRUPTIBLE) {
    rq->nr_uninterruptible--;
    /*
     * Tasks on involuntary sleep don't earn
     * sleep_avg beyond just interactive state.
     */
    p->activated = -1;
  }

  /*
   * Sync wakeups (i.e. those types of wakeups where the waker
   * has indicated that it will leave the CPU in short order)
   * don't trigger a preemption, if the woken up task will run on
   * this cpu. (in this case the 'I will reschedule' promise of
   * the waker guarantees that the freshly woken up task is going
   * to be considered on this CPU.)
   */
  activate_task(p, rq, cpu == this_cpu);
  if (!sync || cpu != this_cpu) {
    if (TASK_PREEMPTS_CURR(p, rq))
      resched_task(rq->curr);
  }
  success = 1;

out_running:
  p->state = TASK_RUNNING;
out:
  task_rq_unlock(rq, &flags);

  return success;
}

int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key)
{
  task_t *p = curr->task;
  return try_to_wake_up(p, mode, sync);
}

EXPORT_SYMBOL(default_wake_function);
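
As a follow-up sketch (my_autoremove_wake is a hypothetical name): a custom wait_queue_func_t can be plugged into the func field. The pattern below delegates to default_wake_function() and, if the task was actually woken, removes the entry from the queue -- the same thing autoremove_wake_function() in kernel/wait.c does for entries created with DEFINE_WAIT().

static int my_autoremove_wake(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
  int ret = default_wake_function(wait, mode, sync, key);

  if (ret)
    list_del_init(&wait->task_list);   /* drop the woken entry from the wait queue */
  return ret;
}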