-        unsigned long expired_timestamp;
         /* Cached timestamp set by update_cpu_clock() */
         unsigned long long most_recent_timestamp;
         struct task_struct *curr, *idle;
         unsigned long next_balance;
         struct mm_struct *prev_mm;
+
+        DECLARE_BITMAP(dyn_bitmap, MAX_DYN_PRIO + 1);
+        /*
+         * The bitmap of priorities queued; the extra PRIO_RANGE at the end
+         * is for a bitmap of expired tasks queued. This minimises the number
+         * of bit lookups over prio_array swaps. The dynamic bits can have
+         * false positives. Include 1 bit for the delimiter.
+         */
+
+        DECLARE_BITMAP(static_bitmap, MAX_PRIO);
+        /* The bitmap of all static priorities queued */
+
+        unsigned long prio_queued[MAX_PRIO];
+        /* The number of tasks at each static priority */
+
+        long prio_quota[PRIO_RANGE];
+        /*
+         * The quota of ticks the runqueue runs at each dynamic priority
+         * before cycling to the next priority.
+         */
+
         struct prio_array *active, *expired, arrays[2];
-        int best_expired_prio;
+
+        int prio_level;
+        /* The current dynamic priority level this runqueue is at */
+
+        unsigned long prio_rotation;
+        /* How many times we have rotated the priority queue */
+
         atomic_t nr_iowait;
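To make the dyn_bitmap layout concrete: the first MAX_PRIO bits cover the
active array, the next PRIO_RANGE bits cover the expired array, and one
trailing bit is the search delimiter. Below is a minimal userspace sketch of
the index arithmetic; the MAX_RT_PRIO/PRIO_RANGE values are the usual kernel
split but are assumptions here, since the defines are not part of this
excerpt.

#include <stdio.h>

#define MAX_RT_PRIO     100
#define PRIO_RANGE      40
#define MAX_PRIO        (MAX_RT_PRIO + PRIO_RANGE)      /* 140 */
#define MAX_DYN_PRIO    (MAX_PRIO + PRIO_RANGE)         /* 180 */

int main(void)
{
        /* bit index of a task at dynamic prio 'prio' on each array */
        int prio = 120;                         /* example value */
        int active_bit = prio;                  /* 120 */
        int expired_bit = prio + PRIO_RANGE;    /* 160 */

        printf("active bits 0..%d, expired bits %d..%d, delimiter %d\n",
               MAX_PRIO - 1, MAX_PRIO, MAX_DYN_PRIO - 1, MAX_DYN_PRIO);
        printf("prio %d -> active bit %d, expired bit %d\n",
               prio, active_bit, expired_bit);
        return 0;
}

A single find_next_bit() walk over this one bitmap can thus locate the best
queued priority on either array, which is what the comment means by
minimising bit lookups over prio_array swaps.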

 #ifdef CONFIG_SMP
@@ -569,12 +500,9 @@ static inline struct rq *this_rq_lock(vo
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
 /*
  * Called when a process is dequeued from the active array and given
- * the cpu. We should note that with the exception of interactive
- * tasks, the expired queue will become the active queue after the active
- * queue is empty, without explicitly dequeuing and requeuing tasks in the
- * expired queue. (Interactive tasks may be requeued directly to the
- * active queue, thus delaying tasks in the expired queue from running;
- * see scheduler_tick()).
+ * the cpu. We should note that the expired queue will become the active
+ * queue after the active queue is empty, without explicitly dequeuing and
+ * requeuing tasks in the expired queue.
  *
  * This function is only called from sched_info_arrive(), rather than
  * dequeue_task(). Even though a task may be queued and dequeued multiple
@@ -672,71 +600,167 @@ sched_info_switch(struct task_struct *pr
 #define sched_info_switch(t, next)      do { } while (0)
 #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */

 /*
- * Put task to the end of the run list without the overhead of dequeue
- * followed by enqueue.
+ * Removing from a runqueue. While we don't know with absolute certainty
+ * where this task really is, the p->array and p->prio are very likely
+ * correct so we check that queue to see if we can clear that bit to take
+ * some load off finding false positives in next_dynamic_task().
  */
-static void requeue_task(struct task_struct *p, struct prio_array *array)
+static void dequeue_task(struct task_struct *p, struct rq *rq)
 {
-        list_move_tail(&p->run_list, array->queue + p->prio);
+        list_del_init(&p->run_list);
+        if (!--rq->prio_queued[p->static_prio])
+                __clear_bit(p->static_prio, rq->static_bitmap);
+        if (list_empty(p->array->queue + p->prio)) {
+                int bitmap_prio = p->prio;
+
+                if (p->array == rq->expired)
+                        bitmap_prio += PRIO_RANGE;
+                __clear_bit(bitmap_prio, rq->dyn_bitmap);
+        }
 }
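Since dequeue_task() trusts p->array and p->prio when deciding which bit to
clear, a stale pair can leave a bit set over an empty list. The
next_dynamic_task() named in the comment is not part of this excerpt, but any
consumer of dyn_bitmap has to treat set bits as hints. A userspace sketch of
that discipline, with a plain loop standing in for the kernel's
find_next_bit():

#include <limits.h>
#include <stdbool.h>

#define LONG_BITS       (CHAR_BIT * (int)sizeof(long))

struct list_head { struct list_head *next, *prev; };

static bool list_empty(const struct list_head *head)
{
        return head->next == head;
}

/* Return the first priority that is both marked and truly non-empty. */
static int next_queued_prio(const unsigned long *bitmap,
                            const struct list_head *queue, int size)
{
        for (int prio = 0; prio < size; prio++) {
                if (!(bitmap[prio / LONG_BITS] &
                      (1UL << (prio % LONG_BITS))))
                        continue;
                if (!list_empty(&queue[prio]))
                        return prio;
                /*
                 * False positive: a stale p->array/p->prio left the bit
                 * set; the real code can clear it here before moving on.
                 */
        }
        return size;    /* nothing queued */
}

Clearing (or merely skipping) lazily at lookup time keeps the dequeue fast
path cheap; a false positive costs one extra list_empty() check.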

-static inline void
-enqueue_task_head(struct task_struct *p, struct prio_array *array)
+/*
+ * The task is being queued on a fresh array so it has its entitlement
+ * bitmap cleared.
+ */
+static inline void task_new_array(struct task_struct *p, struct rq *rq)
+{
+        bitmap_zero(p->bitmap, PRIO_RANGE);
+        p->rotation = rq->prio_rotation;
+}
+
+#define rq_quota(rq, prio)      ((rq)->prio_quota[USER_PRIO(prio)])
+/*
+ * recalc_task_prio determines what prio a non rt_task will be
+ * queued at. If the task has already been running during this runqueue's
+ * major rotation (rq->prio_rotation) then it continues at the same
+ * priority if it has tick entitlement left. If it does not have entitlement
+ * left, it finds the next priority slot according to its nice value that it
+ * has not yet extracted quota from. If it has not run during this major
+ * rotation, it starts at its static priority and has its bitmap quota
+ * cleared. If it does not have any slots left, all its slots are reset and
+ * it is queued on the expired array at its static priority.
+ */
+static void recalc_task_prio(struct task_struct *p, struct rq *rq)
 {
-        list_add(&p->run_list, array->queue + p->prio);
-        __set_bit(p->prio, array->bitmap);
-        array->nr_active++;
+        struct prio_array *array = rq->active;
+        int queue_prio, search_prio;
+
+        if (p->rotation == rq->prio_rotation && p->array == array) {
+                if (p->time_slice && rq_quota(rq, p->prio))
+                        return;
+        } else
+                task_new_array(p, rq);
+        search_prio = p->static_prio;
+
+        /*
+         * SCHED_BATCH tasks never start at better priority than any other
+         * task that is already running since they are flagged as latency
+         * insensitive. This means they never cause greater latencies in
+         * other non SCHED_BATCH tasks of the same nice level.
+         */
+        if (unlikely(p->policy == SCHED_BATCH))
+                search_prio = max(p->static_prio, rq->prio_level);
+        queue_prio = SCHED_PRIO(find_next_zero_bit(p->bitmap, PRIO_RANGE,
+                USER_PRIO(search_prio)));
+        if (queue_prio == MAX_PRIO) {
+                queue_prio = p->static_prio;
+                array = rq->expired;
+                bitmap_zero(p->bitmap, PRIO_RANGE);
+        } else
+                rq_quota(rq, queue_prio) += p->quota;
+        p->prio = p->normal_prio = queue_prio;
         p->array = array;
+        set_task_entitlement(p);
 }
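The core of recalc_task_prio() is the slot search: each task may draw quota
once per major rotation from each dynamic priority at or below its static
priority. A simplified sketch, assuming USER_PRIO()/SCHED_PRIO() are the
usual MAX_RT_PRIO offset conversions and using a byte array in place of the
real p->bitmap:

#define MAX_RT_PRIO     100
#define PRIO_RANGE      40
#define MAX_PRIO        (MAX_RT_PRIO + PRIO_RANGE)
#define USER_PRIO(p)    ((p) - MAX_RT_PRIO)
#define SCHED_PRIO(u)   ((u) + MAX_RT_PRIO)

/*
 * Return the dynamic prio a task would take, or MAX_PRIO when every
 * slot from its static prio onward is used up this major rotation, in
 * which case the real code resets the bitmap and expires the task.
 */
static int search_queue_prio(const unsigned char used[PRIO_RANGE],
                             int static_prio)
{
        for (int u = USER_PRIO(static_prio); u < PRIO_RANGE; u++)
                if (!used[u])
                        return SCHED_PRIO(u);
        return MAX_PRIO;
}

So a nice -20 task can take quota at all 40 levels before expiring, while a
nice +19 task has only the last slot; that difference is what turns nice into
a deadline.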

 /*
- * __normal_prio - return the priority that is based on the static
- * priority but is modified by bonuses/penalties.
- *
- * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
- * into the -5 ... 0 ... +5 bonus/penalty range.
- *
- * We use 25% of the full 0...39 priority range so that:
- *
- * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
- * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
- *
- * Both properties are important to certain workloads.
+ * Adding to a runqueue. The dynamic priority queue that a task is added to
+ * is determined by the priority rotation of the runqueue it is being added
+ * to and the quota still available in the task in p->bitmap and
+ * p->time_slice (see recalc_task_prio above). The rq static_bitmap stores
+ * a list of the static priorities queued, and prio_queued the number of
+ * tasks stored at each p->static_prio level.
  */
+static inline void __enqueue_task(struct task_struct *p, struct rq *rq)
+{
+        if (rt_task(p))
+                p->array = rq->active;
+        else
+                recalc_task_prio(p, rq);
+        rq->prio_queued[p->static_prio]++;
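__enqueue_task() increments rq->prio_queued[] here, and the hunk is cut off
before any bit-setting is visible, but paired with the decrement in
dequeue_task() above the intended invariant is clear: a static_bitmap bit is
set exactly while its count is non-zero. A standalone sketch of that pairing
(the names mirror the rq fields; none of this is the patch's code):

#define MAX_PRIO        140     /* assumed, as above */

struct rq_sketch {
        unsigned long prio_queued[MAX_PRIO];    /* tasks per static prio */
        unsigned char static_bitmap[MAX_PRIO];  /* one byte per bit */
};

/* A 0 -> 1 count transition sets the bit... */
static void enqueue_static(struct rq_sketch *rq, int static_prio)
{
        if (!rq->prio_queued[static_prio]++)
                rq->static_bitmap[static_prio] = 1;
}

/* ...and a 1 -> 0 transition clears it, mirroring dequeue_task() above. */
static void dequeue_static(struct rq_sketch *rq, int static_prio)
{
        if (!--rq->prio_queued[static_prio])
                rq->static_bitmap[static_prio] = 0;
}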

 /*
  * Calculate the current priority, i.e. the priority
  * taken into account by the scheduler. This value might
- * be boosted by RT tasks, or might be boosted by
- * interactivity modifiers. Will be RT if the task got
+ * be boosted by RT tasks as it will be RT if the task got
  * RT-boosted. If not then it returns p->normal_prio.
  */
 static int effective_prio(struct task_struct *p)
@@ -842,111 +886,26 @@ static int effective_prio(struct task_st
 }

-        p->prio = recalc_task_prio(p, now);
-
-        /*
-         * This checks to make sure it's not an uninterruptible task
-         * that is now waking up.
-         */
-        if (p->sleep_type == SLEEP_NORMAL) {
-                /*
-                 * Tasks which were woken up by interrupts (ie. hw events)
-                 * are most likely of interactive nature. So we give them
-                 * the credit of extending their sleep time to the period
-                 * of time they spend on the runqueue, waiting for execution
-                 * on a CPU, first time around:
-                 */
-                if (in_interrupt())
-                        p->sleep_type = SLEEP_INTERRUPTED;
-                else {
-                        /*
-                         * Normal first-time wakeups get a credit too for
-                         * on-runqueue time, but it will be weighted down:
-                         */
-                        p->sleep_type = SLEEP_INTERACTIVE;
-                }
-        }
+        p->quota = rr_interval(p);
+        p->prio = effective_prio(p);
         p->timestamp = now;
-out:
         __activate_task(p, rq);
 }
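rr_interval() is used to refill the quota on wakeup but its body is not in
this excerpt; presumably it maps the task's nice level to a tick entitlement.
A purely hypothetical stand-in, only to show the shape of such a mapping:

/* Hypothetical: the patch's real rr_interval() is not shown here. */
#define RR_INTERVAL_BASE 6      /* assumed base entitlement, in ticks */

static unsigned int rr_interval_sketch(int nice)    /* -20 <= nice <= 19 */
{
        /* higher priority (more negative nice) -> larger per-level quota */
        return RR_INTERVAL_BASE * (unsigned int)(20 - nice) / 20 + 1;
}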

         /*
-         * Tasks that have marked their sleep as noninteractive get
-         * woken up with their sleep average not weighted in an
-         * interactive way.
-         */
-        if (old_state & TASK_NONINTERACTIVE)
-                p->sleep_type = SLEEP_NONINTERACTIVE;
-
-
-        activate_task(p, rq, cpu == this_cpu);
-        /*
          * Sync wakeups (i.e. those types of wakeups where the waker
          * has indicated that it will leave the CPU in short order)
          * don't trigger a preemption, if the woken up task will run on
@@ -1541,10 +1474,9 @@ out_activate:
          * the waker guarantees that the freshly woken up task is going
          * to be considered on this CPU.)
          */
-        if (!sync || cpu != this_cpu) {
-                if (TASK_PREEMPTS_CURR(p, rq))
-                        resched_task(rq->curr);
-        }
+        activate_task(p, rq, cpu == this_cpu);
+        if (!sync || cpu != this_cpu)
+                try_preempt(p, rq);

         success = 1;
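try_preempt() is likewise not shown in this excerpt. Given that it replaces
the open-coded TASK_PREEMPTS_CURR() test, it presumably compares the woken
task's priority against the running one. A minimal sketch under that
assumption (lower numeric prio is better):

struct task_sketch { int prio; };       /* lower value = better priority */
struct rq_sketch { struct task_sketch *curr; int resched; };

static void try_preempt_sketch(struct task_sketch *p, struct rq_sketch *rq)
{
        /* preempt only on a strict priority win, as the old test did */
        if (p->prio < rq->curr->prio)
                rq->resched = 1;        /* stands in for resched_task() */
}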

 /*
- * We place interactive tasks back into the active array, if possible.
- *
- * To guarantee that this does not starve expired tasks we ignore the
- * interactivity of a task if the first expired task had to wait more
- * than a 'reasonable' amount of time. This deadline timeout is
- * load-dependent, as the frequency of array switched decreases with
- * increasing number of running tasks. We also ignore the interactivity
- * if a better static_prio task has expired:
- */
-static inline int expired_starving(struct rq *rq)
-{
-        if (rq->curr->static_prio > rq->best_expired_prio)
-                return 1;
-        if (!STARVATION_LIMIT || !rq->expired_timestamp)
-                return 0;
-        if (jiffies - rq->expired_timestamp > STARVATION_LIMIT * rq->nr_running)
-                return 1;
-        return 0;
-}
-
-/*
  * Account user cpu time to a process.
  * @p: the process that the cpu time gets accounted to
  * @hardirq_offset: the offset to subtract from hardirq_count()
@@ -3129,87 +3018,137 @@ void account_steal_time(struct task_stru
         cpustat->steal = cputime64_add(cpustat->steal, tmp);
 }

+/*
+ * The task has used up its quota of running in this prio_level so it must
+ * be dropped a priority level, all managed by recalc_task_prio().
+ */
+static void task_expired_entitlement(struct rq *rq, struct task_struct *p)
+{
+        struct prio_array *old_array;
+        int old_prio;
+
+        set_tsk_need_resched(p);
+        if (unlikely(p->first_time_slice))
+                p->first_time_slice = 0;
+        if (rt_task(p)) {
+                p->time_slice = p->quota;
+                return;
+        }
+        old_array = p->array;
+        old_prio = p->prio;
+        /* p->prio and p->array will be updated in recalc_task_prio */
+        recalc_task_prio(p, rq);
+        requeue_task(p, rq, old_array, old_prio);
+}
+
+/*
+ * A major priority rotation occurs when all priority quotas for this array
+ * have been exhausted.
+ */
+static inline void major_prio_rotation(struct rq *rq)
+{
+        struct prio_array *new_array = rq->expired;
+
+        rq->expired = rq->active;
+        rq->active = new_array;
+        rq->prio_rotation++;
+        bitmap_zero(rq->dyn_bitmap, MAX_DYN_PRIO);
+        bitmap_copy(rq->dyn_bitmap, rq->static_bitmap, MAX_PRIO);
+        __set_bit(MAX_DYN_PRIO, rq->dyn_bitmap);
+}
+
+/*
+ * This is the heart of the virtual deadline priority management.
+ *
+ * We have used up the quota allocated to this priority level so we rotate
+ * the prio_level of the runqueue to the next lower priority. We merge any
+ * remaining tasks at this level's current queue with the next priority and
+ * reset this level's queue. MAX_PRIO - 1 is a special case where we
+ * perform a major rotation.
+ */
+static inline void rotate_runqueue_priority(struct rq *rq)
+{
+        int new_prio_level, remaining_quota = rq_quota(rq, rq->prio_level);
+        struct prio_array *array = rq->active;
+
+        if (rq->prio_level > MAX_PRIO - 2) {
+                /* Major rotation required */
+                struct prio_array *new_queue = rq->expired;
+
+                /*
+                 * The static_bitmap gives us the highest p->static_prio
+                 * task that is queued. This value is used as the prio
+                 * after the major rotation and all tasks remaining on this
+                 * active queue are moved there. This means tasks can end
+                 * up with a p->prio better than their p->static_prio.
+                 */
+                new_prio_level = find_next_bit(rq->static_bitmap, MAX_PRIO,
+                        MAX_RT_PRIO);
+                if (!list_empty(array->queue + rq->prio_level)) {
+                        list_splice_tail_init(array->queue + rq->prio_level,
+                                new_queue->queue + new_prio_level);
+                }
+                memset(rq->prio_quota, 0, sizeof(rq->prio_quota));
+                major_prio_rotation(rq);
+        } else {
+                /* Minor rotation */
+                new_prio_level = rq->prio_level + 1;
+                __clear_bit(rq->prio_level, rq->dyn_bitmap);
+                if (!list_empty(array->queue + rq->prio_level)) {
+                        list_splice_tail_init(array->queue + rq->prio_level,
                                array->queue + new_prio_level);
+                        __set_bit(new_prio_level, rq->dyn_bitmap);
+                }
+                rq_quota(rq, rq->prio_level) = 0;
+        }
+        rq->prio_level = new_prio_level;
+        /*
+         * While we usually rotate with the rq quota being 0, it is possible
+         * to be negative so we subtract any deficit from the new level.
+         */
+        rq_quota(rq, new_prio_level) += remaining_quota;
+}
+
 static void task_running_tick(struct rq *rq, struct task_struct *p)
 {
-        if (p->array != rq->active) {
+        if (unlikely(!task_queued(p))) {
                 /* Task has expired but was not scheduled yet */
                 set_tsk_need_resched(p);
                 return;
         }
+        /* SCHED_FIFO tasks never run out of timeslice. */
+        if (unlikely(p->policy == SCHED_FIFO))
+                return;
+        spin_lock(&rq->lock);
         /*
-         * The task was running during this tick - update the
-         * time slice counter. Note: we do not update a thread's
-         * priority until it either goes to sleep or uses up its
-         * timeslice. This makes it possible for interactive tasks
-         * to use up their timeslices at their highest priority levels.
+         * Accounting is performed by both the task and the runqueue. This
+         * allows frequently sleeping tasks to get their proper quota of
+         * cpu as the runqueue will have their quota still available at
+         * the appropriate priority level. It also means frequently waking
+         * tasks that might miss the scheduler_tick() will get forced down
+         * in priority regardless.
+         */
+        if (!--p->time_slice)
+                task_expired_entitlement(rq, p);
+        /*
+         * The rq quota can become negative due to a task being queued in
+         * the scheduler without any quota left at that priority level. It
+         * is cheaper to allow it to run till this scheduler tick and then
+         * subtract it from the quota of the merged queues.
          */
-        if (rt_task(p)) {
-                /*
-                 * RR tasks need a special form of timeslice management.
-                 * FIFO tasks have no timeslices.
-                 */
-                if ((p->policy == SCHED_RR) && !--p->time_slice) {
-                        p->time_slice = task_timeslice(p);
+        if (!rt_task(p) && --rq_quota(rq, rq->prio_level) <= 0) {
+                if (unlikely(p->first_time_slice))
                         p->first_time_slice = 0;
-                        set_tsk_need_resched(p);
-
-                        /* put it at the end of the queue: */
-                        requeue_task(p, rq->active);
-                }
-                goto out_unlock;
-        }
-        if (!--p->time_slice) {
-                dequeue_task(p, rq->active);
+                rotate_runqueue_priority(rq);
                 set_tsk_need_resched(p);
-                p->prio = effective_prio(p);
-                p->time_slice = task_timeslice(p);
-                p->first_time_slice = 0;
-
-                if (!rq->expired_timestamp)
-                        rq->expired_timestamp = jiffies;
-                if (!TASK_INTERACTIVE(p) || expired_starving(rq)) {
-                        enqueue_task(p, rq->expired);
-                        if (p->static_prio < rq->best_expired_prio)
-                                rq->best_expired_prio = p->static_prio;
-                } else
-                        enqueue_task(p, rq->active);
-        } else {
-                /*
-                 * Prevent a too long timeslice allowing a task to monopolize
-                 * the CPU. We do this by splitting up the timeslice into
-                 * smaller pieces.
-                 *
-                 * Note: this does not mean the task's timeslices expire or
-                 * get lost in any way, they just might be preempted by
-                 * another task of equal priority. (one with higher
-                 * priority would have preempted this task already.) We
-                 * requeue this task to the end of the list on this priority
-                 * level, which is in essence a round-robin of tasks with
-                 * equal priority.
-                 *
-                 * This only applies to tasks in the interactive
-                 * delta range with at least TIMESLICE_GRANULARITY to requeue.
-                 */
-                if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
-                        p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
-                        (p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
-                        (p->array == rq->active)) {
-
-                        requeue_task(p, rq->active);
-                        set_tsk_need_resched(p);
-                }
         }
-out_unlock:
         spin_unlock(&rq->lock);
 }
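The quota carry-over at the bottom of rotate_runqueue_priority() is easiest
to see with numbers: a level can finish with negative quota because a task
queued without quota still runs until the next tick, and that deficit is
charged to the level the leftovers merge into. A self-contained userspace
sketch of a minor rotation (constants assumed as before):

#include <stdio.h>

#define MAX_RT_PRIO     100
#define MAX_PRIO        140

static long prio_quota[MAX_PRIO - MAX_RT_PRIO];
static int prio_level = MAX_RT_PRIO;

static void minor_rotate(void)
{
        long remaining = prio_quota[prio_level - MAX_RT_PRIO];

        prio_quota[prio_level - MAX_RT_PRIO] = 0;
        prio_level++;
        /* a negative 'remaining' shrinks the next level's entitlement */
        prio_quota[prio_level - MAX_RT_PRIO] += remaining;
}

int main(void)
{
        prio_quota[0] = -2;     /* level 100 overran by two ticks */
        prio_quota[1] = 10;     /* quota already queued at level 101 */
        minor_rotate();
        printf("level %d quota: %ld\n", prio_level,
               prio_quota[prio_level - MAX_RT_PRIO]);
        return 0;
}

This prints "level 101 quota: 8": the two overrun ticks are paid back out of
the next level's entitlement rather than being forgotten.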

 /*
  * This function gets called by the timer code, with HZ frequency.
  * We call it with interrupts disabled.
- *
- * It also gets called by the fork code, when changing the parent's
- * timeslices.
  */
 void scheduler_tick(void)
 {
@@ -3271,6 +3210,12 @@ static void wake_sleeping_dependent(int
         }
 }

         unsigned long policy;
         cpumask_t cpus_allowed;
         unsigned int time_slice, first_time_slice;
+        /*
+         * How much this task is entitled to run at the current priority
+         * before being requeued at a lower priority, and is this the very
+         * first time_slice this task has ever run.
+         */
+        unsigned int quota;
+        /*
+         * How much this task contributes to the current priority queue
+         * length
+         */
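Tying the new field to the runqueue side: per recalc_task_prio() in the
sched.c hunks above, a task contributes p->quota to the level's rq_quota each
time it takes a fresh slot, and set_task_entitlement() (not shown in this
excerpt) presumably refills p->time_slice from p->quota. A sketch of that
relationship; take_slot() and both struct names are hypothetical:

struct task_sketch {
        unsigned int time_slice;   /* ticks left at the current level */
        unsigned int quota;        /* per-level entitlement from nice */
};

static void take_slot(struct task_sketch *p, long *level_quota)
{
        *level_quota += p->quota;  /* as rq_quota(rq, prio) += p->quota */
        p->time_slice = p->quota;  /* assumed set_task_entitlement() effect */
}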

 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
         struct sched_info sched_info;

-- 
-ck