Index: linux-2.6.22-ck1/include/linux/sched.h =================================================================== --- linux-2.6.22-ck1.orig/include/linux/sched.h 2007-07-10 14:55:00.000000000 +1000 +++ linux-2.6.22-ck1/include/linux/sched.h 2007-07-10 14:55:21.000000000 +1000 @@ -34,9 +34,14 @@ #define SCHED_FIFO 1 #define SCHED_RR 2 #define SCHED_BATCH 3 +#define SCHED_ISO 4 +#define SCHED_IDLEPRIO 5 #ifdef __KERNEL__ +#define SCHED_MAX SCHED_IDLEPRIO +#define SCHED_RANGE(policy) ((policy) <= SCHED_MAX) + struct sched_param { int sched_priority; }; @@ -129,7 +134,7 @@ extern unsigned long nr_active(void); extern unsigned long nr_iowait(void); extern unsigned long weighted_cpuload(const int cpu); - +extern int above_background_load(void); /* * Task state bitmask. NOTE! These bits are also @@ -150,8 +155,7 @@ #define EXIT_ZOMBIE 16 #define EXIT_DEAD 32 /* in tsk->state again */ -#define TASK_NONINTERACTIVE 64 -#define TASK_DEAD 128 +#define TASK_DEAD 64 #define __set_task_state(tsk, state_value) \ do { (tsk)->state = (state_value); } while (0) @@ -537,14 +541,19 @@ #define MAX_USER_RT_PRIO 100 #define MAX_RT_PRIO MAX_USER_RT_PRIO +#define PRIO_RANGE (40) +#define ISO_PRIO (MAX_RT_PRIO - 1) -#define MAX_PRIO (MAX_RT_PRIO + 40) +#define MAX_PRIO (MAX_RT_PRIO + PRIO_RANGE) -#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO) +#define rt_prio(prio) unlikely((prio) < ISO_PRIO) #define rt_task(p) rt_prio((p)->prio) #define batch_task(p) (unlikely((p)->policy == SCHED_BATCH)) -#define is_rt_policy(p) ((p) != SCHED_NORMAL && (p) != SCHED_BATCH) +#define is_rt_policy(policy) ((policy) == SCHED_FIFO || \ + (policy) == SCHED_RR) #define has_rt_policy(p) unlikely(is_rt_policy((p)->policy)) +#define iso_task(p) unlikely((p)->policy == SCHED_ISO) +#define idleprio_task(p) unlikely((p)->policy == SCHED_IDLEPRIO) /* * Some day this will be a full-fledged user tracking system.. @@ -809,13 +818,6 @@ struct pipe_inode_info; struct uts_namespace; -enum sleep_type { - SLEEP_NORMAL, - SLEEP_NONINTERACTIVE, - SLEEP_INTERACTIVE, - SLEEP_INTERRUPTED, -}; - struct prio_array; struct task_struct { @@ -835,20 +837,33 @@ int load_weight; /* for niceness load balancing purposes */ int prio, static_prio, normal_prio; struct list_head run_list; + /* + * This bitmap shows what priorities this task has received quota + * from for this major priority rotation on its current runqueue. + */ + DECLARE_BITMAP(bitmap, PRIO_RANGE + 1); struct prio_array *array; + /* Which major runqueue rotation did this task run */ + unsigned long rotation; unsigned short ioprio; #ifdef CONFIG_BLK_DEV_IO_TRACE unsigned int btrace_seq; #endif - unsigned long sleep_avg; unsigned long long timestamp, last_ran; unsigned long long sched_time; /* sched_clock time spent running */ - enum sleep_type sleep_type; unsigned int policy; cpumask_t cpus_allowed; - unsigned int time_slice, first_time_slice; + /* + * How much this task is entitled to run at the current priority + * before being requeued at a lower priority. + */ + int time_slice; + /* Is this the very first time_slice this task has ever run. 
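For reference, the two new policies defined at the top of this hunk can be requested from ordinary userspace code with sched_setscheduler(2); glibc's headers know nothing about them, so a test program has to repeat the numeric values from the #defines above. The sketch below is illustrative only and assumes a kernel carrying this patch; on a mainline kernel the call fails with EINVAL.

/* Request SCHED_ISO for the calling process.  The policy numbers are
 * copied from the sched.h hunk above; they exist only on -ck kernels. */
#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>

#define SCHED_ISO       4
#define SCHED_IDLEPRIO  5       /* use this instead for an idle-only task */

int main(void)
{
        struct sched_param sp = { .sched_priority = 0 };

        if (sched_setscheduler(0, SCHED_ISO, &sp) == -1) {
                fprintf(stderr, "SCHED_ISO not available: %s\n",
                        strerror(errno));
                return 1;
        }
        printf("running with policy %d (SCHED_ISO)\n", sched_getscheduler(0));
        return 0;
}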
*/ + unsigned int first_time_slice; + /* How much this task receives at each priority level */ + int quota; #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) struct sched_info sched_info; @@ -1013,6 +1028,7 @@ struct held_lock held_locks[MAX_LOCK_DEPTH]; unsigned int lockdep_recursion; #endif + unsigned long mutexes_held; /* journalling filesystem info */ void *journal_info; @@ -1181,9 +1197,11 @@ #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ #define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */ #define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */ +#define PF_ISOREF 0x04000000 /* SCHED_ISO task has used up quota */ #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezeable */ +#define PF_NONSLEEP 0x80000000 /* Waiting on in-kernel activity */ /* * Only the _current_ task can read/write to tsk->flags, but other @@ -1253,7 +1271,7 @@ #endif extern void set_user_nice(struct task_struct *p, long nice); -extern int task_prio(const struct task_struct *p); +extern int task_prio(struct task_struct *p); extern int task_nice(const struct task_struct *p); extern int can_nice(const struct task_struct *p, const int nice); extern int task_curr(const struct task_struct *p); Index: linux-2.6.22-ck1/kernel/sched.c =================================================================== --- linux-2.6.22-ck1.orig/kernel/sched.c 2007-07-10 14:55:00.000000000 +1000 +++ linux-2.6.22-ck1/kernel/sched.c 2007-07-10 14:55:24.000000000 +1000 @@ -16,6 +16,7 @@ * by Davide Libenzi, preemptible kernel bits by Robert Love. * 2003-09-03 Interactivity tuning by Con Kolivas. * 2004-04-02 Scheduler domains code by Nick Piggin + * 2007-03-02 Staircase deadline scheduling policy by Con Kolivas */ #include @@ -53,8 +54,9 @@ #include #include #include - +#include #include + #include /* @@ -84,147 +86,85 @@ #define USER_PRIO(p) ((p)-MAX_RT_PRIO) #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) +#define SCHED_PRIO(p) ((p)+MAX_RT_PRIO) -/* - * Some helpers for converting nanosecond timing to jiffy resolution - */ -#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) +/* Some helpers for converting to/from various scales.*/ #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) - -/* - * These are the 'tuning knobs' of the scheduler: - * - * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger), - * default timeslice is 100 msecs, maximum timeslice is 800 msecs. - * Timeslices get refilled after they expire. - */ -#define MIN_TIMESLICE max(5 * HZ / 1000, 1) -#define DEF_TIMESLICE (100 * HZ / 1000) -#define ON_RUNQUEUE_WEIGHT 30 -#define CHILD_PENALTY 95 -#define PARENT_PENALTY 100 -#define EXIT_WEIGHT 3 -#define PRIO_BONUS_RATIO 25 -#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100) -#define INTERACTIVE_DELTA 2 -#define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS) -#define STARVATION_LIMIT (MAX_SLEEP_AVG) -#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) - -/* - * If a task is 'interactive' then we reinsert it in the active - * array after it has expired its current timeslice. (it will not - * continue to run immediately, it will still roundrobin with - * other interactive tasks.) - * - * This part scales the interactivity limit depending on niceness. - * - * We scale it linearly, offset by the INTERACTIVE_DELTA delta. 
- * Here are a few examples of different nice levels: - * - * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0] - * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0] - * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0] - * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0] - * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0] - * - * (the X axis represents the possible -5 ... 0 ... +5 dynamic - * priority range a task can explore, a value of '1' means the - * task is rated interactive.) - * - * Ie. nice +19 tasks can never get 'interactive' enough to be - * reinserted into the active array. And only heavily CPU-hog nice -20 - * tasks will be expired. Default nice 0 tasks are somewhere between, - * it takes some effort for them to get interactive, but it's not - * too hard. - */ - -#define CURRENT_BONUS(p) \ - (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \ - MAX_SLEEP_AVG) - -#define GRANULARITY (10 * HZ / 1000 ? : 1) - -#ifdef CONFIG_SMP -#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ - (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \ - num_online_cpus()) -#else -#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ - (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1))) -#endif - -#define SCALE(v1,v1_max,v2_max) \ - (v1) * (v2_max) / (v1_max) - -#define DELTA(p) \ - (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \ - INTERACTIVE_DELTA) - -#define TASK_INTERACTIVE(p) \ - ((p)->prio <= (p)->static_prio - DELTA(p)) - -#define INTERACTIVE_SLEEP(p) \ - (JIFFIES_TO_NS(MAX_SLEEP_AVG * \ - (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) - -#define TASK_PREEMPTS_CURR(p, rq) \ - ((p)->prio < (rq)->curr->prio) - -#define SCALE_PRIO(x, prio) \ - max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) - -static unsigned int static_prio_timeslice(int static_prio) -{ - if (static_prio < NICE_TO_PRIO(0)) - return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio); - else - return SCALE_PRIO(DEF_TIMESLICE, static_prio); -} - -#ifdef CONFIG_SMP -/* - * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) - * Since cpu_power is a 'constant', we can use a reciprocal divide. +#define MS_TO_NS(TIME) ((TIME) * 1000000) +#define MS_TO_US(TIME) ((TIME) * 1000) +#define US_TO_MS(TIME) ((TIME) / 1000) + +#define TASK_PREEMPTS_CURR(p, curr) ((p)->prio < (curr)->prio) + +/* + * This is the time all tasks within the same priority round robin. + * Value is in ms and set to a minimum of 10ms. Scales with number of cpus. + * Tunable via /proc interface. + */ +int rr_interval __read_mostly = 6; +int sched_interactive __read_mostly = 1; + +/* + * sched_iso_cpu - sysctl which determines the cpu percentage SCHED_ISO tasks + * are allowed to run (over ISO_PERIOD seconds) as real time tasks. + * sched_iso_period - sysctl which determines the number of seconds over + * which cpu usage of SCHED_ISO tasks is averaged to determine if they are + * exceeding their allowable bandwidth. +*/ +int sched_iso_cpu __read_mostly = 80; +int sched_iso_period __read_mostly = 5; + +#define ISO_PERIOD ((sched_iso_period * HZ) + 1) + +/* + * This contains a bitmap for each dynamic priority level with empty slots + * for the valid priorities each different nice level can have. It allows + * us to stagger the slots where differing priorities run in a way that + * keeps latency differences between different nice levels at a minimum. + * The purpose of a pre-generated matrix is for rapid lookup of next slot in + * O(1) time without having to recalculate every time priority gets demoted. 
+ * All nice levels use priority slot 39 as this allows less niced tasks to + * get all priority slots better than that before expiration is forced. + * ie, where 0 means a slot for that priority, priority running from left to + * right is from prio 0 to prio 39: + * nice -20 0000000000000000000000000000000000000000 + * nice -10 1000100010001000100010001000100010010000 + * nice 0 1010101010101010101010101010101010101010 + * nice 5 1011010110110101101101011011010110110110 + * nice 10 1110111011101110111011101110111011101110 + * nice 15 1111111011111110111111101111111011111110 + * nice 19 1111111111111111111111111111111111111110 */ -static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load) -{ - return reciprocal_divide(load, sg->reciprocal_cpu_power); -} +static unsigned long prio_matrix[PRIO_RANGE][BITS_TO_LONGS(PRIO_RANGE)] + __read_mostly; -/* - * Each time a sched group cpu_power is changed, - * we must compute its reciprocal value - */ -static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val) -{ - sg->__cpu_power += val; - sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power); -} -#endif +struct rq; /* - * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] - * to time slice values: [800ms ... 100ms ... 5ms] - * - * The higher a thread's priority, the bigger timeslices - * it gets during one round of execution. But even the lowest - * priority thread gets MIN_TIMESLICE worth of execution time. + * These are the runqueue data structures: */ +struct prio_array { + /* Tasks queued at each priority */ + struct list_head queue[MAX_PRIO + 1]; -static inline unsigned int task_timeslice(struct task_struct *p) -{ - return static_prio_timeslice(p->static_prio); -} + /* + * The bitmap of priorities queued for this array. While the expired + * array will never have realtime tasks on it, it is simpler to have + * equal sized bitmaps for a cheap array swap. Include 1 bit for + * delimiter. + */ + DECLARE_BITMAP(prio_bitmap, MAX_PRIO + 1); -/* - * These are the runqueue data structures: - */ + /* + * The best static priority (of the dynamic priority tasks) queued + * this array. + */ + int best_static_prio; -struct prio_array { - unsigned int nr_active; - DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */ - struct list_head queue[MAX_PRIO]; +#ifdef CONFIG_SMP + /* For convenience looks back at rq */ + struct rq *rq; +#endif }; /* @@ -260,14 +200,28 @@ */ unsigned long nr_uninterruptible; - unsigned long expired_timestamp; /* Cached timestamp set by update_cpu_clock() */ unsigned long long most_recent_timestamp; struct task_struct *curr, *idle; unsigned long next_balance; struct mm_struct *prev_mm; - struct prio_array *active, *expired, arrays[2]; - int best_expired_prio; + + struct prio_array *active, *expired, *idleprio, arrays[2]; + unsigned long *dyn_bitmap, *exp_bitmap; + + /* + * The current dynamic priority level this runqueue is at per static + * priority level. + */ + int prio_level[PRIO_RANGE]; + + /* How many times we have rotated the priority queue */ + unsigned long prio_rotation; + unsigned long iso_ticks; + unsigned short iso_refractory; + + /* Number of idleprio tasks running */ + unsigned long nr_idleprio; atomic_t nr_iowait; #ifdef CONFIG_SMP @@ -606,12 +560,9 @@ #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) /* * Called when a process is dequeued from the active array and given - * the cpu. 
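The code that populates prio_matrix[] is not part of this excerpt, but a layout with the properties the comment above describes (40 - user_prio open slots per row, spread evenly, slot 39 open for every nice level) can be reproduced with a simple even-spread rule. The userspace sketch below is an approximation for illustration only; the exact slot placement the patch generates may differ by a position here and there.

/* Print an approximate prio_matrix layout: for user priority 'up'
 * (0 = nice -20 ... 39 = nice 19), leave PRIO_RANGE - up slots open
 * ('0') and spread them evenly over the 40 dynamic priority slots.
 * Slot 39 always comes out open, as the comment above requires. */
#include <stdio.h>

#define PRIO_RANGE 40

static void print_row(int up)
{
        int open = PRIO_RANGE - up;     /* number of usable slots */
        int i;

        printf("nice %3d  ", up - 20);
        for (i = 0; i < PRIO_RANGE; i++) {
                /* a slot is open wherever the running total of
                 * open/PRIO_RANGE crosses a whole number */
                int slot_open = ((i + 1) * open / PRIO_RANGE) >
                                (i * open / PRIO_RANGE);
                putchar(slot_open ? '0' : '1');
        }
        putchar('\n');
}

int main(void)
{
        int nice;

        for (nice = -20; nice <= 19; nice++)
                print_row(nice + 20);
        return 0;
}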
We should note that with the exception of interactive - * tasks, the expired queue will become the active queue after the active - * queue is empty, without explicitly dequeuing and requeuing tasks in the - * expired queue. (Interactive tasks may be requeued directly to the - * active queue, thus delaying tasks in the expired queue from running; - * see scheduler_tick()). + * the cpu. We should note that the expired queue will become the active + * queue after the active queue is empty, without explicitly dequeuing and + * requeuing tasks in the expired queue. * * This function is only called from sched_info_arrive(), rather than * dequeue_task(). Even though a task may be queued and dequeued multiple @@ -709,71 +660,304 @@ #define sched_info_switch(t, next) do { } while (0) #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ +static int idleprio_suitable(struct task_struct *p) +{ + return (!p->mutexes_held && !freezing(p) && !signal_pending(p) && + !(p->flags & (PF_NONSLEEP | PF_EXITING))); +} + +static int idleprio(const struct task_struct *p) +{ + return (p->prio == MAX_PRIO); +} + +static inline int task_queued(struct task_struct *task) +{ + return !list_empty(&task->run_list); +} + +static inline void set_dynamic_bit(struct task_struct *p, struct rq *rq) +{ + __set_bit(p->prio, p->array->prio_bitmap); +} + /* - * Adding/removing a task to/from a priority array: + * Removing from a runqueue. */ -static void dequeue_task(struct task_struct *p, struct prio_array *array) +static void dequeue_task(struct task_struct *p, struct rq *rq) { - array->nr_active--; - list_del(&p->run_list); - if (list_empty(array->queue + p->prio)) - __clear_bit(p->prio, array->bitmap); + list_del_init(&p->run_list); + if (idleprio_task(p) && idleprio(p)) + rq->nr_idleprio--; + else if (list_empty(p->array->queue + p->prio)) + __clear_bit(p->prio, p->array->prio_bitmap); } -static void enqueue_task(struct task_struct *p, struct prio_array *array) +static void reset_first_time_slice(struct task_struct *p) { - sched_info_queued(p); - list_add_tail(&p->run_list, array->queue + p->prio); - __set_bit(p->prio, array->bitmap); - array->nr_active++; + if (unlikely(p->first_time_slice)) + p->first_time_slice = 0; +} + +/* + * The task is being queued on a fresh array so it has its entitlement + * bitmap cleared. + */ +static void task_new_array(struct task_struct *p, struct rq *rq, + struct prio_array *array) +{ + bitmap_zero(p->bitmap, PRIO_RANGE); + p->rotation = rq->prio_rotation; + p->time_slice = p->quota; p->array = array; + reset_first_time_slice(p); +} + +/* Find the first slot from the relevant prio_matrix entry */ +static int first_prio_slot(struct task_struct *p) +{ + if (unlikely(p->policy == SCHED_BATCH)) + return p->static_prio; + return SCHED_PRIO(find_first_zero_bit( + prio_matrix[USER_PRIO(p->static_prio)], PRIO_RANGE)); } /* - * Put task to the end of the run list without the overhead of dequeue - * followed by enqueue. + * In sched_interactive mode priority allocation occurs per process per rq + * array swap. In !sched_interactive mode all waking tasks must obey the + * current prio level of all other tasks running per array swap. 
*/ -static void requeue_task(struct task_struct *p, struct prio_array *array) +static int minprio(struct rq *rq, int uprio) { - list_move_tail(&p->run_list, array->queue + p->prio); + if (sched_interactive) + return MAX_RT_PRIO; + return rq->prio_level[uprio]; } -static inline void -enqueue_task_head(struct task_struct *p, struct prio_array *array) +/* + * Find the first unused slot by this task that is also in its prio_matrix + * level. SCHED_BATCH tasks do not use the priority matrix. They only take + * priority slots from their static_prio and above. + */ +static int next_entitled_slot(struct task_struct *p, struct rq *rq) { - list_add(&p->run_list, array->queue + p->prio); - __set_bit(p->prio, array->bitmap); - array->nr_active++; - p->array = array; + int search_prio = MAX_RT_PRIO, uprio = USER_PRIO(p->static_prio); + struct prio_array *array = rq->active; + DECLARE_BITMAP(tmp, PRIO_RANGE); + + /* + * Go straight to expiration if there are higher priority tasks + * already expired. + */ + if (p->static_prio > rq->expired->best_static_prio) + return MAX_PRIO; + if (!rq->prio_level[uprio]) + rq->prio_level[uprio] = MAX_RT_PRIO; + /* + * Only priorities equal to the prio_level and above for their + * static_prio are acceptable, and only if it's not better than + * a queued better static_prio's prio_level. + */ + if (p->static_prio < array->best_static_prio) { + if (likely(p->policy != SCHED_BATCH)) + array->best_static_prio = p->static_prio; + } else if (p->static_prio == array->best_static_prio) { + search_prio = minprio(rq, uprio); + } else { + int i; + + search_prio = minprio(rq, uprio); + /* A bound O(n) function, worst case n is 40 */ + for (i = array->best_static_prio; i <= p->static_prio ; i++) { + if (!rq->prio_level[USER_PRIO(i)]) + rq->prio_level[USER_PRIO(i)] = MAX_RT_PRIO; + search_prio = max(search_prio, + rq->prio_level[USER_PRIO(i)]); + } + } + if (unlikely(p->policy == SCHED_BATCH)) { + search_prio = max(search_prio, p->static_prio); + return SCHED_PRIO(find_next_zero_bit(p->bitmap, PRIO_RANGE, + USER_PRIO(search_prio))); + } + bitmap_or(tmp, p->bitmap, prio_matrix[uprio], PRIO_RANGE); + return SCHED_PRIO(find_next_zero_bit(tmp, PRIO_RANGE, + USER_PRIO(search_prio))); +} + +static void queue_expired(struct task_struct *p, struct rq *rq) +{ + task_new_array(p, rq, rq->expired); + p->prio = p->normal_prio = first_prio_slot(p); + if (p->static_prio < rq->expired->best_static_prio) + rq->expired->best_static_prio = p->static_prio; + reset_first_time_slice(p); } +#ifdef CONFIG_SMP /* - * __normal_prio - return the priority that is based on the static - * priority but is modified by bonuses/penalties. - * - * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] - * into the -5 ... 0 ... +5 bonus/penalty range. - * - * We use 25% of the full 0...39 priority range so that: - * - * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. - * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. - * - * Both properties are important to certain workloads. + * If we're waking up a task that was previously on a different runqueue, + * update its data appropriately. Note we may be reading data from src_rq-> + * outside of lock, but the occasional inaccurate result should be harmless. */ + static void update_if_moved(struct task_struct *p, struct rq *rq) +{ + struct rq *src_rq = p->array->rq; + + if (src_rq == rq) + return; + /* + * Only need to set p->array when p->rotation == rq->prio_rotation as + * they will be set in recalc_task_prio when != rq->prio_rotation. 
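Since PRIO_RANGE is 40, both the task's entitlement bitmap and one prio_matrix row fit in a single 64-bit word, so the heart of next_entitled_slot() is easy to model in userspace: OR the slots already used this rotation with the slots this nice level may never use, then take the first remaining zero bit at or above the search priority. The sketch below ignores the best_static_prio and prio_level bookkeeping and works in user priorities 0..39 (the kernel offsets these by MAX_RT_PRIO); all names are local to the example.

/* Minimal model of the bitmap search in next_entitled_slot().
 * A zero bit means "this dynamic priority slot is still available". */
#include <stdio.h>
#include <stdint.h>

#define PRIO_RANGE 40
#define EXPIRED    PRIO_RANGE           /* stands in for MAX_PRIO */

static int next_entitled_slot(uint64_t used, uint64_t matrix_row,
                              int search_prio)
{
        uint64_t busy = used | matrix_row;      /* bitmap_or() in the patch */
        int bit;

        for (bit = search_prio; bit < PRIO_RANGE; bit++)
                if (!(busy & (1ULL << bit)))    /* find_next_zero_bit() */
                        return bit;
        return EXPIRED;                 /* no entitled slot left */
}

int main(void)
{
        /* nice 0 style row: even slots closed, odd slots usable */
        uint64_t matrix_row = 0x5555555555ULL;
        uint64_t used = 0;
        int slot;

        /* consume one slot per expired quota, as the scheduler would */
        while ((slot = next_entitled_slot(used, matrix_row, 0)) < EXPIRED) {
                printf("task gets user priority slot %d\n", slot);
                used |= 1ULL << slot;
        }
        printf("all entitled slots used: task would go to the expired array\n");
        return 0;
}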
+ */ + if (p->rotation == src_rq->prio_rotation) { + p->rotation = rq->prio_rotation; + if (p->array == src_rq->expired) + p->array = rq->expired; + else + p->array = rq->active; + } else + p->rotation = 0; +} +#else +static inline void update_if_moved(struct task_struct *p, struct rq *rq) +{ +} +#endif -static inline int __normal_prio(struct task_struct *p) +static inline int isoprio_suitable(struct task_struct *p) { - int bonus, prio; + return !(p->flags & PF_ISOREF); +} - bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; +static int task_timeslice(struct task_struct *p); - prio = p->static_prio - bonus; - if (prio < MAX_RT_PRIO) - prio = MAX_RT_PRIO; - if (prio > MAX_PRIO-1) - prio = MAX_PRIO-1; - return prio; +/* + * recalc_task_prio determines what priority a non rt_task will be + * queued at. If the task has already been running during this runqueue's + * major rotation (rq->prio_rotation) then it continues at the same + * priority if it has tick entitlement left. If it does not have entitlement + * left, it finds the next priority slot according to its nice value that it + * has not extracted quota from. If it has not run during this major + * rotation, it starts at the next_entitled_slot and has its bitmap quota + * cleared. If it does not have any slots left it has all its slots reset and + * is queued on the expired at its first_prio_slot. + */ +static void recalc_task_prio(struct task_struct *p, struct rq *rq) +{ + struct prio_array *array = rq->active; + int queue_prio; + + if (iso_task(p)) { + if (isoprio_suitable(p)) { + /* + * If SCHED_ISO tasks have not used up their real time + * quota they have run just better than highest + * SCHED_NORMAL priority. Otherwise they run as + * SCHED_NORMAL. + */ + p->prio = p->normal_prio = ISO_PRIO; + p->array = rq->active; + if (p->time_slice <= 0) + p->time_slice = p->quota; + return; + } else if (p->prio == ISO_PRIO) { + /* Just about to be demoted to SCHED_NORMAL */ + p->time_slice = 0; + } + } else if (idleprio_task(p)) { + if (idleprio_suitable(p)) { + /* + * If suitable idleprio_tasks are queued at MAX_PRIO + * only on the idleprio array. Their time_slice is + * their full task_timeslice as they cooperatively + * multitask. + */ + p->prio = p->normal_prio = MAX_PRIO; + p->array = rq->idleprio; + if (p->time_slice <= 0) + p->time_slice = task_timeslice(p); + return; + } + /* + * If unsuitable idleprio_tasks are queued equivalent to + * nice 19 tasks on the expired array. + */ + p->flags &= ~PF_NONSLEEP; + p->prio = p->normal_prio = MAX_PRIO - 1; + p->array = rq->expired; + if (p->time_slice <= 0 || p->time_slice > p->quota) + p->time_slice = p->quota; + return; + } + + update_if_moved(p, rq); + if (p->rotation == rq->prio_rotation) { + if (p->array == array) { + if (p->time_slice > 0) + return; + p->time_slice = p->quota; + } else if (p->array == rq->expired) { + queue_expired(p, rq); + return; + } else + task_new_array(p, rq, array); + } else + task_new_array(p, rq, array); + + queue_prio = next_entitled_slot(p, rq); + if (queue_prio >= MAX_PRIO) { + queue_expired(p, rq); + return; + } + p->prio = p->normal_prio = queue_prio; + __set_bit(USER_PRIO(p->prio), p->bitmap); +} + +/* + * Adding to a runqueue. The dynamic priority queue that it is added to is + * determined by recalc_task_prio() above. 
+ */ +static inline void __enqueue_task(struct task_struct *p, struct rq *rq) +{ + if (rt_task(p)) + p->array = rq->active; + else + recalc_task_prio(p, rq); + + if (idleprio_task(p) && idleprio(p)) + rq->nr_idleprio++; + sched_info_queued(p); + set_dynamic_bit(p, rq); +} + +static void enqueue_task(struct task_struct *p, struct rq *rq) +{ + __enqueue_task(p, rq); + list_add_tail(&p->run_list, p->array->queue + p->prio); +} + +static inline void enqueue_task_head(struct task_struct *p, struct rq *rq) +{ + __enqueue_task(p, rq); + list_add(&p->run_list, p->array->queue + p->prio); +} + +/* + * requeue_task is only called when p->static_prio does not change. p->prio + * can change with dynamic tasks. + */ +static void requeue_task(struct task_struct *p, struct rq *rq, + struct prio_array *old_array, int old_prio) +{ + if (p->array == rq->expired) + queue_expired(p, rq); + list_move_tail(&p->run_list, p->array->queue + p->prio); + if (!rt_task(p)) { + if (list_empty(old_array->queue + old_prio)) + __clear_bit(old_prio, old_array->prio_bitmap); + set_dynamic_bit(p, rq); + } } /* @@ -786,20 +970,29 @@ */ /* - * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE - * If static_prio_timeslice() is ever changed to break this assumption then - * this code will need modification - */ -#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE -#define LOAD_WEIGHT(lp) \ - (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO) -#define PRIO_TO_LOAD_WEIGHT(prio) \ - LOAD_WEIGHT(static_prio_timeslice(prio)) -#define RTPRIO_TO_LOAD_WEIGHT(rp) \ - (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp)) + * task_timeslice - the total duration a task can run during one major + * rotation. Returns value in milliseconds as the smallest value can be 1. + */ +static int task_timeslice(struct task_struct *p) +{ + int slice = p->quota; /* quota is in us */ + + if (!rt_task(p)) + slice += (PRIO_RANGE - 1 - TASK_USER_PRIO(p)) * slice; + return US_TO_MS(slice); +} + +/* + * The load weight is basically the task_timeslice in ms. Realtime tasks are + * special cased to be proportionately larger than nice -20 by their + * rt_priority. The weight for rt tasks can only be arbitrary at best. + */ +#define RTPRIO_TO_LOAD_WEIGHT(rp) (rr_interval * 20 * (40 + rp)) static void set_load_weight(struct task_struct *p) { + int load_weight; + if (has_rt_policy(p)) { #ifdef CONFIG_SMP if (p == task_rq(p)->migration_thread) @@ -808,12 +1001,19 @@ * Giving its load any weight will skew balancing * adversely. */ - p->load_weight = 0; + load_weight = 0; else #endif - p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority); + load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority); } else - p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio); + load_weight = task_timeslice(p); + /* + * idleprio tasks have much lower weight than SCHED_NORMAL tasks but + * still need to be weighted to allow balancing to occur. + */ + if (likely(!idleprio_task(p))) + load_weight *= PRIO_RANGE; + p->load_weight = load_weight; } static inline void @@ -841,28 +1041,38 @@ } /* - * Calculate the expected normal priority: i.e. priority - * without taking RT-inheritance into account. Might be - * boosted by interactivity modifiers. Changes upon fork, - * setprio syscalls, and whenever the interactivity - * estimator recalculates. + * __activate_task - move a task to the runqueue. 
*/ -static inline int normal_prio(struct task_struct *p) +static inline void __activate_task(struct task_struct *p, struct rq *rq) { - int prio; + enqueue_task(p, rq); + inc_nr_running(p, rq); +} +/* + * __activate_idle_task - move idle task to the _front_ of runqueue. + */ +static inline void __activate_idle_task(struct task_struct *p, struct rq *rq) +{ + enqueue_task_head(p, rq); + inc_nr_running(p, rq); +} + +static inline int normal_prio(struct task_struct *p) +{ if (has_rt_policy(p)) - prio = MAX_RT_PRIO-1 - p->rt_priority; + return MAX_RT_PRIO-1 - p->rt_priority; + /* Other tasks all have normal_prio set in recalc_task_prio */ + if (likely(p->prio >= MAX_RT_PRIO && p->prio < MAX_PRIO)) + return p->prio; else - prio = __normal_prio(p); - return prio; + return p->static_prio; } /* * Calculate the current priority, i.e. the priority * taken into account by the scheduler. This value might - * be boosted by RT tasks, or might be boosted by - * interactivity modifiers. Will be RT if the task got + * be boosted by RT tasks as it will be RT if the task got * RT-boosted. If not then it returns p->normal_prio. */ static int effective_prio(struct task_struct *p) @@ -878,112 +1088,70 @@ return p->prio; } -/* - * __activate_task - move a task to the runqueue. - */ -static void __activate_task(struct task_struct *p, struct rq *rq) +static inline unsigned int nice_quota_ms(int nice) { - struct prio_array *target = rq->active; + unsigned int rr = rr_interval; - if (batch_task(p)) - target = rq->expired; - enqueue_task(p, target); - inc_nr_running(p, rq); + if (nice < -6) { + rr *= nice * nice; + rr /= 40; + } else if (nice > 0) + rr = rr / 2 ? : 1; + return rr; } +#define DEFAULT_WEIGHT (nice_quota_ms(0) * 20 * PRIO_RANGE) + /* - * __activate_idle_task - move idle task to the _front_ of runqueue. + * A runqueue laden with a single nice 0 task scores a weighted_cpuload of + * SCHED_LOAD_SCALE. This function returns 1 if any cpu is laden with a + * task of nice 0 or enough lower priority tasks to bring up the + * weighted_cpuload */ -static inline void __activate_idle_task(struct task_struct *p, struct rq *rq) +int above_background_load(void) { - enqueue_task_head(p, rq->active); - inc_nr_running(p, rq); + unsigned long cpu; + + for_each_online_cpu(cpu) { + if (weighted_cpuload(cpu) >= DEFAULT_WEIGHT) + return 1; + } + return 0; } /* - * Recalculate p->normal_prio and p->prio after having slept, - * updating the sleep-average too: + * All tasks have quotas based on rr_interval. RT tasks all get rr_interval. + * From nice 1 to 19 they are smaller than it only if they are at least one + * tick still. Below nice 0 they get progressively larger. + * ie nice -6..0 = rr_interval. nice -10 = 2.5 * rr_interval + * nice -20 = 10 * rr_interval. nice 1-19 = rr_interval / 2. + * Value returned is in microseconds. */ -static int recalc_task_prio(struct task_struct *p, unsigned long long now) +static inline unsigned int rr_quota(struct task_struct *p) { - /* Caller must always ensure 'now >= p->timestamp' */ - unsigned long sleep_time = now - p->timestamp; + unsigned int quota; - if (batch_task(p)) - sleep_time = 0; - - if (likely(sleep_time > 0)) { - /* - * This ceiling is set to the lowest priority that would allow - * a task to be reinserted into the active array on timeslice - * completion. - */ - unsigned long ceiling = INTERACTIVE_SLEEP(p); - - if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) { - /* - * Prevents user tasks from achieving best priority - * with one single large enough sleep. 
- */ - p->sleep_avg = ceiling; - /* - * Using INTERACTIVE_SLEEP() as a ceiling places a - * nice(0) task 1ms sleep away from promotion, and - * gives it 700ms to round-robin with no chance of - * being demoted. This is more than generous, so - * mark this sleep as non-interactive to prevent the - * on-runqueue bonus logic from intervening should - * this task not receive cpu immediately. - */ - p->sleep_type = SLEEP_NONINTERACTIVE; - } else { - /* - * Tasks waking from uninterruptible sleep are - * limited in their sleep_avg rise as they - * are likely to be waiting on I/O - */ - if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) { - if (p->sleep_avg >= ceiling) - sleep_time = 0; - else if (p->sleep_avg + sleep_time >= - ceiling) { - p->sleep_avg = ceiling; - sleep_time = 0; - } - } - - /* - * This code gives a bonus to interactive tasks. - * - * The boost works by updating the 'average sleep time' - * value here, based on ->timestamp. The more time a - * task spends sleeping, the higher the average gets - - * and the higher the priority boost gets as well. - */ - p->sleep_avg += sleep_time; - - } - if (p->sleep_avg > NS_MAX_SLEEP_AVG) - p->sleep_avg = NS_MAX_SLEEP_AVG; - } + if (rt_task(p)) + quota = rr_interval; + else + quota = nice_quota_ms(TASK_NICE(p)); + return MS_TO_US(quota); +} - return effective_prio(p); +/* Every time we set the quota we need to set the load weight */ +static void set_quota(struct task_struct *p) +{ + p->quota = rr_quota(p); + set_load_weight(p); } /* * activate_task - move a task to the runqueue and do priority recalculation - * - * Update all the scheduling statistics stuff. (sleep average - * calculation, priority modifiers, etc.) */ static void activate_task(struct task_struct *p, struct rq *rq, int local) { - unsigned long long now; - - if (rt_task(p)) - goto out; + unsigned long long now = sched_clock(); - now = sched_clock(); #ifdef CONFIG_SMP if (!local) { /* Compensate for drifting sched_clock */ @@ -1004,32 +1172,9 @@ (now - p->timestamp) >> 20); } - p->prio = recalc_task_prio(p, now); - - /* - * This checks to make sure it's not an uninterruptible task - * that is now waking up. - */ - if (p->sleep_type == SLEEP_NORMAL) { - /* - * Tasks which were woken up by interrupts (ie. hw events) - * are most likely of interactive nature. So we give them - * the credit of extending their sleep time to the period - * of time they spend on the runqueue, waiting for execution - * on a CPU, first time around: - */ - if (in_interrupt()) - p->sleep_type = SLEEP_INTERRUPTED; - else { - /* - * Normal first-time wakeups get a credit too for - * on-runqueue time, but it will be weighted down: - */ - p->sleep_type = SLEEP_INTERACTIVE; - } - } + set_quota(p); + p->prio = effective_prio(p); p->timestamp = now; -out: __activate_task(p, rq); } @@ -1039,8 +1184,7 @@ static void deactivate_task(struct task_struct *p, struct rq *rq) { dec_nr_running(p, rq); - dequeue_task(p, p->array); - p->array = NULL; + dequeue_task(p, rq); } /* @@ -1133,7 +1277,7 @@ * If the task is not on a runqueue (and not running), then * it is sufficient to simply update the task's cpu field. 
*/ - if (!p->array && !task_running(rq, p)) { + if (!task_queued(p) && !task_running(rq, p)) { set_task_cpu(p, dest_cpu); return 0; } @@ -1159,7 +1303,6 @@ { unsigned long flags; struct rq *rq; - struct prio_array *array; int running; repeat: @@ -1192,7 +1335,6 @@ */ rq = task_rq_lock(p, &flags); running = task_running(rq, p); - array = p->array; task_rq_unlock(rq, &flags); /* @@ -1215,7 +1357,7 @@ * running right now), it's preempted, and we should * yield - it could be a while. */ - if (unlikely(array)) { + if (unlikely(task_queued(p))) { yield(); goto repeat; } @@ -1294,6 +1436,25 @@ } /* + * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) + * Since cpu_power is a 'constant', we can use a reciprocal divide. + */ +static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load) +{ + return reciprocal_divide(load, sg->reciprocal_cpu_power); +} + +/* + * Each time a sched group cpu_power is changed, + * we must compute its reciprocal value + */ +static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val) +{ + sg->__cpu_power += val; + sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power); +} + +/* * find_idlest_group finds and returns the least busy CPU group within the * domain. */ @@ -1490,6 +1651,31 @@ } #endif +/* + * We need to have a special definition for an idle runqueue when testing + * for preemption on CONFIG_HOTPLUG_CPU as the idle task may be scheduled as + * a realtime task in sched_idle_next. + */ +#ifdef CONFIG_HOTPLUG_CPU +#define rq_idle(rq) ((rq)->curr == (rq)->idle && !rt_task((rq)->curr)) +#else +#define rq_idle(rq) ((rq)->curr == (rq)->idle) +#endif + +static inline int task_preempts_curr(struct task_struct *p, struct rq *rq) +{ + struct task_struct *curr = rq->curr; + + return ((p->array == task_rq(p)->active && + TASK_PREEMPTS_CURR(p, curr)) || rq_idle(rq)); +} + +static inline void try_preempt(struct task_struct *p, struct rq *rq) +{ + if (task_preempts_curr(p, rq)) + resched_task(rq->curr); +} + /*** * try_to_wake_up - wake up a thread * @p: the to-be-woken-up thread @@ -1521,7 +1707,7 @@ if (!(old_state & state)) goto out; - if (p->array) + if (task_queued(p)) goto out_running; cpu = task_cpu(p); @@ -1614,7 +1800,7 @@ old_state = p->state; if (!(old_state & state)) goto out; - if (p->array) + if (task_queued(p)) goto out_running; this_cpu = smp_processor_id(); @@ -1623,25 +1809,9 @@ out_activate: #endif /* CONFIG_SMP */ - if (old_state == TASK_UNINTERRUPTIBLE) { + if (old_state == TASK_UNINTERRUPTIBLE) rq->nr_uninterruptible--; - /* - * Tasks on involuntary sleep don't earn - * sleep_avg beyond just interactive state. - */ - p->sleep_type = SLEEP_NONINTERACTIVE; - } else - - /* - * Tasks that have marked their sleep as noninteractive get - * woken up with their sleep average not weighted in an - * interactive way. - */ - if (old_state & TASK_NONINTERACTIVE) - p->sleep_type = SLEEP_NONINTERACTIVE; - - activate_task(p, rq, cpu == this_cpu); /* * Sync wakeups (i.e. those types of wakeups where the waker * has indicated that it will leave the CPU in short order) @@ -1650,15 +1820,22 @@ * the waker guarantees that the freshly woken up task is going * to be considered on this CPU.) 
*/ - if (!sync || cpu != this_cpu) { - if (TASK_PREEMPTS_CURR(p, rq)) - resched_task(rq->curr); - } + activate_task(p, rq, cpu == this_cpu); + if (!sync || cpu != this_cpu) + try_preempt(p, rq); success = 1; out_running: p->state = TASK_RUNNING; out: + /* + * Special case when freezing we need to reschedule idleprio tasks + * as SCHED_NORMAL or else they'll never freeze + */ + if (idleprio_task(p) && freezing(p) && idleprio(p)) { + dequeue_task(p, rq); + enqueue_task(p, rq); + } task_rq_unlock(rq, &flags); return success; @@ -1676,7 +1853,6 @@ return try_to_wake_up(p, state, 0); } -static void task_running_tick(struct rq *rq, struct task_struct *p); /* * Perform scheduler related setup for a newly forked process p. * p is forked by current. @@ -1704,7 +1880,6 @@ p->prio = current->normal_prio; INIT_LIST_HEAD(&p->run_list); - p->array = NULL; #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) if (unlikely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); @@ -1716,30 +1891,31 @@ /* Want to start with kernel preemption disabled. */ task_thread_info(p)->preempt_count = 1; #endif + if (unlikely(p->policy == SCHED_FIFO)) + goto out; /* * Share the timeslice between parent and child, thus the * total amount of pending timeslices in the system doesn't change, * resulting in more scheduling fairness. */ local_irq_disable(); - p->time_slice = (current->time_slice + 1) >> 1; - /* - * The remainder of the first timeslice might be recovered by - * the parent if the child exits early enough. - */ - p->first_time_slice = 1; - current->time_slice >>= 1; - p->timestamp = sched_clock(); - if (unlikely(!current->time_slice)) { + if (current->time_slice > 0) { + current->time_slice /= 2; + if (current->time_slice) + p->time_slice = current->time_slice; + else + p->time_slice = 1; /* - * This case is rare, it happens when the parent has only - * a single jiffy left from its timeslice. Taking the - * runqueue lock is not a problem. + * The remainder of the first timeslice might be recovered by + * the parent if the child exits early enough. */ - current->time_slice = 1; - task_running_tick(cpu_rq(cpu), current); - } + p->first_time_slice = 1; + } else + p->time_slice = 0; + + p->timestamp = sched_clock(); local_irq_enable(); +out: put_cpu(); } @@ -1761,38 +1937,16 @@ this_cpu = smp_processor_id(); cpu = task_cpu(p); - /* - * We decrease the sleep average of forking parents - * and children as well, to keep max-interactive tasks - * from forking tasks that are max-interactive. The parent - * (current) is done further down, under its lock. - */ - p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * - CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); - - p->prio = effective_prio(p); - if (likely(cpu == this_cpu)) { + activate_task(p, rq, 1); if (!(clone_flags & CLONE_VM)) { /* * The VM isn't cloned, so we're in a good position to * do child-runs-first in anticipation of an exec. This * usually avoids a lot of COW overhead. 
*/ - if (unlikely(!current->array)) - __activate_task(p, rq); - else { - p->prio = current->prio; - p->normal_prio = current->normal_prio; - list_add_tail(&p->run_list, ¤t->run_list); - p->array = current->array; - p->array->nr_active++; - inc_nr_running(p, rq); - } set_need_resched(); - } else - /* Run child last */ - __activate_task(p, rq); + } /* * We skip the following code due to cpu == this_cpu * @@ -1809,19 +1963,16 @@ */ p->timestamp = (p->timestamp - this_rq->most_recent_timestamp) + rq->most_recent_timestamp; - __activate_task(p, rq); - if (TASK_PREEMPTS_CURR(p, rq)) - resched_task(rq->curr); + activate_task(p, rq, 0); + try_preempt(p, rq); /* * Parent and child are on different CPUs, now get the - * parent runqueue to update the parent's ->sleep_avg: + * parent runqueue to update the parent's ->flags: */ task_rq_unlock(rq, &flags); this_rq = task_rq_lock(current, &flags); } - current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * - PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); task_rq_unlock(this_rq, &flags); } @@ -1836,23 +1987,17 @@ */ void fastcall sched_exit(struct task_struct *p) { + struct task_struct *parent; unsigned long flags; struct rq *rq; - /* - * If the child was a (relative-) CPU hog then decrease - * the sleep_avg of the parent as well. - */ - rq = task_rq_lock(p->parent, &flags); - if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) { - p->parent->time_slice += p->time_slice; - if (unlikely(p->parent->time_slice > task_timeslice(p))) - p->parent->time_slice = task_timeslice(p); - } - if (p->sleep_avg < p->parent->sleep_avg) - p->parent->sleep_avg = p->parent->sleep_avg / - (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg / - (EXIT_WEIGHT + 1); + parent = p->parent; + rq = task_rq_lock(parent, &flags); + if (p->first_time_slice > 0 && task_cpu(p) == task_cpu(parent)) { + parent->time_slice += p->time_slice; + if (unlikely(parent->time_slice > parent->quota)) + parent->time_slice = parent->quota; + } task_rq_unlock(rq, &flags); } @@ -2184,23 +2329,17 @@ * pull_task - move a task from a remote runqueue to the local runqueue. * Both runqueues must be locked. */ -static void pull_task(struct rq *src_rq, struct prio_array *src_array, - struct task_struct *p, struct rq *this_rq, - struct prio_array *this_array, int this_cpu) +static void pull_task(struct rq *src_rq, struct task_struct *p, + struct rq *this_rq, int this_cpu) { - dequeue_task(p, src_array); + dequeue_task(p, src_rq); dec_nr_running(p, src_rq); set_task_cpu(p, this_cpu); inc_nr_running(p, this_rq); - enqueue_task(p, this_array); + enqueue_task(p, this_rq); p->timestamp = (p->timestamp - src_rq->most_recent_timestamp) + this_rq->most_recent_timestamp; - /* - * Note that idle threads have a prio of MAX_PRIO, for this test - * to be always true for them. 
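The fork and exit paths above keep the total outstanding timeslice constant: sched_fork() hands the child half of the parent's remaining slice rather than minting new entitlement, and sched_exit() returns a never-used first slice to the parent, capped at the parent's quota. A toy model of that bookkeeping (invented numbers; the per-CPU and SCHED_FIFO special cases are left out):

/* Toy model of the time_slice handling in sched_fork()/sched_exit(). */
#include <stdio.h>

struct toy_task {
        int time_slice;                 /* us remaining, as in the patch */
        int quota;                      /* per-priority entitlement, us */
        int first_time_slice;
};

static void toy_fork(struct toy_task *parent, struct toy_task *child)
{
        child->quota = parent->quota;
        child->first_time_slice = 0;
        if (parent->time_slice > 0) {
                parent->time_slice /= 2;
                child->time_slice =
                        parent->time_slice ? parent->time_slice : 1;
                child->first_time_slice = 1;    /* may be handed back */
        } else
                child->time_slice = 0;
}

static void toy_exit(struct toy_task *child, struct toy_task *parent)
{
        if (child->first_time_slice > 0) {
                parent->time_slice += child->time_slice;
                if (parent->time_slice > parent->quota)
                        parent->time_slice = parent->quota;
        }
}

int main(void)
{
        struct toy_task parent = { .time_slice = 6000, .quota = 6000 };
        struct toy_task child;

        toy_fork(&parent, &child);
        printf("after fork: parent %d us, child %d us\n",
               parent.time_slice, child.time_slice);
        toy_exit(&child, &parent);      /* child exits without running */
        printf("after early child exit: parent %d us\n", parent.time_slice);
        return 0;
}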
- */ - if (TASK_PREEMPTS_CURR(p, this_rq)) - resched_task(this_rq->curr); + try_preempt(p, this_rq); } /* @@ -2243,7 +2382,16 @@ return 1; } -#define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio) +static inline int rq_best_prio(struct rq *rq) +{ + int best_prio, exp_prio; + + best_prio = sched_find_first_bit(rq->dyn_bitmap); + exp_prio = find_next_bit(rq->exp_bitmap, MAX_PRIO, MAX_RT_PRIO); + if (unlikely(best_prio > exp_prio)) + best_prio = exp_prio; + return best_prio; +} /* * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted @@ -2259,7 +2407,7 @@ { int idx, pulled = 0, pinned = 0, this_best_prio, best_prio, best_prio_seen, skip_for_load; - struct prio_array *array, *dst_array; + struct prio_array *array; struct list_head *head, *curr; struct task_struct *tmp; long rem_load_move; @@ -2286,31 +2434,29 @@ * be cache-cold, thus switching CPUs has the least effect * on them. */ - if (busiest->expired->nr_active) { - array = busiest->expired; - dst_array = this_rq->expired; - } else { - array = busiest->active; - dst_array = this_rq->active; - } - + array = busiest->expired; new_array: - /* Start searching at priority 0: */ - idx = 0; + /* Expired arrays don't have RT tasks so they're always MAX_RT_PRIO+ */ + if (array == busiest->expired) + idx = MAX_RT_PRIO; + else + idx = 0; skip_bitmap: if (!idx) - idx = sched_find_first_bit(array->bitmap); + idx = sched_find_first_bit(array->prio_bitmap); else - idx = find_next_bit(array->bitmap, MAX_PRIO, idx); - if (idx >= MAX_PRIO) { - if (array == busiest->expired && busiest->active->nr_active) { + idx = find_next_bit(array->prio_bitmap, MAX_PRIO, idx); + if (idx == MAX_PRIO) { + if (array == busiest->idleprio && busiest->nr_idleprio) + goto found_idleprio; + if (array == busiest->expired) { array = busiest->active; - dst_array = this_rq->active; goto new_array; } goto out; } +found_idleprio: head = array->queue + idx; curr = head->prev; skip_queue: @@ -2332,11 +2478,22 @@ best_prio_seen |= idx == best_prio; if (curr != head) goto skip_queue; + if (idx == MAX_PRIO) { + /* + * Occurs either when balancing idleprio tasks or + * there really are no more tasks to find. + */ + if (array == busiest->expired) { + array = busiest->active; + goto new_array; + } + goto out; + } idx++; goto skip_bitmap; } - pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); + pull_task(busiest, tmp, this_rq, this_cpu); pulled++; rem_load_move -= tmp->load_weight; @@ -2349,6 +2506,13 @@ this_best_prio = idx; if (curr != head) goto skip_queue; + if (idx == MAX_PRIO) { + if (array == busiest->expired) { + array = busiest->active; + goto new_array; + } + goto out; + } idx++; goto skip_bitmap; } @@ -3297,11 +3461,36 @@ /* * This is called on clock ticks and on context switches. * Bank in p->sched_time the ns elapsed since the last tick or switch. + * CPU scheduler quota accounting is also performed here in microseconds. + * The value returned from sched_clock() occasionally gives bogus values so + * some sanity checking is required. */ -static inline void -update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now) +static void +update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now, + int tick) { - p->sched_time += now - p->last_ran; + long time_diff = now - p->last_ran; + + if (tick) { + /* + * Called from scheduler_tick() there should be less than two + * jiffies worth, and not negative/overflow. 
+ */ + if (time_diff > JIFFIES_TO_NS(2) || time_diff < 0) + time_diff = JIFFIES_TO_NS(1); + } else { + /* + * Called from context_switch there should be less than one + * jiffy worth, and not negative/overflow. There should be + * some time banked here so use a nominal 1us. + */ + if (time_diff > JIFFIES_TO_NS(1) || time_diff < 1) + time_diff = 1000; + } + /* time_slice accounting is done in usecs to avoid overflow on 32bit */ + if (p != rq->idle && p->policy != SCHED_FIFO) + p->time_slice -= time_diff / 1000; + p->sched_time += time_diff; p->last_ran = rq->most_recent_timestamp = now; } @@ -3322,27 +3511,6 @@ } /* - * We place interactive tasks back into the active array, if possible. - * - * To guarantee that this does not starve expired tasks we ignore the - * interactivity of a task if the first expired task had to wait more - * than a 'reasonable' amount of time. This deadline timeout is - * load-dependent, as the frequency of array switched decreases with - * increasing number of running tasks. We also ignore the interactivity - * if a better static_prio task has expired: - */ -static inline int expired_starving(struct rq *rq) -{ - if (rq->curr->static_prio > rq->best_expired_prio) - return 1; - if (!STARVATION_LIMIT || !rq->expired_timestamp) - return 0; - if (jiffies - rq->expired_timestamp > STARVATION_LIMIT * rq->nr_running) - return 1; - return 0; -} - -/* * Account user cpu time to a process. * @p: the process that the cpu time gets accounted to * @hardirq_offset: the offset to subtract from hardirq_count() @@ -3357,7 +3525,7 @@ /* Add user time to cpustat. */ tmp = cputime_to_cputime64(cputime); - if (TASK_NICE(p) > 0) + if (TASK_NICE(p) > 0 || idleprio_task(p)) cpustat->nice = cputime64_add(cpustat->nice, tmp); else cpustat->user = cputime64_add(cpustat->user, tmp); @@ -3415,87 +3583,94 @@ cpustat->steal = cputime64_add(cpustat->steal, tmp); } -static void task_running_tick(struct rq *rq, struct task_struct *p) +/* + * The task has used up its quota of running in this prio_level so it must be + * dropped a priority level, all managed by recalc_task_prio(). + */ +static void task_expired_entitlement(struct rq *rq, struct task_struct *p) { - if (p->array != rq->active) { - /* Task has expired but was not scheduled yet */ - set_tsk_need_resched(p); + int overrun; + + reset_first_time_slice(p); + if (rt_task(p)) { + p->time_slice += p->quota; + list_move_tail(&p->run_list, p->array->queue + p->prio); return; } - spin_lock(&rq->lock); + overrun = p->time_slice; + dequeue_task(p, rq); + enqueue_task(p, rq); /* - * The task was running during this tick - update the - * time slice counter. Note: we do not update a thread's - * priority until it either goes to sleep or uses up its - * timeslice. This makes it possible for interactive tasks - * to use up their timeslices at their highest priority levels. + * Subtract any extra time this task ran over its time_slice; ie + * overrun will either be 0 or negative. */ - if (rt_task(p)) { - /* - * RR tasks need a special form of timeslice management. - * FIFO tasks have no timeslices. - */ - if ((p->policy == SCHED_RR) && !--p->time_slice) { - p->time_slice = task_timeslice(p); - p->first_time_slice = 0; - set_tsk_need_resched(p); + p->time_slice += overrun; +} - /* put it at the end of the queue: */ - requeue_task(p, rq->active); - } - goto out_unlock; +/* + * Test if SCHED_ISO tasks have run longer than their alloted period as RT + * tasks and set the refractory flag if necessary. There is 10% hysteresis + * for unsetting the flag. 
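Two details above are worth spelling out: time_slice is accounted in microseconds (time_diff is in nanoseconds, hence the division by 1000), and when a task is found to have overrun its slice, task_expired_entitlement() carries the negative remainder into the freshly refilled slice so the overrun is paid back at the next priority level. A toy calculation with invented numbers:

/* Toy model of the overrun carry in task_expired_entitlement() above. */
#include <stdio.h>

int main(void)
{
        int quota = 6000;               /* us entitlement per priority level */
        int time_slice = 250;           /* us left before the last tick */
        int tick_usecs = 1000;          /* CPU charged by update_cpu_clock() */
        int overrun;

        time_slice -= tick_usecs;       /* now -750 us: quota overrun */

        /* requeueing refills the slice to the full quota via
         * recalc_task_prio(), then the negative overrun is added back */
        overrun = time_slice;
        time_slice = quota;
        time_slice += overrun;

        printf("slice at the next priority level: %d us (quota %d, overrun %d)\n",
               time_slice, quota, overrun);
        return 0;
}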
+ */ +static unsigned int test_ret_isorefractory(struct rq *rq) +{ + if (likely(!rq->iso_refractory)) { + if (rq->iso_ticks / ISO_PERIOD > sched_iso_cpu) + rq->iso_refractory = 1; + } else { + if (rq->iso_ticks / ISO_PERIOD < (sched_iso_cpu * 90 / 100)) + rq->iso_refractory = 0; } - if (!--p->time_slice) { - dequeue_task(p, rq->active); - set_tsk_need_resched(p); - p->prio = effective_prio(p); - p->time_slice = task_timeslice(p); - p->first_time_slice = 0; + return rq->iso_refractory; +} - if (!rq->expired_timestamp) - rq->expired_timestamp = jiffies; - if (!TASK_INTERACTIVE(p) || expired_starving(rq)) { - enqueue_task(p, rq->expired); - if (p->static_prio < rq->best_expired_prio) - rq->best_expired_prio = p->static_prio; - } else - enqueue_task(p, rq->active); - } else { - /* - * Prevent a too long timeslice allowing a task to monopolize - * the CPU. We do this by splitting up the timeslice into - * smaller pieces. - * - * Note: this does not mean the task's timeslices expire or - * get lost in any way, they just might be preempted by - * another task of equal priority. (one with higher - * priority would have preempted this task already.) We - * requeue this task to the end of the list on this priority - * level, which is in essence a round-robin of tasks with - * equal priority. - * - * This only applies to tasks in the interactive - * delta range with at least TIMESLICE_GRANULARITY to requeue. - */ - if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - - p->time_slice) % TIMESLICE_GRANULARITY(p)) && - (p->time_slice >= TIMESLICE_GRANULARITY(p)) && - (p->array == rq->active)) { +/* No SCHED_ISO task was running so decrease rq->iso_ticks */ +static inline void no_iso_tick(struct rq *rq) +{ + rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - 1) / ISO_PERIOD; +} - requeue_task(p, rq->active); - set_tsk_need_resched(p); - } +/* This manages tasks that have run out of timeslice during a scheduler_tick */ +static void task_running_tick(struct rq *rq, struct task_struct *p) +{ + /* + * If a SCHED_ISO task is running we increment the iso_ticks. In + * order to prevent SCHED_ISO tasks from causing starvation in the + * presence of true RT tasks we account those as iso_ticks as well. + */ + if ((rt_task(p) || (iso_task(p) && !rq->iso_refractory))) { + if (rq->iso_ticks <= (ISO_PERIOD * 100) - 100) + rq->iso_ticks += 100; + } else + no_iso_tick(rq); + + if (iso_task(p)) { + if (unlikely(test_ret_isorefractory(rq))) { + if (isoprio_suitable(p)) { + /* + * SCHED_ISO task is running as RT and limit + * has been hit. Set the PF_ISOREF flag and + * force it to reschedule as SCHED_NORMAL + * by zeroing its time_slice + */ + p->flags |= PF_ISOREF; + p->time_slice = 0; + } + } else + p->flags &= ~PF_ISOREF; } -out_unlock: - spin_unlock(&rq->lock); + /* SCHED_FIFO tasks never run out of timeslice. */ + if (p->time_slice > 0 || p->policy == SCHED_FIFO) + return; + /* p->time_slice <= 0 */ + set_tsk_need_resched(p); + if (likely(task_queued(p))) + task_expired_entitlement(rq, p); } /* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. - * - * It also gets called by the fork code, when changing the parent's - * timeslices. 
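The SCHED_ISO throttle above behaves like a decaying bucket: every tick spent running an ISO (or RT) task adds 100 to iso_ticks, other ticks decay it by (ISO_PERIOD - 1)/ISO_PERIOD, and the refractory flag switches on once the average exceeds sched_iso_cpu and off again below 90% of it. The simulation below, for one runqueue with a single fully CPU-bound SCHED_ISO task, assumes HZ=1000 and the default sysctls (80% over 5 seconds); it shows the task being throttled to SCHED_NORMAL in bursts once it has used its allowance.

/* Simulate iso_ticks/iso_refractory from the hunks above for a single
 * CPU-bound SCHED_ISO task.  HZ and the sysctl defaults are assumptions. */
#include <stdio.h>

#define HZ 1000
static int sched_iso_cpu = 80;
static int sched_iso_period = 5;
#define ISO_PERIOD ((sched_iso_period * HZ) + 1)

static unsigned long iso_ticks;
static int iso_refractory;

static void test_ret_isorefractory(void)
{
        if (!iso_refractory) {
                if (iso_ticks / ISO_PERIOD > (unsigned long)sched_iso_cpu)
                        iso_refractory = 1;
        } else {
                if (iso_ticks / ISO_PERIOD <
                    (unsigned long)(sched_iso_cpu * 90 / 100))
                        iso_refractory = 0;
        }
}

int main(void)
{
        unsigned long tick;
        int prev = 0;

        for (tick = 1; tick <= 30UL * HZ; tick++) {
                if (!iso_refractory) {
                        /* ISO task ran as (pseudo) realtime: charge it */
                        if (iso_ticks <= (unsigned long)ISO_PERIOD * 100 - 100)
                                iso_ticks += 100;
                } else {
                        /* throttled to SCHED_NORMAL: decay instead */
                        iso_ticks = iso_ticks * (ISO_PERIOD - 1) / ISO_PERIOD;
                }
                test_ret_isorefractory();
                if (iso_refractory != prev) {
                        printf("t=%5.2fs refractory -> %d (avg %lu%%)\n",
                               (double)tick / HZ, iso_refractory,
                               iso_ticks / ISO_PERIOD);
                        prev = iso_refractory;
                }
        }
        return 0;
}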
*/ void scheduler_tick(void) { @@ -3505,10 +3680,14 @@ int idle_at_tick = idle_cpu(cpu); struct rq *rq = cpu_rq(cpu); - update_cpu_clock(p, rq, now); + update_cpu_clock(p, rq, now, 1); + spin_lock(&rq->lock); if (!idle_at_tick) task_running_tick(rq, p); + else + no_iso_tick(rq); + spin_unlock(&rq->lock); #ifdef CONFIG_SMP update_load(rq); rq->idle_at_tick = idle_at_tick; @@ -3554,10 +3733,80 @@ #endif -static inline int interactive_sleep(enum sleep_type sleep_type) +static void reset_prio_levels(struct rq *rq) { - return (sleep_type == SLEEP_INTERACTIVE || - sleep_type == SLEEP_INTERRUPTED); + rq->active->best_static_prio = MAX_PRIO - 1; + rq->expired->best_static_prio = MAX_PRIO - 1; + memset(rq->prio_level, 0, sizeof(int) * PRIO_RANGE); +} + +/* + * Only tasks running are SCHED_IDLEPRIO. Set the active array to the + * idleprio array and if it isn't already active + */ +static struct task_struct *next_idleprio_task(struct rq *rq) +{ + struct prio_array *array = rq->active; + struct list_head *queue; + + if (array != rq->idleprio) { + rq->active = rq->idleprio; + rq->expired = array; + array = rq->active; + rq->exp_bitmap = rq->expired->prio_bitmap; + rq->dyn_bitmap = rq->active->prio_bitmap; + } + rq->prio_rotation++; + reset_prio_levels(rq); + queue = array->queue + MAX_PRIO; + return list_entry(queue->next, struct task_struct, run_list); +} + +/* + * next_dynamic_task finds the next suitable dynamic task. + */ +static inline struct task_struct *next_dynamic_task(struct rq *rq, int idx) +{ + struct prio_array *array = rq->active; + struct task_struct *next; + struct list_head *queue; + int nstatic; + +retry: + if (unlikely(rq->nr_running == rq->nr_idleprio)) + return next_idleprio_task(rq); + if (idx >= MAX_PRIO) { + /* There are no more tasks in the active array. Swap arrays */ + array = rq->expired; + rq->expired = rq->active; + rq->active = array; + rq->exp_bitmap = rq->expired->prio_bitmap; + rq->dyn_bitmap = rq->active->prio_bitmap; + rq->prio_rotation++; + idx = find_next_bit(rq->dyn_bitmap, MAX_PRIO, MAX_RT_PRIO); + reset_prio_levels(rq); + } + queue = array->queue + idx; + next = list_entry(queue->next, struct task_struct, run_list); + if (unlikely(next->time_slice <= 0 && !(iso_task(next) && + isoprio_suitable(next)))) { + /* + * Unlucky enough that this task ran out of time_slice + * before it hit a scheduler_tick so it should have its + * priority reassessed and choose another task (possibly + * the same one) + */ + task_expired_entitlement(rq, next); + idx = find_next_bit(rq->dyn_bitmap, MAX_PRIO, MAX_RT_PRIO); + goto retry; + } + next->rotation = rq->prio_rotation; + nstatic = next->static_prio; + if (nstatic < array->best_static_prio) + array->best_static_prio = nstatic; + if (idx > rq->prio_level[USER_PRIO(nstatic)]) + rq->prio_level[USER_PRIO(nstatic)] = idx; + return next; } /* @@ -3566,13 +3815,11 @@ asmlinkage void __sched schedule(void) { struct task_struct *prev, *next; - struct prio_array *array; struct list_head *queue; unsigned long long now; - unsigned long run_time; - int cpu, idx, new_prio; long *switch_count; struct rq *rq; + int cpu, idx; /* * Test if we are atomic. 
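The O(1) pick in next_dynamic_task() above reduces to: take the first set bit of the active bitmap, and when the active array turns out to be empty, swap the two array pointers, bump prio_rotation and reset the per-nice priority levels. A stripped-down model of just the swap, with a single 64-bit word standing in for each prio_bitmap and the task lists left out:

/* Stripped-down model of the active/expired swap in next_dynamic_task().
 * One set bit = "some task is queued at this priority level". */
#include <stdio.h>
#include <stdint.h>

struct toy_rq {
        uint64_t *active, *expired;
        uint64_t arrays[2];
        unsigned long prio_rotation;
};

static int first_bit(uint64_t map)
{
        int i;

        for (i = 0; i < 64; i++)
                if (map & (1ULL << i))
                        return i;
        return 64;                      /* bitmap empty */
}

static int next_prio(struct toy_rq *rq)
{
        int idx = first_bit(*rq->active);

        if (idx == 64) {                /* active exhausted: swap arrays */
                uint64_t *tmp = rq->active;

                rq->active = rq->expired;
                rq->expired = tmp;
                rq->prio_rotation++;    /* a new major rotation begins */
                idx = first_bit(*rq->active);
        }
        return idx;
}

int main(void)
{
        struct toy_rq rq = { .prio_rotation = 1 };
        int prio;

        rq.active = &rq.arrays[0];
        rq.expired = &rq.arrays[1];
        *rq.expired = 1ULL << 7;        /* one expired task at priority 7 */

        prio = next_prio(&rq);
        printf("picked priority %d, now in rotation %lu\n",
               prio, rq.prio_rotation);
        return 0;
}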
Since do_exit() needs to call into @@ -3608,18 +3855,6 @@ schedstat_inc(rq, sched_cnt); now = sched_clock(); - if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) { - run_time = now - prev->timestamp; - if (unlikely((long long)(now - prev->timestamp) < 0)) - run_time = 0; - } else - run_time = NS_MAX_SLEEP_AVG; - - /* - * Tasks charged proportionately less run_time at high sleep_avg to - * delay them losing their interactive status - */ - run_time /= (CURRENT_BONUS(prev) ? : 1); spin_lock_irq(&rq->lock); @@ -3630,8 +3865,10 @@ unlikely(signal_pending(prev)))) prev->state = TASK_RUNNING; else { - if (prev->state == TASK_UNINTERRUPTIBLE) + if (prev->state == TASK_UNINTERRUPTIBLE) { + prev->flags |= PF_NONSLEEP; rq->nr_uninterruptible++; + } deactivate_task(prev, rq); } } @@ -3641,59 +3878,29 @@ idle_balance(cpu, rq); if (!rq->nr_running) { next = rq->idle; - rq->expired_timestamp = 0; goto switch_tasks; } } - array = rq->active; - if (unlikely(!array->nr_active)) { - /* - * Switch the active and expired arrays. - */ - schedstat_inc(rq, sched_switch); - rq->active = rq->expired; - rq->expired = array; - array = rq->active; - rq->expired_timestamp = 0; - rq->best_expired_prio = MAX_PRIO; - } - - idx = sched_find_first_bit(array->bitmap); - queue = array->queue + idx; - next = list_entry(queue->next, struct task_struct, run_list); - - if (!rt_task(next) && interactive_sleep(next->sleep_type)) { - unsigned long long delta = now - next->timestamp; - if (unlikely((long long)(now - next->timestamp) < 0)) - delta = 0; - - if (next->sleep_type == SLEEP_INTERACTIVE) - delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; - - array = next->array; - new_prio = recalc_task_prio(next, next->timestamp + delta); - - if (unlikely(next->prio != new_prio)) { - dequeue_task(next, array); - next->prio = new_prio; - enqueue_task(next, array); - } + idx = sched_find_first_bit(rq->dyn_bitmap); + if (likely(idx > ISO_PRIO)) + next = next_dynamic_task(rq, idx); + else { + queue = rq->active->queue + idx; + next = list_entry(queue->next, struct task_struct, run_list); } - next->sleep_type = SLEEP_NORMAL; switch_tasks: - if (next == rq->idle) + if (next == rq->idle) { + reset_prio_levels(rq); + rq->prio_rotation++; schedstat_inc(rq, sched_goidle); + } prefetch(next); prefetch_stack(next); clear_tsk_need_resched(prev); rcu_qsctr_inc(task_cpu(prev)); - update_cpu_clock(prev, rq, now); - - prev->sleep_avg -= run_time; - if ((long)prev->sleep_avg <= 0) - prev->sleep_avg = 0; + update_cpu_clock(prev, rq, now, 0); prev->timestamp = prev->last_ran = now; sched_info_switch(prev, next); @@ -4129,29 +4336,22 @@ */ void rt_mutex_setprio(struct task_struct *p, int prio) { - struct prio_array *array; unsigned long flags; + int queued, oldprio; struct rq *rq; - int oldprio; BUG_ON(prio < 0 || prio > MAX_PRIO); rq = task_rq_lock(p, &flags); oldprio = p->prio; - array = p->array; - if (array) - dequeue_task(p, array); + queued = task_queued(p); + if (queued) + dequeue_task(p, rq); p->prio = prio; - if (array) { - /* - * If changing to an RT priority then queue it - * in the active array! 
- */ - if (rt_task(p)) - array = rq->active; - enqueue_task(p, array); + if (queued) { + enqueue_task(p, rq); /* * Reschedule if we are currently running on this runqueue and * our priority decreased, or if we are not currently running on @@ -4160,8 +4360,8 @@ if (task_running(rq, p)) { if (p->prio > oldprio) resched_task(rq->curr); - } else if (TASK_PREEMPTS_CURR(p, rq)) - resched_task(rq->curr); + } else + try_preempt(p, rq); } task_rq_unlock(rq, &flags); } @@ -4170,8 +4370,7 @@ void set_user_nice(struct task_struct *p, long nice) { - struct prio_array *array; - int old_prio, delta; + int queued, old_prio,delta; unsigned long flags; struct rq *rq; @@ -4192,26 +4391,27 @@ p->static_prio = NICE_TO_PRIO(nice); goto out_unlock; } - array = p->array; - if (array) { - dequeue_task(p, array); + queued = task_queued(p); + if (queued) { + dequeue_task(p, rq); dec_raw_weighted_load(rq, p); } p->static_prio = NICE_TO_PRIO(nice); - set_load_weight(p); old_prio = p->prio; p->prio = effective_prio(p); + set_quota(p); delta = p->prio - old_prio; - if (array) { - enqueue_task(p, array); + if (queued) { + enqueue_task(p, rq); inc_raw_weighted_load(rq, p); /* * If the task increased its priority or is running and * lowered its priority, then reschedule its CPU: */ - if (delta < 0 || (delta > 0 && task_running(rq, p))) + if (delta < 0 || ((delta > 0 || idleprio_task(p)) && + task_running(rq, p))) resched_task(rq->curr); } out_unlock: @@ -4281,11 +4481,23 @@ * * This is the priority value as seen by users in /proc. * RT tasks are offset by -200. Normal tasks are centered - * around 0, value goes from -16 to +15. + * around 1, value goes from 0 to +79. Values higher than + * 39 indicate task is on the expired array. This is done + * lockless and may rarely return an active instead of + * expired value. */ -int task_prio(const struct task_struct *p) +int task_prio(struct task_struct *p) { - return p->prio - MAX_RT_PRIO; + int prio = p->prio - MAX_RT_PRIO; + + if (task_queued(p)) { + struct rq *rq = task_rq(p); + struct prio_array *array = p->array; + + if (rq && rq->expired == array) + prio += PRIO_RANGE; + } + return prio; } /** @@ -4328,19 +4540,14 @@ /* Actually do priority change: must hold rq lock. */ static void __setscheduler(struct task_struct *p, int policy, int prio) { - BUG_ON(p->array); + BUG_ON(task_queued(p)); p->policy = policy; p->rt_priority = prio; p->normal_prio = normal_prio(p); /* we are holding p->pi_lock already */ p->prio = rt_mutex_getprio(p); - /* - * SCHED_BATCH tasks are treated as perpetual CPU hogs: - */ - if (policy == SCHED_BATCH) - p->sleep_avg = 0; - set_load_weight(p); + set_quota(p); } /** @@ -4354,19 +4561,36 @@ int sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param) { - int retval, oldprio, oldpolicy = -1; - struct prio_array *array; + struct sched_param zero_param = { .sched_priority = 0 }; + int queued, retval, oldprio, oldpolicy = -1; + unsigned long rlim_rtprio = 0; unsigned long flags; struct rq *rq; /* may grab non-irq protected spin_locks */ BUG_ON(in_interrupt()); + if (is_rt_policy(policy) && !capable(CAP_SYS_NICE)) { + unsigned long lflags; + + if (!lock_task_sighand(p, &lflags)) + return -ESRCH; + rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur; + unlock_task_sighand(p, &lflags); + if (rlim_rtprio) + goto recheck; + /* + * If the caller requested an RT policy without having the + * necessary rights, we downgrade the policy to SCHED_ISO. + * We also set the parameter to zero to pass the checks. 
+ */ + policy = SCHED_ISO; + param = &zero_param; + } recheck: /* double check policy once rq lock held */ if (policy < 0) policy = oldpolicy = p->policy; - else if (policy != SCHED_FIFO && policy != SCHED_RR && - policy != SCHED_NORMAL && policy != SCHED_BATCH) + else if (!SCHED_RANGE(policy)) return -EINVAL; /* * Valid priorities for SCHED_FIFO and SCHED_RR are @@ -4385,14 +4609,6 @@ */ if (!capable(CAP_SYS_NICE)) { if (is_rt_policy(policy)) { - unsigned long rlim_rtprio; - unsigned long flags; - - if (!lock_task_sighand(p, &flags)) - return -ESRCH; - rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur; - unlock_task_sighand(p, &flags); - /* can't set/change the rt policy */ if (policy != p->policy && !rlim_rtprio) return -EPERM; @@ -4401,6 +4617,31 @@ if (param->sched_priority > p->rt_priority && param->sched_priority > rlim_rtprio) return -EPERM; + } else { + switch (p->policy) { + /* + * Can only downgrade policies but not back to + * SCHED_NORMAL + */ + case SCHED_ISO: + if (policy == SCHED_ISO) + goto out; + if (policy == SCHED_NORMAL) + return -EPERM; + break; + case SCHED_BATCH: + if (policy == SCHED_BATCH) + goto out; + if (policy != SCHED_IDLEPRIO) + return -EPERM; + break; + case SCHED_IDLEPRIO: + if (policy == SCHED_IDLEPRIO) + goto out; + return -EPERM; + default: + break; + } } /* can't change other user's priorities */ @@ -4409,6 +4650,11 @@ return -EPERM; } + if (!(p->mm) && policy == SCHED_IDLEPRIO) { + /* Don't allow kernel threads to be SCHED_IDLEPRIO. */ + return -EINVAL; + } + retval = security_task_setscheduler(p, policy, param); if (retval) return retval; @@ -4429,12 +4675,12 @@ spin_unlock_irqrestore(&p->pi_lock, flags); goto recheck; } - array = p->array; - if (array) + queued = task_queued(p); + if (queued) deactivate_task(p, rq); oldprio = p->prio; __setscheduler(p, policy, param->sched_priority); - if (array) { + if (queued) { __activate_task(p, rq); /* * Reschedule if we are currently running on this runqueue and @@ -4444,14 +4690,15 @@ if (task_running(rq, p)) { if (p->prio > oldprio) resched_task(rq->curr); - } else if (TASK_PREEMPTS_CURR(p, rq)) - resched_task(rq->curr); + } else + try_preempt(p, rq); } __task_rq_unlock(rq); spin_unlock_irqrestore(&p->pi_lock, flags); rt_mutex_adjust_pi(p); +out: return 0; } EXPORT_SYMBOL_GPL(sched_setscheduler); @@ -4718,41 +4965,34 @@ * sys_sched_yield - yield the current processor to other threads. * * This function yields the current CPU by moving the calling thread - * to the expired array. If there are no other threads running on this - * CPU then this function will return. + * to the expired array if SCHED_NORMAL or the end of its current priority + * queue if a realtime task. If there are no other threads running on this + * cpu this function will return. */ asmlinkage long sys_sched_yield(void) { struct rq *rq = this_rq_lock(); - struct prio_array *array = current->array, *target = rq->expired; + struct task_struct *p = current; schedstat_inc(rq, yld_cnt); - /* - * We implement yielding by moving the task into the expired - * queue. - * - * (special rule: RT tasks will just roundrobin in the active - * array.) 
- */ - if (rt_task(current)) - target = rq->active; - - if (array->nr_active == 1) { - schedstat_inc(rq, yld_act_empty); - if (!rq->expired->nr_active) - schedstat_inc(rq, yld_both_empty); - } else if (!rq->expired->nr_active) - schedstat_inc(rq, yld_exp_empty); - - if (array != target) { - dequeue_task(current, array); - enqueue_task(current, target); - } else - /* - * requeue_task is cheaper so perform that if possible. - */ - requeue_task(current, array); + if (rq->nr_running == 1) + schedstat_inc(rq, yld_both_empty); + else { + struct prio_array *old_array = p->array; + int old_prio = p->prio; + + if (idleprio_task(p)) { + dequeue_task(p, rq); + enqueue_task(p, rq); + goto out_release; + } + /* p->prio will be updated in requeue_task via queue_expired */ + if (!rt_task(p)) + p->array = rq->expired; + requeue_task(p, rq, old_array, old_prio); + } +out_release: /* * Since we are going to call schedule() anyway, there's * no need to preempt or enable interrupts: @@ -4902,6 +5142,8 @@ break; case SCHED_NORMAL: case SCHED_BATCH: + case SCHED_ISO: + case SCHED_IDLEPRIO: ret = 0; break; } @@ -4926,6 +5168,8 @@ break; case SCHED_NORMAL: case SCHED_BATCH: + case SCHED_ISO: + case SCHED_IDLEPRIO: ret = 0; } return ret; @@ -4959,8 +5203,8 @@ if (retval) goto out_unlock; - jiffies_to_timespec(p->policy == SCHED_FIFO ? - 0 : task_timeslice(p), &t); + t = ns_to_timespec(p->policy == SCHED_FIFO ? 0 : + MS_TO_NS(task_timeslice(p))); read_unlock(&tasklist_lock); retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; out_nounlock: @@ -5056,10 +5300,10 @@ struct rq *rq = cpu_rq(cpu); unsigned long flags; - idle->timestamp = sched_clock(); - idle->sleep_avg = 0; - idle->array = NULL; - idle->prio = idle->normal_prio = MAX_PRIO; + bitmap_zero(idle->bitmap, PRIO_RANGE); + idle->timestamp = idle->last_ran = sched_clock(); + idle->array = rq->active; + idle->prio = idle->normal_prio = NICE_TO_PRIO(0); idle->state = TASK_RUNNING; idle->cpus_allowed = cpumask_of_cpu(cpu); set_task_cpu(idle, cpu); @@ -5178,7 +5422,7 @@ goto out; set_task_cpu(p, dest_cpu); - if (p->array) { + if (task_queued(p)) { /* * Sync timestamp with rq_dest's before activating. * The same thing could be achieved by doing this step @@ -5189,8 +5433,7 @@ + rq_dest->most_recent_timestamp; deactivate_task(p, rq_src); __activate_task(p, rq_dest); - if (TASK_PREEMPTS_CURR(p, rq_dest)) - resched_task(rq_dest->curr); + try_preempt(p, rq_dest); } ret = 1; out: @@ -5487,7 +5730,7 @@ /* Idle task back to normal (off runqueue, low prio) */ rq = task_rq_lock(rq->idle, &flags); deactivate_task(rq->idle, rq); - rq->idle->static_prio = MAX_PRIO; + rq->idle->static_prio = NICE_TO_PRIO(0); __setscheduler(rq->idle, SCHED_NORMAL, 0); migrate_dead_tasks(cpu); task_rq_unlock(rq, &flags); @@ -7013,6 +7256,13 @@ /* Move init over to a non-isolated CPU */ if (set_cpus_allowed(current, non_isolated_cpus) < 0) BUG(); + + /* + * Assume that every added cpu gives us slightly less overall latency + * allowing us to increase the base rr_interval, but in a non linear + * fashion. 
+ */ + rr_interval *= 1 + ilog2(num_online_cpus()); } #else void __init sched_init_smp(void) @@ -7035,6 +7285,16 @@ int i, j, k; int highest_cpu = 0; + /* Generate the priority matrix */ + for (i = 0; i < PRIO_RANGE; i++) { + bitmap_fill(prio_matrix[i], PRIO_RANGE); + j = PRIO_RANGE * PRIO_RANGE / (PRIO_RANGE - i); + for (k = 0; k <= PRIO_RANGE * (PRIO_RANGE - 1); k += j) { + __clear_bit(PRIO_RANGE - 1 - (k / PRIO_RANGE), + prio_matrix[i]); + } + } + for_each_possible_cpu(i) { struct prio_array *array; struct rq *rq; @@ -7042,12 +7302,20 @@ rq = cpu_rq(i); spin_lock_init(&rq->lock); lockdep_set_class(&rq->lock, &rq->rq_lock_key); + rq->iso_ticks = 0; rq->nr_running = 0; + rq->nr_idleprio = 0; + rq->prio_rotation = 0; rq->active = rq->arrays; + rq->idleprio = rq->active; rq->expired = rq->arrays + 1; - rq->best_expired_prio = MAX_PRIO; + reset_prio_levels(rq); + rq->dyn_bitmap = rq->active->prio_bitmap; + rq->exp_bitmap = rq->expired->prio_bitmap; #ifdef CONFIG_SMP + rq->active->rq = rq; + rq->expired->rq = rq; rq->sd = NULL; for (j = 1; j < 3; j++) rq->cpu_load[j] = 0; @@ -7060,17 +7328,16 @@ atomic_set(&rq->nr_iowait, 0); for (j = 0; j < 2; j++) { + array = rq->arrays + j; - for (k = 0; k < MAX_PRIO; k++) { + for (k = 0; k <= MAX_PRIO; k++) INIT_LIST_HEAD(array->queue + k); - __clear_bit(k, array->bitmap); - } - // delimiter for bitsearch - __set_bit(MAX_PRIO, array->bitmap); + bitmap_zero(array->prio_bitmap, MAX_PRIO); + /* delimiter for bitsearch */ + __set_bit(MAX_PRIO, array->prio_bitmap); } highest_cpu = i; } - set_load_weight(&init_task); #ifdef CONFIG_SMP @@ -7125,25 +7392,25 @@ #ifdef CONFIG_MAGIC_SYSRQ void normalize_rt_tasks(void) { - struct prio_array *array; struct task_struct *g, *p; unsigned long flags; struct rq *rq; + int queued; read_lock_irq(&tasklist_lock); do_each_thread(g, p) { - if (!rt_task(p)) + if (!rt_task(p) && !iso_task(p)) continue; spin_lock_irqsave(&p->pi_lock, flags); rq = __task_rq_lock(p); - array = p->array; - if (array) + queued = task_queued(p); + if (queued) deactivate_task(p, task_rq(p)); __setscheduler(p, SCHED_NORMAL, 0); - if (array) { + if (queued) { __activate_task(p, task_rq(p)); resched_task(rq->curr); } Index: linux-2.6.22-ck1/kernel/sysctl.c =================================================================== --- linux-2.6.22-ck1.orig/kernel/sysctl.c 2007-07-10 14:55:00.000000000 +1000 +++ linux-2.6.22-ck1/kernel/sysctl.c 2007-07-10 14:55:23.000000000 +1000 @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -70,6 +71,7 @@ extern char core_pattern[]; extern int pid_max; extern int min_free_kbytes; +extern int vm_tail_largefiles; extern int printk_ratelimit_jiffies; extern int printk_ratelimit_burst; extern int pid_max_min, pid_max_max; @@ -78,6 +80,10 @@ extern int compat_log; extern int maps_protect; extern int sysctl_stat_interval; +extern int rr_interval; +extern int sched_interactive; +extern int sched_iso_cpu; +extern int sched_iso_period; /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ static int maxolduid = 65535; @@ -161,6 +167,14 @@ #endif +/* Constants for minimum and maximum testing. + We use these as one-element integer vectors. 
*/ +static int __read_mostly zero; +static int __read_mostly one = 1; +static int __read_mostly one_hundred = 100; +static int __read_mostly five_thousand = 5000; + + /* The default sysctl tables: */ static ctl_table root_table[] = { @@ -501,6 +515,47 @@ .mode = 0444, .proc_handler = &proc_dointvec, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "rr_interval", + .data = &rr_interval, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &one, + .extra2 = &five_thousand, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "interactive", + .data = &sched_interactive, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "iso_cpu", + .data = &sched_iso_cpu, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &one_hundred, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "iso_period", + .data = &sched_iso_period, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &one, + .extra2 = &one_hundred, + }, #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) { .ctl_name = KERN_UNKNOWN_NMI_PANIC, @@ -619,14 +674,16 @@ { .ctl_name = 0 } }; -/* Constants for minimum and maximum testing in vm_table. - We use these as one-element integer vectors. */ -static int zero; -static int one_hundred = 100; - - static ctl_table vm_table[] = { { + .ctl_name = CTL_UNNUMBERED, + .procname = "tail_largefiles", + .data = &vm_tail_largefiles, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { .ctl_name = VM_OVERCOMMIT_MEMORY, .procname = "overcommit_memory", .data = &sysctl_overcommit_memory, @@ -705,16 +762,24 @@ .proc_handler = &proc_dointvec, }, { - .ctl_name = VM_SWAPPINESS, - .procname = "swappiness", - .data = &vm_swappiness, - .maxlen = sizeof(vm_swappiness), + .ctl_name = CTL_UNNUMBERED, + .procname = "mapped", + .data = &vm_mapped, + .maxlen = sizeof(vm_mapped), .mode = 0644, .proc_handler = &proc_dointvec_minmax, .strategy = &sysctl_intvec, .extra1 = &zero, .extra2 = &one_hundred, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "hardmaplimit", + .data = &vm_hardmaplimit, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, #ifdef CONFIG_HUGETLB_PAGE { .ctl_name = VM_HUGETLB_PAGES, @@ -882,6 +947,32 @@ .extra1 = &zero, }, #endif +#ifdef CONFIG_SWAP_PREFETCH + { + .ctl_name = CTL_UNNUMBERED, + .procname = "swap_prefetch", + .data = &swap_prefetch, + .maxlen = sizeof(swap_prefetch), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "swap_prefetch_delay", + .data = &swap_prefetch_delay, + .maxlen = sizeof(swap_prefetch_delay), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "swap_prefetch_sleep", + .data = &swap_prefetch_sleep, + .maxlen = sizeof(swap_prefetch_sleep), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif { .ctl_name = 0 } }; Index: linux-2.6.22-ck1/Documentation/sched-design.txt =================================================================== --- linux-2.6.22-ck1.orig/Documentation/sched-design.txt 2007-07-10 14:55:00.000000000 +1000 +++ linux-2.6.22-ck1/Documentation/sched-design.txt 2007-07-10 14:55:02.000000000 +1000 @@ -1,11 +1,14 @@ - Goals, Design and Implementation of the - new ultra-scalable 
O(1) scheduler + Goals, Design and Implementation of the ultra-scalable O(1) scheduler by + Ingo Molnar and the Staircase Deadline cpu scheduler policy designed by + Con Kolivas. - This is an edited version of an email Ingo Molnar sent to - lkml on 4 Jan 2002. It describes the goals, design, and - implementation of Ingo's new ultra-scalable O(1) scheduler. - Last Updated: 18 April 2002. + This was originally an edited version of an email Ingo Molnar sent to + lkml on 4 Jan 2002. It describes the goals, design, and implementation + of Ingo's ultra-scalable O(1) scheduler. It now contains a description + of the Staircase Deadline priority scheduler that was built on this + design. + Last Updated: Fri, 4 May 2007 Goal @@ -163,3 +166,222 @@ code is smaller than the old one. Ingo + + +Staircase Deadline cpu scheduler policy +================================================ + +Design summary +============== + +A novel design which incorporates a foreground-background descending priority +system (the staircase) via a bandwidth allocation matrix according to nice +level. + + +Features +======== + +A starvation free, strict fairness O(1) scalable design with interactivity +as good as the above restrictions can provide. There is no interactivity +estimator, no sleep/run measurements and only simple fixed accounting. +The design and accounting are strict enough that task behaviour +can be modelled and maximum scheduling latencies can be predicted by +the virtual deadline mechanism that manages runqueues. The prime concern +in this design is to maintain fairness at all costs, as determined by nice level, +yet to maintain as good interactivity as can be allowed within the +constraints of strict fairness. + + +Design description +================== + +SD works off the principle of providing each task a quota of runtime that it is +allowed to run at a number of priority levels determined by its static priority +(ie. its nice level). If the task uses up its quota it has its priority +decremented to the next level determined by a priority matrix. Once the +runtime quota of every priority level has been consumed, a task is queued on the +"expired" array. When no other tasks exist with quota, the expired array is +activated and fresh quotas are handed out. This is all done in O(1). + +Design details +============== + +Each task keeps a record of its own entitlement of cpu time. Most of the rest of +these details apply to non-realtime tasks, as rt task management is +straightforward. + +Each runqueue keeps a record of what major epoch it is up to in the +rq->prio_rotation field which is incremented on each major epoch. It also +keeps a record of the current prio_level for each static priority task. + +Each task keeps a record of what major runqueue epoch it was last running +on in p->rotation. It also keeps a record of what priority levels it has +already been allocated quota from during this epoch in a bitmap p->bitmap. + +The only tunable that determines all other details is the RR_INTERVAL. This +is set to 8ms, and is scaled gently upwards with more cpus. This value is +tunable via a /proc interface. + +All tasks are initially given a quota based on RR_INTERVAL. This is equal to +RR_INTERVAL between nice values of -6 and 0, half that size above nice 0, and +progressively larger for nice values from -1 to -20. This is assigned to +p->quota and only changes with changes in nice level.
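The set_quota() helper itself is not part of the hunks quoted in this excerpt, so the following is only an illustrative userspace sketch of the rule just described. The curve used for nice values below -6 is an assumption, chosen so that it reproduces the "nice -10 tasks have a quota 2.5 times the RR_INTERVAL" figure quoted in the deadline modelling section further on; treat the exact formula as hypothetical rather than the patch's implementation.

/* Illustrative sketch only: the patch's real quota helper is not shown in
 * this excerpt. The -6..0 and positive-nice cases follow the prose above;
 * the nice < -6 curve is an assumption that matches the 2.5x figure for
 * nice -10 used later in this document. */
#include <stdio.h>

#define RR_INTERVAL_MS	8		/* base value before SMP scaling */
#define MS_TO_NS(x)	((x) * 1000000LL)

static long long nice_to_quota_ns(int nice)
{
	long long ms = RR_INTERVAL_MS;

	if (nice > 0)			/* above nice 0: half the interval */
		ms = RR_INTERVAL_MS / 2;
	else if (nice < -6)		/* progressively larger towards -20 */
		ms = RR_INTERVAL_MS * (long long)nice * nice / 40;
	return MS_TO_NS(ms);
}

int main(void)
{
	int nice;

	for (nice = -20; nice <= 19; nice++)
		printf("nice %3d: quota %lld ns\n", nice, nice_to_quota_ns(nice));
	return 0;
}

This per-nice value is what p->time_slice is refilled from each time the task earns quota at a new priority level, as described next.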
+ +As a task is first queued, it checks in recalc_task_prio to see if it has run at +this runqueue's current priority rotation. If it has not, it will have its +p->prio level set according to the first slot in a "priority matrix", will be +given a p->time_slice equal to the p->quota, and will have its allocation bitmap bit +set in p->bitmap for this prio level. It is then queued on the current active +priority array. + +If a task has already been running during this major epoch, and it has +p->time_slice left and the rq->prio_quota for the task's p->prio still +has quota, it will be placed back on the active array, but no more quota +will be added. + +If a task has been running during this major epoch, but does not have +p->time_slice left, it will find the next lowest priority in its bitmap that it +has not been allocated quota from. It then gets a full quota in +p->time_slice. It is then queued on the current active priority array at the +newly determined lower priority. + +If a task has been running during this major epoch, and does not have +any entitlement left in p->bitmap and no time_slice left, it will have its +bitmap cleared, and be queued at its best prio again, but on the expired +priority array. + +When a task is queued, it has its relevant bit set in the array->prio_bitmap. + +p->time_slice is stored in nanoseconds and is updated via update_cpu_clock on +schedule() and scheduler_tick. If p->time_slice drops below zero then +recalc_task_prio is rerun and the task is rescheduled. + + +Priority Matrix +=============== + +In order to minimise the latencies between tasks of different nice levels +running concurrently, the dynamic priority slots where different nice levels +are queued are dithered instead of being sequential. What this means is that +there are 40 priority slots where a task may run during one major rotation, +and the allocation of slots is dependent on nice level. In the +following table, a zero represents a slot where the task may run. + +PRIORITY:0..................20.................39 +nice -20 0000000000000000000000000000000000000000 +nice -10 1000100010001000100010001000100010010000 +nice 0 1010101010101010101010101010101010101010 +nice 5 1011010110110101101101011011010110110110 +nice 10 1110111011101110111011101110111011101110 +nice 15 1111111011111110111111101111111011111110 +nice 19 1111111111111111111111111111111111111110 + +As can be seen, a nice -20 task runs in every priority slot whereas a nice 19 +task only runs one slot per major rotation. This dithered table allows for the +smallest possible maximum latencies between tasks of varying nice levels, thus +allowing vastly different nice levels to be used. + +SCHED_BATCH tasks are managed slightly differently, receiving only the top +slots from their priority bitmap, giving them the same cpu as SCHED_NORMAL but +slightly higher latencies. + + +Modelling deadline behaviour +============================ + +As the accounting in this design is hard and not modified by sleep average +calculations or interactivity modifiers, it is possible to accurately +predict the maximum latency that a task may experience under different +conditions. This is a virtual deadline mechanism enforced by mandatory +timeslice expiration and not outside bandwidth measurement. + +The maximum duration a task can run during one major epoch is determined by its +nice value. Nice 0 tasks can run at 19 different priority levels for RR_INTERVAL +duration during each epoch.
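These per-nice slot allocations come straight from the prio_matrix generation loop this patch adds to sched_init() in kernel/sched.c; the dithered table above is generated, not hand-maintained. The standalone sketch below re-runs that loop in userspace (a plain char array stands in for the kernel bitmap helpers) and also prints the first run slot of each nice level, which is what bounds how long a newly woken task of that nice level can wait.

/* Userspace re-run of the prio_matrix generation loop added to sched_init()
 * earlier in this patch; bitmap_fill()/__clear_bit() are replaced with a
 * plain array so it compiles and runs directly. A 0 marks a slot the nice
 * level may run in, matching the table in this document. */
#include <stdio.h>
#include <string.h>

#define PRIO_RANGE 40

static char prio_matrix[PRIO_RANGE][PRIO_RANGE];

int main(void)
{
	int i, j, k;

	for (i = 0; i < PRIO_RANGE; i++) {
		memset(prio_matrix[i], 1, PRIO_RANGE);	/* bitmap_fill() */
		j = PRIO_RANGE * PRIO_RANGE / (PRIO_RANGE - i);
		for (k = 0; k <= PRIO_RANGE * (PRIO_RANGE - 1); k += j)
			prio_matrix[i][PRIO_RANGE - 1 - (k / PRIO_RANGE)] = 0;	/* __clear_bit() */
	}

	for (i = 0; i < PRIO_RANGE; i++) {
		int first = -1;

		printf("nice %3d ", i - 20);
		for (k = 0; k < PRIO_RANGE; k++) {
			putchar('0' + prio_matrix[i][k]);
			if (first < 0 && !prio_matrix[i][k])
				first = k;
		}
		printf("   first run slot: %d\n", first);
	}
	return 0;
}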
Nice 10 tasks can run at 9 priority levels for each +epoch, and so on. The table in the priority matrix above demonstrates how this +is enforced. + +Therefore the maximum duration a runqueue epoch can take is determined by +the number of tasks running, and their nice level. Beyond that, the maximum +time a task can wait before it gets scheduled is +determined by the position of its first slot on the matrix. + +In the following examples, these are _worst case scenarios_ and would rarely +occur, but can be modelled nonetheless to determine the maximum possible +latency. + +So for example, if two nice 0 tasks are running, and one has just expired as +another is activated for the first time receiving a full quota for this +runqueue rotation, the first task will wait: + +nr_tasks * max_duration + nice_difference * rr_interval +1 * 19 * RR_INTERVAL + 0 = 152ms + +In the presence of a nice 10 task, a nice 0 task would wait a maximum of +1 * 10 * RR_INTERVAL + 0 = 80ms + +In the presence of a nice 0 task, a nice 10 task would wait a maximum of +1 * 19 * RR_INTERVAL + 1 * RR_INTERVAL = 160ms + +More useful than these values, though, are the average latencies which are +a matter of determining the average distance between priority slots of +different nice values and multiplying them by the tasks' quota. For example, +in the presence of a nice -10 task, a nice 0 task will wait either one or +two slots. Given that nice -10 tasks have a quota 2.5 times the RR_INTERVAL, +this means the latencies will alternate between 2.5 and 5 RR_INTERVALs or +20 and 40ms respectively (on uniprocessor at 1000HZ). + + +Achieving interactivity +======================= + +A requirement of this scheduler design was to achieve good interactivity +despite being a completely fair deadline based design. The disadvantage of +designs that try to achieve interactivity is that they usually do so at +the expense of maintaining fairness. As cpu speeds increase, the requirement +for some sort of metered unfairness towards interactive tasks becomes a less +desirable phenomenon, but low latency and fairness remain mandatory for +good interactive performance. + +This design relies on the fact that interactive tasks, by their nature, +sleep often. Most fair scheduling designs end up penalising such tasks +indirectly, giving them less than their fair possible share because of the +sleep, and have to use a mechanism of bonusing their priority to offset +this based on the duration they sleep. This becomes increasingly inaccurate +as the number of running tasks rises and more tasks spend time waiting on +runqueues rather than sleeping, and it is impossible to tell whether the +task that's waiting on a runqueue only intends to run for a short period and +then sleep again after that runqueue wait. Furthermore, all such designs rely +on a period of time to pass to accumulate some form of statistic on the task +before deciding on how much to give them preference. The shorter this period, +the more rapidly bursts of cpu ruin the interactive tasks' behaviour. The +longer this period, the longer it takes for interactive tasks to get low +scheduling latencies and fair cpu. + +This design does not measure sleep time at all. Interactive tasks that sleep +often will wake up having consumed very little if any of their quota for +the current major priority rotation. The longer they have slept, the less +likely they are to even be on the current major priority rotation.
Once +woken up, though, they get to use up their full quota for that epoch, +whether part of a quota remains or a full quota. Overall, however, they +can still only run as much cpu time for that epoch as any other task of the +same nice level. This means that two tasks behaving completely differently, +from fully cpu bound to waking/sleeping extremely frequently, will still +get the same quota of cpu, but the latter will be using its quota for that +epoch in bursts rather than continuously. This guarantees that interactive +tasks get the same amount of cpu as cpu bound ones. + +The other requirement of interactive tasks is to obtain low latencies +when they are scheduled. Unlike fully cpu bound tasks and the maximum +possible latencies described in the modelling deadline behaviour section +above, tasks that sleep will wake up with quota available, usually at the +current runqueue's priority_level or better. This means that the worst latency +they are likely to see is one RR_INTERVAL, and often they will preempt the +current task if it is not of a sleeping nature. This then guarantees very +low latency for interactive tasks, and the lowest latencies for the least +cpu bound tasks. + + +Fri, 4 May 2007 +Con Kolivas Index: linux-2.6.22-ck1/Documentation/sysctl/kernel.txt =================================================================== --- linux-2.6.22-ck1.orig/Documentation/sysctl/kernel.txt 2007-07-10 14:55:00.000000000 +1000 +++ linux-2.6.22-ck1/Documentation/sysctl/kernel.txt 2007-07-10 14:55:20.000000000 +1000 @@ -25,6 +25,9 @@ - domainname - hostname - hotplug +- interactive +- iso_cpu +- iso_period - java-appletviewer [ binfmt_java, obsolete ] - java-interpreter [ binfmt_java, obsolete ] - kstack_depth_to_print [ X86 only ] @@ -43,6 +46,7 @@ - printk - real-root-dev ==> Documentation/initrd.txt - reboot-cmd [ SPARC only ] +- rr_interval - rtsig-max - rtsig-nr - sem @@ -164,6 +168,40 @@ ============================================================== +interactive: + +The staircase-deadline cpu scheduler can be set in either purely +forward-looking mode for absolutely rigid fairness and cpu distribution +according to nice level, or it can allow a small per-process history +to smooth out cpu usage perturbations common in interactive tasks by +enabling this sysctl. While small fairness issues can arise with this +enabled, overall fairness is usually still strongly maintained and +starvation is never possible. Enabling this can significantly smooth +out 3d graphics and games. + +Default value is 1 (enabled). + +============================================================== + +iso_cpu: + +This sets the percentage of cpu that unprivileged SCHED_ISO tasks can +run effectively at realtime priority, averaged over a rolling iso_period +seconds. + +Set to 80 (percent) by default. + +============================================================== + +iso_period: + +This sets the number of seconds over which SCHED_ISO cpu usage is averaged +to see if it exceeds its allocated cpu bandwidth. + +Set to 5 (seconds) by default. + +============================================================== + l2cr: (PPC only) This flag controls the L2 cache of G3 processor boards. If @@ -288,6 +326,19 @@ ============================================================== +rr_interval: + +This is the smallest duration that any cpu process scheduling unit +will run for. Increasing this value can increase throughput of cpu +bound tasks substantially but at the expense of increased latencies +overall.
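The cpu-count dependence of the default comes from the sched_init_smp() hunk earlier in this patch, which applies rr_interval *= 1 + ilog2(num_online_cpus()) once the secondary cpus are up. A minimal userspace sketch of the resulting defaults, assuming the 8 ms base value given in Documentation/sched-design.txt (ilog2_u() below is a local helper for the sketch, not a kernel API):

/* Sketch of the default rr_interval after the SMP scaling applied in
 * sched_init_smp() above; the 8 ms base is the value quoted in the
 * scheduler design document, not something computed here. */
#include <stdio.h>

static int ilog2_u(unsigned int n)	/* floor(log2(n)), n > 0 */
{
	int log = 0;

	while (n >>= 1)
		log++;
	return log;
}

int main(void)
{
	unsigned int cpus;

	for (cpus = 1; cpus <= 16; cpus *= 2)
		printf("%2u cpu(s): default rr_interval = %d ms\n",
		       cpus, 8 * (1 + ilog2_u(cpus)));
	return 0;
}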
This value is in milliseconds and the default value chosen +depends on the number of cpus available at scheduler initialisation +with a minimum of 8. + +Valid values are from 1-5000. + +============================================================== + rtsig-max & rtsig-nr: The file rtsig-max can be used to tune the maximum number Index: linux-2.6.22-ck1/fs/pipe.c =================================================================== --- linux-2.6.22-ck1.orig/fs/pipe.c 2007-07-10 14:55:00.000000000 +1000 +++ linux-2.6.22-ck1/fs/pipe.c 2007-07-10 14:55:02.000000000 +1000 @@ -41,12 +41,7 @@ { DEFINE_WAIT(wait); - /* - * Pipes are system-local resources, so sleeping on them - * is considered a noninteractive wait: - */ - prepare_to_wait(&pipe->wait, &wait, - TASK_INTERRUPTIBLE | TASK_NONINTERACTIVE); + prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE); if (pipe->inode) mutex_unlock(&pipe->inode->i_mutex); schedule(); Index: linux-2.6.22-ck1/fs/proc/array.c =================================================================== --- linux-2.6.22-ck1.orig/fs/proc/array.c 2007-07-10 14:55:00.000000000 +1000 +++ linux-2.6.22-ck1/fs/proc/array.c 2007-07-10 14:55:02.000000000 +1000 @@ -165,7 +165,6 @@ rcu_read_lock(); buffer += sprintf(buffer, "State:\t%s\n" - "SleepAVG:\t%lu%%\n" "Tgid:\t%d\n" "Pid:\t%d\n" "PPid:\t%d\n" @@ -173,7 +172,6 @@ "Uid:\t%d\t%d\t%d\t%d\n" "Gid:\t%d\t%d\t%d\t%d\n", get_task_state(p), - (p->sleep_avg/1024)*100/(1020000000/1024), p->tgid, p->pid, pid_alive(p) ? rcu_dereference(p->real_parent)->tgid : 0, pid_alive(p) && p->ptrace ? rcu_dereference(p->parent)->pid : 0, Index: linux-2.6.22-ck1/include/linux/init_task.h =================================================================== --- linux-2.6.22-ck1.orig/include/linux/init_task.h 2007-07-10 14:55:00.000000000 +1000 +++ linux-2.6.22-ck1/include/linux/init_task.h 2007-07-10 14:55:20.000000000 +1000 @@ -125,13 +125,15 @@ .prio = MAX_PRIO-20, \ .static_prio = MAX_PRIO-20, \ .normal_prio = MAX_PRIO-20, \ + .rotation = 0, \ .policy = SCHED_NORMAL, \ .cpus_allowed = CPU_MASK_ALL, \ .mm = NULL, \ .active_mm = &init_mm, \ .run_list = LIST_HEAD_INIT(tsk.run_list), \ .ioprio = 0, \ - .time_slice = HZ, \ + .time_slice = 1000000000, \ + .quota = 1000000000, \ .tasks = LIST_HEAD_INIT(tsk.tasks), \ .ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children), \ .ptrace_list = LIST_HEAD_INIT(tsk.ptrace_list), \ @@ -158,6 +160,7 @@ .signal = {{0}}}, \ .blocked = {{0}}, \ .alloc_lock = __SPIN_LOCK_UNLOCKED(tsk.alloc_lock), \ + .mutexes_held = 0, \ .journal_info = NULL, \ .cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \ .fs_excl = ATOMIC_INIT(0), \ Index: linux-2.6.22-ck1/kernel/softirq.c =================================================================== --- linux-2.6.22-ck1.orig/kernel/softirq.c 2007-07-10 14:55:00.000000000 +1000 +++ linux-2.6.22-ck1/kernel/softirq.c 2007-07-10 14:55:02.000000000 +1000 @@ -488,7 +488,7 @@ static int ksoftirqd(void * __bind_cpu) { - set_user_nice(current, 19); + set_user_nice(current, 15); current->flags |= PF_NOFREEZE; set_current_state(TASK_INTERRUPTIBLE); Index: linux-2.6.22-ck1/kernel/workqueue.c =================================================================== --- linux-2.6.22-ck1.orig/kernel/workqueue.c 2007-07-10 14:55:00.000000000 +1000 +++ linux-2.6.22-ck1/kernel/workqueue.c 2007-07-10 14:55:02.000000000 +1000 @@ -285,8 +285,6 @@ if (!cwq->wq->freezeable) current->flags |= PF_NOFREEZE; - set_user_nice(current, -5); - for (;;) { prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE); if 
(!freezing(current) && Index: linux-2.6.22-ck1/kernel/kthread.c =================================================================== --- linux-2.6.22-ck1.orig/kernel/kthread.c 2007-07-10 14:55:00.000000000 +1000 +++ linux-2.6.22-ck1/kernel/kthread.c 2007-07-10 14:55:02.000000000 +1000 @@ -223,7 +223,6 @@ ignore_signals(tsk); - set_user_nice(tsk, -5); set_cpus_allowed(tsk, CPU_MASK_ALL); } Index: linux-2.6.22-ck1/kernel/fork.c =================================================================== --- linux-2.6.22-ck1.orig/kernel/fork.c 2007-07-10 14:55:00.000000000 +1000 +++ linux-2.6.22-ck1/kernel/fork.c 2007-07-10 14:55:20.000000000 +1000 @@ -1063,6 +1063,7 @@ p->io_context = NULL; p->io_wait = NULL; p->audit_context = NULL; + p->mutexes_held = 0; cpuset_fork(p); #ifdef CONFIG_NUMA p->mempolicy = mpol_copy(p->mempolicy); Index: linux-2.6.22-ck1/kernel/mutex.c =================================================================== --- linux-2.6.22-ck1.orig/kernel/mutex.c 2007-07-10 14:55:00.000000000 +1000 +++ linux-2.6.22-ck1/kernel/mutex.c 2007-07-10 14:55:20.000000000 +1000 @@ -60,6 +60,16 @@ static void fastcall noinline __sched __mutex_lock_slowpath(atomic_t *lock_count); +static inline void inc_mutex_count(void) +{ + current->mutexes_held++; +} + +static inline void dec_mutex_count(void) +{ + current->mutexes_held--; +} + /*** * mutex_lock - acquire the mutex * @lock: the mutex to be acquired @@ -89,6 +99,7 @@ * 'unlocked' into 'locked' state. */ __mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath); + inc_mutex_count(); } EXPORT_SYMBOL(mutex_lock); @@ -114,6 +125,7 @@ * into 'unlocked' state: */ __mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath); + dec_mutex_count(); } EXPORT_SYMBOL(mutex_unlock); @@ -283,9 +295,14 @@ */ int fastcall __sched mutex_lock_interruptible(struct mutex *lock) { + int ret; + might_sleep(); - return __mutex_fastpath_lock_retval + ret = __mutex_fastpath_lock_retval (&lock->count, __mutex_lock_interruptible_slowpath); + if (likely(!ret)) + inc_mutex_count(); + return ret; } EXPORT_SYMBOL(mutex_lock_interruptible); @@ -340,8 +357,12 @@ */ int fastcall __sched mutex_trylock(struct mutex *lock) { - return __mutex_fastpath_trylock(&lock->count, + int ret = __mutex_fastpath_trylock(&lock->count, __mutex_trylock_slowpath); + + if (likely(ret)) + inc_mutex_count(); + return ret; } EXPORT_SYMBOL(mutex_trylock); Index: linux-2.6.22-ck1/block/cfq-iosched.c =================================================================== --- linux-2.6.22-ck1.orig/block/cfq-iosched.c 2007-07-10 14:55:00.000000000 +1000 +++ linux-2.6.22-ck1/block/cfq-iosched.c 2007-07-10 14:55:21.000000000 +1000 @@ -1276,10 +1276,12 @@ printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class); case IOPRIO_CLASS_NONE: /* - * no prio set, place us in the middle of the BE classes + * Select class and ioprio according to policy and nice */ + cfqq->ioprio_class = task_policy_ioprio_class(tsk); cfqq->ioprio = task_nice_ioprio(tsk); - cfqq->ioprio_class = IOPRIO_CLASS_BE; + if (cfqq->ioprio_class == IOPRIO_CLASS_IDLE) + cfq_clear_cfqq_idle_window(cfqq); break; case IOPRIO_CLASS_RT: cfqq->ioprio = task_ioprio(tsk); Index: linux-2.6.22-ck1/include/linux/ioprio.h =================================================================== --- linux-2.6.22-ck1.orig/include/linux/ioprio.h 2007-07-10 14:55:00.000000000 +1000 +++ linux-2.6.22-ck1/include/linux/ioprio.h 2007-07-10 14:55:21.000000000 +1000 @@ -22,7 +22,7 @@ * class, the default for any process. 
IDLE is the idle scheduling class, it * is only served when no one else is using the disk. */ -enum { +enum ioprio_class { IOPRIO_CLASS_NONE, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, @@ -51,8 +51,25 @@ return IOPRIO_PRIO_DATA(task->ioprio); } +static inline enum ioprio_class + task_policy_ioprio_class(struct task_struct *task) +{ + if (rt_task(task)) + return IOPRIO_CLASS_RT; + if (idleprio_task(task)) + return IOPRIO_CLASS_IDLE; + return IOPRIO_CLASS_BE; +} + static inline int task_nice_ioprio(struct task_struct *task) { + if (rt_task(task)) + return (MAX_RT_PRIO - task->rt_priority) * IOPRIO_BE_NR / + (MAX_RT_PRIO + 1); + if (iso_task(task)) + return 0; + if (idleprio_task(task)) + return IOPRIO_BE_NR - 1; return (task_nice(task) + 20) / 5; } Index: linux-2.6.22-ck1/Documentation/sysctl/vm.txt =================================================================== --- linux-2.6.22-ck1.orig/Documentation/sysctl/vm.txt 2007-07-10 14:55:00.000000000 +1000 +++ linux-2.6.22-ck1/Documentation/sysctl/vm.txt 2007-07-10 14:55:23.000000000 +1000 @@ -22,6 +22,8 @@ - dirty_background_ratio - dirty_expire_centisecs - dirty_writeback_centisecs +- hardmaplimit +- mapped - max_map_count - min_free_kbytes - laptop_mode @@ -31,12 +33,15 @@ - min_unmapped_ratio - min_slab_ratio - panic_on_oom +- swap_prefetch +- swap_prefetch_delay +- swap_prefetch_sleep ============================================================== dirty_ratio, dirty_background_ratio, dirty_expire_centisecs, dirty_writeback_centisecs, vfs_cache_pressure, laptop_mode, -block_dump, swap_token_timeout, drop-caches: +block_dump, swap_token_timeout, drop-caches, tail_largefiles: See Documentation/filesystems/proc.txt @@ -86,6 +91,27 @@ ============================================================== +hardmaplimit: + +This flag makes the vm adhere to the mapped value as closely as possible +except in the most extreme vm stress where doing so would provoke an out +of memory condition (see mapped below). + +Enabled by default. + +============================================================== + +mapped: + +This is the percentage ram that is filled with mapped pages (applications) +before the vm will start reclaiming mapped pages by moving them to swap. +It is altered by the relative stress of the vm at the time so is not +strictly adhered to to prevent provoking out of memory kills. + +Set to 66 by default. + +============================================================== + max_map_count: This file contains the maximum number of memory map areas a process @@ -216,3 +242,37 @@ The default value is 0. 1 and 2 are for failover of clustering. Please select either according to your policy of failover. + +============================================================== + +swap_prefetch + +This enables or disables the swap prefetching feature. When the virtual +memory subsystem has been extremely idle for at least swap_prefetch_sleep +seconds it will start copying back pages from swap into the swapcache and keep +a copy in swap. Valid values are 0 - 3. A value of 0 disables swap +prefetching, 1 enables it unless laptop_mode is enabled, 2 enables it in the +presence of laptop_mode, and 3 enables it unconditionally, ignoring whether +the system is idle or not. If set to 0, swap prefetch wil not even try to keep +record of ram swapped out to have the most minimal impact on performance. + +The default value is 1. 
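As a usage note, the current mode is exposed at /proc/sys/vm/swap_prefetch, registered in the kernel/sysctl.c hunk earlier in this patch under CONFIG_SWAP_PREFETCH. A minimal sketch that reads it back and reports the meaning listed above:

/* Minimal check of the swap_prefetch mode documented above. Reads
 * /proc/sys/vm/swap_prefetch, which only exists on kernels built with
 * CONFIG_SWAP_PREFETCH. */
#include <stdio.h>

int main(void)
{
	static const char *mode[] = {
		"disabled",
		"prefetch when idle, unless laptop_mode is active",
		"prefetch when idle, even with laptop_mode",
		"prefetch at all times",
	};
	FILE *f = fopen("/proc/sys/vm/swap_prefetch", "r");
	int val;

	if (!f) {
		perror("swap_prefetch sysctl (CONFIG_SWAP_PREFETCH not built?)");
		return 1;
	}
	if (fscanf(f, "%d", &val) != 1)
		val = -1;
	fclose(f);
	if (val >= 0 && val <= 3)
		printf("swap_prefetch = %d (%s)\n", val, mode[val]);
	else
		printf("swap_prefetch holds an unexpected value\n");
	return 0;
}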
+ +============================================================== + +swap_prefetch_delay + +This is the time in seconds that swap prefetching is delayed upon finding +the system is not idle (ie the vm is busy or non-niced cpu load is present). + +The default value is 1. + +============================================================== + +swap_prefetch_sleep + +This is the time in seconds that the swap prefetch kernel thread is put to +sleep for when the ram is found to be full and it is unable to prefetch +further. + +The default value is 5. Index: linux-2.6.22-ck1/include/linux/swap.h =================================================================== --- linux-2.6.22-ck1.orig/include/linux/swap.h 2007-07-10 14:55:00.000000000 +1000 +++ linux-2.6.22-ck1/include/linux/swap.h 2007-07-10 14:55:22.000000000 +1000 @@ -180,6 +180,7 @@ /* linux/mm/swap.c */ extern void FASTCALL(lru_cache_add(struct page *)); extern void FASTCALL(lru_cache_add_active(struct page *)); +extern void FASTCALL(lru_cache_add_tail(struct page *)); extern void FASTCALL(activate_page(struct page *)); extern void FASTCALL(mark_page_accessed(struct page *)); extern void lru_add_drain(void); @@ -188,9 +189,11 @@ extern void swap_setup(void); /* linux/mm/vmscan.c */ -extern unsigned long try_to_free_pages(struct zone **, gfp_t); +extern unsigned long try_to_free_pages(struct zone **, gfp_t, + struct task_struct *p); extern unsigned long shrink_all_memory(unsigned long nr_pages); -extern int vm_swappiness; +extern int vm_mapped; +extern int vm_hardmaplimit; extern int remove_mapping(struct address_space *mapping, struct page *page); extern long vm_total_pages; @@ -237,6 +240,7 @@ extern struct page * lookup_swap_cache(swp_entry_t); extern struct page * read_swap_cache_async(swp_entry_t, struct vm_area_struct *vma, unsigned long addr); +extern int add_to_swap_cache(struct page *page, swp_entry_t entry); /* linux/mm/swapfile.c */ extern long total_swap_pages; extern unsigned int nr_swapfiles; Index: linux-2.6.22-ck1/init/Kconfig =================================================================== --- linux-2.6.22-ck1.orig/init/Kconfig 2007-07-10 14:55:00.000000000 +1000 +++ linux-2.6.22-ck1/init/Kconfig 2007-07-10 14:55:22.000000000 +1000 @@ -105,6 +105,28 @@ used to provide more virtual memory than the actual RAM present in your computer. If unsure say Y. +config SWAP_PREFETCH + bool "Support for prefetching swapped memory" + depends on SWAP + default y + ---help--- + This option will allow the kernel to prefetch swapped memory pages + when idle. The pages will be kept on both swap and in swap_cache + thus avoiding the need for further I/O if either ram or swap space + is required. + + What this will do on workstations is slowly bring back applications + that have swapped out after memory intensive workloads back into + physical ram if you have free ram at a later stage and the machine + is relatively idle. This means that when you come back to your + computer after leaving it idle for a while, applications will come + to life faster. Note that your swap usage will appear to increase + but these are cached pages, can be dropped freely by the vm, and it + should stabilise around 50% swap usage maximum. + + Workstations and multiuser workstation servers will most likely want + to say Y. 
+ config SYSVIPC bool "System V IPC" ---help--- Index: linux-2.6.22-ck1/mm/Makefile =================================================================== --- linux-2.6.22-ck1.orig/mm/Makefile 2007-07-10 14:55:00.000000000 +1000 +++ linux-2.6.22-ck1/mm/Makefile 2007-07-10 14:55:22.000000000 +1000 @@ -17,6 +17,7 @@ obj-y += bounce.o endif obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o +obj-$(CONFIG_SWAP_PREFETCH) += swap_prefetch.o obj-$(CONFIG_HUGETLBFS) += hugetlb.o obj-$(CONFIG_NUMA) += mempolicy.o obj-$(CONFIG_SPARSEMEM) += sparse.o Index: linux-2.6.22-ck1/mm/swap.c =================================================================== --- linux-2.6.22-ck1.orig/mm/swap.c 2007-07-10 14:55:00.000000000 +1000 +++ linux-2.6.22-ck1/mm/swap.c 2007-07-10 14:55:23.000000000 +1000 @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -176,6 +177,7 @@ */ static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, }; static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, }; +static DEFINE_PER_CPU(struct pagevec, lru_add_tail_pvecs) = { 0, }; void fastcall lru_cache_add(struct page *page) { @@ -197,6 +199,31 @@ put_cpu_var(lru_add_active_pvecs); } +static void __pagevec_lru_add_tail(struct pagevec *pvec) +{ + int i; + struct zone *zone = NULL; + + for (i = 0; i < pagevec_count(pvec); i++) { + struct page *page = pvec->pages[i]; + struct zone *pagezone = page_zone(page); + + if (pagezone != zone) { + if (zone) + spin_unlock_irq(&zone->lru_lock); + zone = pagezone; + spin_lock_irq(&zone->lru_lock); + } + BUG_ON(PageLRU(page)); + SetPageLRU(page); + add_page_to_inactive_list_tail(zone, page); + } + if (zone) + spin_unlock_irq(&zone->lru_lock); + release_pages(pvec->pages, pvec->nr, pvec->cold); + pagevec_reinit(pvec); +} + static void __lru_add_drain(int cpu) { struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu); @@ -207,6 +234,9 @@ pvec = &per_cpu(lru_add_active_pvecs, cpu); if (pagevec_count(pvec)) __pagevec_lru_add_active(pvec); + pvec = &per_cpu(lru_add_tail_pvecs, cpu); + if (pagevec_count(pvec)) + __pagevec_lru_add_tail(pvec); } void lru_add_drain(void) @@ -403,6 +433,20 @@ } /* + * Function used uniquely to put pages back to the lru at the end of the + * inactive list to preserve the lru order. + */ +void fastcall lru_cache_add_tail(struct page *page) +{ + struct pagevec *pvec = &get_cpu_var(lru_add_tail_pvecs); + + page_cache_get(page); + if (!pagevec_add(pvec, page)) + __pagevec_lru_add_tail(pvec); + put_cpu_var(lru_add_pvecs); +} + +/* * Try to drop buffers from the pages in a pagevec */ void pagevec_strip(struct pagevec *pvec) @@ -514,6 +558,9 @@ * Right now other parts of the system means that we * _really_ don't want to cluster much more */ + + prepare_swap_prefetch(); + #ifdef CONFIG_HOTPLUG_CPU hotcpu_notifier(cpu_swap_callback, 0); #endif Index: linux-2.6.22-ck1/mm/swap_prefetch.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.22-ck1/mm/swap_prefetch.c 2007-07-10 14:55:22.000000000 +1000 @@ -0,0 +1,542 @@ +/* + * linux/mm/swap_prefetch.c + * + * Copyright (C) 2005-2007 Con Kolivas + * + * Written by Con Kolivas + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * sysctls: + * swap_prefetch: 0. Disable swap prefetching + * 1. Prefetch only when idle and not with laptop_mode + * 2. Prefetch when idle and with laptop_mode + * 3. Prefetch at all times. + * swap_prefetch_delay: Number of seconds to delay prefetching when system + * is not idle. + * swap_prefetch_sleep: Number of seconds to put kprefetchd to sleep when + * unable to prefetch. + */ +int swap_prefetch __read_mostly = 1; +int swap_prefetch_delay __read_mostly = 1; +int swap_prefetch_sleep __read_mostly = 5; + +#define PREFETCH_DELAY (HZ * swap_prefetch_delay) +#define PREFETCH_SLEEP ((HZ * swap_prefetch_sleep) ? : 1) + +struct swapped_root { + unsigned long busy; /* vm busy */ + spinlock_t lock; /* protects all data */ + struct list_head list; /* MRU list of swapped pages */ + struct radix_tree_root swap_tree; /* Lookup tree of pages */ + unsigned int count; /* Number of entries */ + unsigned int maxcount; /* Maximum entries allowed */ + struct kmem_cache *cache; /* Of struct swapped_entry */ +}; + +static struct swapped_root swapped = { + .lock = SPIN_LOCK_UNLOCKED, + .list = LIST_HEAD_INIT(swapped.list), + .swap_tree = RADIX_TREE_INIT(GFP_ATOMIC), +}; + +static struct task_struct *kprefetchd_task; + +/* + * We check to see no part of the vm is busy. If it is this will interrupt + * trickle_swap and wait another PREFETCH_DELAY. Purposefully racy. + */ +inline void delay_swap_prefetch(void) +{ + if (!test_bit(0, &swapped.busy)) + __set_bit(0, &swapped.busy); +} + +/* + * If laptop_mode is enabled don't prefetch to avoid hard drives + * doing unnecessary spin-ups unless swap_prefetch is explicitly + * set to a higher value. + */ +static inline int prefetch_enabled(void) +{ + if (swap_prefetch <= laptop_mode) + return 0; + return 1; +} + +static int kprefetchd_awake; + +/* + * Drop behind accounting which keeps a list of the most recently used swap + * entries. Entries are removed lazily by kprefetchd. + */ +void add_to_swapped_list(struct page *page) +{ + struct swapped_entry *entry; + unsigned long index, flags; + + if (!prefetch_enabled()) + goto out; + + spin_lock_irqsave(&swapped.lock, flags); + if (swapped.count >= swapped.maxcount) { + /* + * Once the number of entries exceeds maxcount we start + * removing the least recently used entries. + */ + entry = list_entry(swapped.list.next, + struct swapped_entry, swapped_list); + radix_tree_delete(&swapped.swap_tree, entry->swp_entry.val); + list_del(&entry->swapped_list); + swapped.count--; + } else { + entry = kmem_cache_alloc(swapped.cache, GFP_ATOMIC); + if (unlikely(!entry)) + /* bad, can't allocate more mem */ + goto out_locked; + } + + index = page_private(page); + entry->swp_entry.val = index; + /* + * On numa we need to store the node id to ensure that we prefetch to + * the same node it came from. + */ + store_swap_entry_node(entry, page); + + if (likely(!radix_tree_insert(&swapped.swap_tree, index, entry))) { + list_add(&entry->swapped_list, &swapped.list); + swapped.count++; + } else + kmem_cache_free(swapped.cache, entry); + +out_locked: + spin_unlock_irqrestore(&swapped.lock, flags); +out: + if (!kprefetchd_awake) + wake_up_process(kprefetchd_task); + return; +} + +/* + * Removes entries from the swapped_list. The radix tree allows us to quickly + * look up the entry from the index without having to iterate over the whole + * list. 
+ */ +static void remove_from_swapped_list(const unsigned long index) +{ + struct swapped_entry *entry; + unsigned long flags; + + spin_lock_irqsave(&swapped.lock, flags); + entry = radix_tree_delete(&swapped.swap_tree, index); + if (likely(entry)) { + list_del(&entry->swapped_list); + swapped.count--; + kmem_cache_free(swapped.cache, entry); + } + spin_unlock_irqrestore(&swapped.lock, flags); +} + +enum trickle_return { + TRICKLE_SUCCESS, + TRICKLE_FAILED, + TRICKLE_DELAY, +}; + +struct node_stats { + /* Free ram after a cycle of prefetching */ + unsigned long last_free; + /* Free ram on this cycle of checking prefetch_suitable */ + unsigned long current_free; + /* The amount of free ram before we start prefetching */ + unsigned long highfree[MAX_NR_ZONES]; + /* The amount of free ram where we will stop prefetching */ + unsigned long lowfree[MAX_NR_ZONES]; + /* highfree or lowfree depending on whether we've hit a watermark */ + unsigned long *pointfree[MAX_NR_ZONES]; +}; + +/* + * prefetch_stats stores the free ram data of each node and this is used to + * determine if a node is suitable for prefetching into. + */ +struct prefetch_stats { + /* Which nodes are currently suited to prefetching */ + nodemask_t prefetch_nodes; + /* Total pages we've prefetched on this wakeup of kprefetchd */ + unsigned long prefetched_pages; + struct node_stats node[MAX_NUMNODES]; +}; + +static struct prefetch_stats sp_stat; + +/* + * This tries to read a swp_entry_t into swap cache for swap prefetching. + * If it returns TRICKLE_DELAY we should delay further prefetching. + */ +static enum trickle_return trickle_swap_cache_async(const swp_entry_t entry, + const int node) +{ + enum trickle_return ret = TRICKLE_FAILED; + unsigned long flags; + struct page *page; + + read_lock_irqsave(&swapper_space.tree_lock, flags); + /* Entry may already exist */ + page = radix_tree_lookup(&swapper_space.page_tree, entry.val); + read_unlock_irqrestore(&swapper_space.tree_lock, flags); + if (page) + goto out; + + /* + * Get a new page to read from swap. We have already checked the + * watermarks so __alloc_pages will not call on reclaim. + */ + page = alloc_pages_node(node, GFP_HIGHUSER & ~__GFP_WAIT, 0); + if (unlikely(!page)) { + ret = TRICKLE_DELAY; + goto out; + } + + if (add_to_swap_cache(page, entry)) { + /* Failed to add to swap cache */ + goto out_release; + } + + /* Add them to the tail of the inactive list to preserve LRU order */ + lru_cache_add_tail(page); + if (unlikely(swap_readpage(NULL, page))) + goto out_release; + + sp_stat.prefetched_pages++; + sp_stat.node[node].last_free--; + + ret = TRICKLE_SUCCESS; +out_release: + page_cache_release(page); +out: + /* + * All entries are removed here lazily. This avoids the cost of + * remove_from_swapped_list during normal swapin. Thus there are + * usually many stale entries. + */ + remove_from_swapped_list(entry.val); + return ret; +} + +static void clear_last_prefetch_free(void) +{ + int node; + + /* + * Reset the nodes suitable for prefetching to all nodes. We could + * update the data to take into account memory hotplug if desired.. 
+ */ + sp_stat.prefetch_nodes = node_online_map; + for_each_node_mask(node, sp_stat.prefetch_nodes) { + struct node_stats *ns = &sp_stat.node[node]; + + ns->last_free = 0; + } +} + +static void clear_current_prefetch_free(void) +{ + int node; + + sp_stat.prefetch_nodes = node_online_map; + for_each_node_mask(node, sp_stat.prefetch_nodes) { + struct node_stats *ns = &sp_stat.node[node]; + + ns->current_free = 0; + } +} + +/* + * This updates the high and low watermarks of amount of free ram in each + * node used to start and stop prefetching. We prefetch from pages_high * 4 + * down to pages_high * 3. + */ +static void examine_free_limits(void) +{ + struct zone *z; + + for_each_zone(z) { + struct node_stats *ns; + int idx; + + if (!populated_zone(z)) + continue; + + ns = &sp_stat.node[zone_to_nid(z)]; + idx = zone_idx(z); + ns->lowfree[idx] = z->pages_high * 3; + ns->highfree[idx] = ns->lowfree[idx] + z->pages_high; + + if (zone_page_state(z, NR_FREE_PAGES) > ns->highfree[idx]) { + /* + * We've gotten above the high watermark of free pages + * so we can start prefetching till we get to the low + * watermark. + */ + ns->pointfree[idx] = &ns->lowfree[idx]; + } + } +} + +/* + * We want to be absolutely certain it's ok to start prefetching. + */ +static enum trickle_return prefetch_suitable(void) +{ + enum trickle_return ret = TRICKLE_DELAY; + struct zone *z; + int node; + + /* + * If swap_prefetch is set to a high value we can ignore load + * and prefetch whenever we can. Otherwise we test for vm and + * cpu activity. + */ + if (swap_prefetch < 3) { + /* Purposefully racy, may return false positive */ + if (test_bit(0, &swapped.busy)) { + __clear_bit(0, &swapped.busy); + goto out; + } + + /* + * above_background_load is expensive so we only perform it + * every SWAP_CLUSTER_MAX prefetched_pages. + * We test to see if we're above_background_load as disk + * activity even at low priority can cause interrupt induced + * scheduling latencies. + */ + if (!(sp_stat.prefetched_pages % SWAP_CLUSTER_MAX) && + above_background_load()) + goto out; + } + clear_current_prefetch_free(); + + /* + * Have some hysteresis between where page reclaiming and prefetching + * will occur to prevent ping-ponging between them. + */ + for_each_zone(z) { + struct node_stats *ns; + unsigned long free; + int idx; + + if (!populated_zone(z)) + continue; + + node = zone_to_nid(z); + ns = &sp_stat.node[node]; + idx = zone_idx(z); + + free = zone_page_state(z, NR_FREE_PAGES); + if (free < *ns->pointfree[idx]) { + /* + * Free pages have dropped below the low watermark so + * we won't start prefetching again till we hit the + * high watermark of free pages. + */ + ns->pointfree[idx] = &ns->highfree[idx]; + node_clear(node, sp_stat.prefetch_nodes); + continue; + } + ns->current_free += free; + } + + /* + * We iterate over each node testing to see if it is suitable for + * prefetching and clear the nodemask if it is not. 
+ */ + for_each_node_mask(node, sp_stat.prefetch_nodes) { + struct node_stats *ns = &sp_stat.node[node]; + + /* + * We check to see that pages are not being allocated + * elsewhere at any significant rate implying any + * degree of memory pressure (eg during file reads) + */ + if (ns->last_free) { + if (ns->current_free + SWAP_CLUSTER_MAX < + ns->last_free) { + ns->last_free = ns->current_free; + node_clear(node, + sp_stat.prefetch_nodes); + continue; + } + } else + ns->last_free = ns->current_free; + + /* We shouldn't prefetch when we are doing writeback */ + if (node_page_state(node, NR_WRITEBACK)) + node_clear(node, sp_stat.prefetch_nodes); + } + + /* Nothing suitable, put kprefetchd back to sleep */ + if (nodes_empty(sp_stat.prefetch_nodes)) + return TRICKLE_FAILED; + + /* Survived all that? Hooray we can prefetch! */ + ret = TRICKLE_SUCCESS; +out: + return ret; +} + +/* + * trickle_swap is the main function that initiates the swap prefetching. It + * first checks to see if the busy flag is set, and does not prefetch if it + * is, as the flag implied we are low on memory or swapping in currently. + * Otherwise it runs until prefetch_suitable fails which occurs when the + * vm is busy, we prefetch to the watermark, the list is empty or we have + * iterated over all entries once. + */ +static enum trickle_return trickle_swap(void) +{ + enum trickle_return suitable, ret = TRICKLE_DELAY; + struct swapped_entry *pos, *n; + unsigned long flags; + + if (!prefetch_enabled()) + return ret; + + examine_free_limits(); + suitable = prefetch_suitable(); + if (suitable != TRICKLE_SUCCESS) + return suitable; + if (list_empty(&swapped.list)) { + kprefetchd_awake = 0; + return TRICKLE_FAILED; + } + + spin_lock_irqsave(&swapped.lock, flags); + list_for_each_entry_safe_reverse(pos, n, &swapped.list, swapped_list) { + swp_entry_t swp_entry; + int node; + + spin_unlock_irqrestore(&swapped.lock, flags); + cond_resched(); + suitable = prefetch_suitable(); + if (suitable != TRICKLE_SUCCESS) { + ret = suitable; + goto out_unlocked; + } + + spin_lock_irqsave(&swapped.lock, flags); + if (unlikely(!pos)) + continue; + node = get_swap_entry_node(pos); + if (!node_isset(node, sp_stat.prefetch_nodes)) { + /* + * We found an entry that belongs to a node that is + * not suitable for prefetching so skip it. 
+ */ + continue; + } + swp_entry = pos->swp_entry; + spin_unlock_irqrestore(&swapped.lock, flags); + + if (trickle_swap_cache_async(swp_entry, node) == TRICKLE_DELAY) + goto out_unlocked; + spin_lock_irqsave(&swapped.lock, flags); + } + spin_unlock_irqrestore(&swapped.lock, flags); + +out_unlocked: + if (sp_stat.prefetched_pages) { + lru_add_drain(); + sp_stat.prefetched_pages = 0; + } + return ret; +} + +static int kprefetchd(void *__unused) +{ + struct sched_param param = { .sched_priority = 0 }; + + sched_setscheduler(current, SCHED_BATCH, &param); + set_user_nice(current, 19); + /* Set ioprio to lowest if supported by i/o scheduler */ + sys_ioprio_set(IOPRIO_WHO_PROCESS, IOPRIO_BE_NR - 1, IOPRIO_CLASS_BE); + + while (!kthread_should_stop()) { + try_to_freeze(); + + if (!kprefetchd_awake) { + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + kprefetchd_awake = 1; + } + + if (trickle_swap() == TRICKLE_FAILED) + schedule_timeout_interruptible(PREFETCH_SLEEP); + else + schedule_timeout_interruptible(PREFETCH_DELAY); + clear_last_prefetch_free(); + } + return 0; +} + +/* + * Create kmem cache for swapped entries + */ +void __init prepare_swap_prefetch(void) +{ + struct zone *zone; + + swapped.cache = kmem_cache_create("swapped_entry", + sizeof(struct swapped_entry), 0, SLAB_PANIC, NULL, NULL); + + /* + * We set the limit to more entries than the physical ram. + * We remove entries lazily so we need some headroom. + */ + swapped.maxcount = nr_free_pagecache_pages() * 2; + + for_each_zone(zone) { + struct node_stats *ns; + int idx; + + if (!populated_zone(zone)) + continue; + + ns = &sp_stat.node[zone_to_nid(zone)]; + idx = zone_idx(zone); + ns->pointfree[idx] = &ns->highfree[idx]; + } +} + +static int __init kprefetchd_init(void) +{ + kprefetchd_task = kthread_run(kprefetchd, NULL, "kprefetchd"); + + return 0; +} + +static void __exit kprefetchd_exit(void) +{ + kthread_stop(kprefetchd_task); +} + +module_init(kprefetchd_init); +module_exit(kprefetchd_exit); Index: linux-2.6.22-ck1/mm/swap_state.c =================================================================== --- linux-2.6.22-ck1.orig/mm/swap_state.c 2007-07-10 14:55:00.000000000 +1000 +++ linux-2.6.22-ck1/mm/swap_state.c 2007-07-10 14:55:22.000000000 +1000 @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -95,7 +96,7 @@ return error; } -static int add_to_swap_cache(struct page *page, swp_entry_t entry) +int add_to_swap_cache(struct page *page, swp_entry_t entry) { int error; @@ -148,6 +149,9 @@ swp_entry_t entry; int err; + /* Swap prefetching is delayed if we're swapping pages */ + delay_swap_prefetch(); + BUG_ON(!PageLocked(page)); for (;;) { @@ -320,6 +324,9 @@ struct page *found_page, *new_page = NULL; int err; + /* Swap prefetching is delayed if we're already reading from swap */ + delay_swap_prefetch(); + do { /* * First check the swap cache. Since this is normally Index: linux-2.6.22-ck1/mm/vmscan.c =================================================================== --- linux-2.6.22-ck1.orig/mm/vmscan.c 2007-07-10 14:55:00.000000000 +1000 +++ linux-2.6.22-ck1/mm/vmscan.c 2007-07-10 14:55:23.000000000 +1000 @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -36,6 +37,7 @@ #include #include #include +#include #include #include @@ -63,7 +65,7 @@ * whole list at once. */ int swap_cluster_max; - int swappiness; + int mapped; int all_unreclaimable; }; @@ -110,9 +112,10 @@ #endif /* - * From 0 .. 100. Higher means more swappy. + * From 0 .. 100. 
Lower means more swappy. */ -int vm_swappiness = 60; +int vm_mapped __read_mostly = 66; +int vm_hardmaplimit __read_mostly = 1; long vm_total_pages; /* The total number of pages which the VM controls */ static LIST_HEAD(shrinker_list); @@ -803,10 +806,14 @@ * The distress ratio is important - we don't want to start * going oom. * - * A 100% value of vm_swappiness overrides this algorithm - * altogether. + * This distress value is ignored if we apply a hardmaplimit except + * in extreme distress. + * + * A 0% value of vm_mapped overrides this algorithm altogether. */ - swap_tendency = mapped_ratio / 2 + distress + sc->swappiness; + swap_tendency = mapped_ratio * 100 / (sc->mapped + 1); + if (!vm_hardmaplimit || distress == 100) + swap_tendency += distress; /* * Now use this metric to decide whether to start moving mapped @@ -955,6 +962,41 @@ } /* + * Helper functions to adjust nice level of kswapd, based on the priority of + * the task (p) that called it. If it is already higher priority we do not + * demote its nice level since it is still working on behalf of a higher + * priority task. With kernel threads we leave it at nice 0. + * + * We don't ever run kswapd real time, so if a real time task calls kswapd we + * set it to highest SCHED_NORMAL priority. + */ +static int effective_sc_prio(struct task_struct *p) +{ + if (likely(p->mm)) { + if (rt_task(p)) + return -20; + if (idleprio_task(p)) + return 19; + return task_nice(p); + } + return 0; +} + +static void set_kswapd_nice(struct task_struct *kswapd, struct task_struct *p, + int active) +{ + long nice = effective_sc_prio(p); + + if (task_nice(kswapd) > nice || !active) + set_user_nice(kswapd, nice); +} + +static int sc_priority(struct task_struct *p) +{ + return (DEF_PRIORITY + (DEF_PRIORITY * effective_sc_prio(p) / 40)); +} + +/* * This is the direct reclaim path, for page-allocating processes. We only * try to reclaim pages from zones which will satisfy the caller's allocation * request. @@ -1011,7 +1053,8 @@ * holds filesystem locks which prevent writeout this might not work, and the * allocation attempt will fail. */ -unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask) +unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask, + struct task_struct *p) { int priority; int ret = 0; @@ -1019,15 +1062,20 @@ unsigned long nr_reclaimed = 0; struct reclaim_state *reclaim_state = current->reclaim_state; unsigned long lru_pages = 0; - int i; + int i, scan_priority = DEF_PRIORITY; struct scan_control sc = { .gfp_mask = gfp_mask, .may_writepage = !laptop_mode, .swap_cluster_max = SWAP_CLUSTER_MAX, .may_swap = 1, - .swappiness = vm_swappiness, + .mapped = vm_mapped, }; + if (p) + scan_priority = sc_priority(p); + + delay_swap_prefetch(); + count_vm_event(ALLOCSTALL); for (i = 0; zones[i] != NULL; i++) { @@ -1040,7 +1088,7 @@ + zone_page_state(zone, NR_INACTIVE); } - for (priority = DEF_PRIORITY; priority >= 0; priority--) { + for (priority = scan_priority; priority >= 0; priority--) { sc.nr_scanned = 0; if (!priority) disable_swap_token(); @@ -1070,7 +1118,7 @@ } /* Take a nap, wait for some writeback to complete */ - if (sc.nr_scanned && priority < DEF_PRIORITY - 2) + if (sc.nr_scanned && priority < scan_priority - 2) congestion_wait(WRITE, HZ/10); } /* top priority shrink_caches still had more to do? 
don't OOM, then */ @@ -1120,9 +1168,9 @@ */ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) { - int all_zones_ok; + int all_zones_ok = 0; int priority; - int i; + int i, scan_priority; unsigned long total_scanned; unsigned long nr_reclaimed; struct reclaim_state *reclaim_state = current->reclaim_state; @@ -1130,7 +1178,7 @@ .gfp_mask = GFP_KERNEL, .may_swap = 1, .swap_cluster_max = SWAP_CLUSTER_MAX, - .swappiness = vm_swappiness, + .mapped = vm_mapped, }; /* * temp_priority is used to remember the scanning priority at which @@ -1138,6 +1186,8 @@ */ int temp_priority[MAX_NR_ZONES]; + scan_priority = sc_priority(pgdat->kswapd); + loop_again: total_scanned = 0; nr_reclaimed = 0; @@ -1145,9 +1195,9 @@ count_vm_event(PAGEOUTRUN); for (i = 0; i < pgdat->nr_zones; i++) - temp_priority[i] = DEF_PRIORITY; + temp_priority[i] = scan_priority; - for (priority = DEF_PRIORITY; priority >= 0; priority--) { + for (priority = scan_priority; priority >= 0; priority--) { int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ unsigned long lru_pages = 0; @@ -1163,15 +1213,22 @@ */ for (i = pgdat->nr_zones - 1; i >= 0; i--) { struct zone *zone = pgdat->node_zones + i; + unsigned long watermark; if (!populated_zone(zone)) continue; - if (zone->all_unreclaimable && priority != DEF_PRIORITY) + if (zone->all_unreclaimable && priority != scan_priority) continue; - if (!zone_watermark_ok(zone, order, zone->pages_high, - 0, 0)) { + /* + * The watermark is relaxed depending on the + * level of "priority" till it drops to + * pages_high. + */ + watermark = zone->pages_high + (zone->pages_high * + priority / scan_priority); + if (!zone_watermark_ok(zone, order, watermark, 0, 0)) { end_zone = i; break; } @@ -1198,14 +1255,18 @@ for (i = 0; i <= end_zone; i++) { struct zone *zone = pgdat->node_zones + i; int nr_slab; + unsigned long watermark; if (!populated_zone(zone)) continue; - if (zone->all_unreclaimable && priority != DEF_PRIORITY) + if (zone->all_unreclaimable && priority != scan_priority) continue; - if (!zone_watermark_ok(zone, order, zone->pages_high, + watermark = zone->pages_high + (zone->pages_high * + priority / scan_priority); + + if (!zone_watermark_ok(zone, order, watermark, end_zone, 0)) all_zones_ok = 0; temp_priority[i] = priority; @@ -1238,7 +1299,7 @@ * OK, kswapd is getting into trouble. Take a nap, then take * another pass across the zones. */ - if (total_scanned && priority < DEF_PRIORITY - 2) + if (total_scanned && priority < scan_priority - 2) congestion_wait(WRITE, HZ/10); /* @@ -1272,6 +1333,8 @@ return nr_reclaimed; } +#define WT_EXPIRY (HZ * 5) /* Time to wakeup watermark_timer */ + /* * The background pageout daemon, started as a kernel thread * from the init process. @@ -1319,6 +1382,8 @@ for ( ; ; ) { unsigned long new_order; + /* kswapd has been busy so delay watermark_timer */ + mod_timer(&pgdat->watermark_timer, jiffies + WT_EXPIRY); prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); new_order = pgdat->kswapd_max_order; pgdat->kswapd_max_order = 0; @@ -1332,6 +1397,7 @@ if (!freezing(current)) schedule(); + set_user_nice(tsk, 0); order = pgdat->kswapd_max_order; } finish_wait(&pgdat->kswapd_wait, &wait); @@ -1349,9 +1415,10 @@ /* * A zone is low on free memory, so wake its kswapd task to service it. 
*/ -void wakeup_kswapd(struct zone *zone, int order) +void wakeup_kswapd(struct zone *zone, int order, struct task_struct *p) { pg_data_t *pgdat; + int active; if (!populated_zone(zone)) return; @@ -1363,7 +1430,9 @@ pgdat->kswapd_max_order = order; if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) return; - if (!waitqueue_active(&pgdat->kswapd_wait)) + active = waitqueue_active(&pgdat->kswapd_wait); + set_kswapd_nice(pgdat->kswapd, p, active); + if (!active) return; wake_up_interruptible(&pgdat->kswapd_wait); } @@ -1382,6 +1451,8 @@ struct zone *zone; unsigned long nr_to_scan, ret = 0; + delay_swap_prefetch(); + for_each_zone(zone) { if (!populated_zone(zone)) @@ -1441,7 +1512,7 @@ .may_swap = 0, .swap_cluster_max = nr_pages, .may_writepage = 1, - .swappiness = vm_swappiness, + .mapped = vm_mapped, }; current->reclaim_state = &reclaim_state; @@ -1476,7 +1547,7 @@ /* Force reclaiming mapped pages in the passes #3 and #4 */ if (pass > 2) { sc.may_swap = 1; - sc.swappiness = 100; + sc.mapped = 0; } for (prio = DEF_PRIORITY; prio >= 0; prio--) { @@ -1540,20 +1611,57 @@ } /* + * We wake up kswapd every WT_EXPIRY till free ram is above pages_lots + */ +static void watermark_wakeup(unsigned long data) +{ + pg_data_t *pgdat = (pg_data_t *)data; + struct timer_list *wt = &pgdat->watermark_timer; + int i; + + if (!waitqueue_active(&pgdat->kswapd_wait) || above_background_load()) + goto out; + for (i = pgdat->nr_zones - 1; i >= 0; i--) { + struct zone *z = pgdat->node_zones + i; + + if (!populated_zone(z) || is_highmem(z)) { + /* We are better off leaving highmem full */ + continue; + } + if (!zone_watermark_ok(z, 0, z->pages_lots, 0, 0)) { + wake_up_interruptible(&pgdat->kswapd_wait); + goto out; + } + } +out: + mod_timer(wt, jiffies + WT_EXPIRY); + return; +} + +/* * This kswapd start function will be called by init and node-hot-add. * On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added. 
*/ int kswapd_run(int nid) { pg_data_t *pgdat = NODE_DATA(nid); + struct timer_list *wt; int ret = 0; if (pgdat->kswapd) return 0; + wt = &pgdat->watermark_timer; + init_timer(wt); + wt->data = (unsigned long)pgdat; + wt->function = watermark_wakeup; + wt->expires = jiffies + WT_EXPIRY; + add_timer(wt); + pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid); if (IS_ERR(pgdat->kswapd)) { /* failure at boot is fatal */ + del_timer(wt); BUG_ON(system_state == SYSTEM_BOOTING); printk("Failed to start kswapd on node %d\n",nid); ret = -1; @@ -1624,7 +1732,7 @@ .swap_cluster_max = max_t(unsigned long, nr_pages, SWAP_CLUSTER_MAX), .gfp_mask = gfp_mask, - .swappiness = vm_swappiness, + .mapped = vm_mapped, }; unsigned long slab_reclaimable; Index: linux-2.6.22-ck1/include/linux/mm_inline.h =================================================================== --- linux-2.6.22-ck1.orig/include/linux/mm_inline.h 2007-07-10 14:54:59.000000000 +1000 +++ linux-2.6.22-ck1/include/linux/mm_inline.h 2007-07-10 14:55:22.000000000 +1000 @@ -13,6 +13,13 @@ } static inline void +add_page_to_inactive_list_tail(struct zone *zone, struct page *page) +{ + list_add_tail(&page->lru, &zone->inactive_list); + __inc_zone_state(zone, NR_INACTIVE); +} + +static inline void del_page_from_active_list(struct zone *zone, struct page *page) { list_del(&page->lru); Index: linux-2.6.22-ck1/include/linux/swap-prefetch.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.22-ck1/include/linux/swap-prefetch.h 2007-07-10 14:55:22.000000000 +1000 @@ -0,0 +1,53 @@ +#ifndef SWAP_PREFETCH_H_INCLUDED +#define SWAP_PREFETCH_H_INCLUDED + +#ifdef CONFIG_SWAP_PREFETCH +/* mm/swap_prefetch.c */ +extern int swap_prefetch; +extern int swap_prefetch_delay; +extern int swap_prefetch_sleep; + +struct swapped_entry { + swp_entry_t swp_entry; /* The actual swap entry */ + struct list_head swapped_list; /* Linked list of entries */ +#if MAX_NUMNODES > 1 + int node; /* Node id */ +#endif +} __attribute__((packed)); + +static inline void store_swap_entry_node(struct swapped_entry *entry, + struct page *page) +{ +#if MAX_NUMNODES > 1 + entry->node = page_to_nid(page); +#endif +} + +static inline int get_swap_entry_node(struct swapped_entry *entry) +{ +#if MAX_NUMNODES > 1 + return entry->node; +#else + return 0; +#endif +} + +extern void add_to_swapped_list(struct page *page); +extern void delay_swap_prefetch(void); +extern void prepare_swap_prefetch(void); + +#else /* CONFIG_SWAP_PREFETCH */ +static inline void add_to_swapped_list(struct page *__unused) +{ +} + +static inline void prepare_swap_prefetch(void) +{ +} + +static inline void delay_swap_prefetch(void) +{ +} +#endif /* CONFIG_SWAP_PREFETCH */ + +#endif /* SWAP_PREFETCH_H_INCLUDED */ Index: linux-2.6.22-ck1/mm/page_io.c =================================================================== --- linux-2.6.22-ck1.orig/mm/page_io.c 2007-07-10 14:55:00.000000000 +1000 +++ linux-2.6.22-ck1/mm/page_io.c 2007-07-10 14:55:22.000000000 +1000 @@ -17,6 +17,7 @@ #include #include #include +#include #include static struct bio *get_swap_bio(gfp_t gfp_flags, pgoff_t index, @@ -118,6 +119,7 @@ ret = -ENOMEM; goto out; } + add_to_swapped_list(page); if (wbc->sync_mode == WB_SYNC_ALL) rw |= (1 << BIO_RW_SYNC); count_vm_event(PSWPOUT); Index: linux-2.6.22-ck1/include/linux/sysctl.h =================================================================== --- linux-2.6.22-ck1.orig/include/linux/sysctl.h 2007-07-10 14:54:59.000000000 +1000 
+++ linux-2.6.22-ck1/include/linux/sysctl.h 2007-07-10 14:55:22.000000000 +1000 @@ -190,7 +190,7 @@ VM_OVERCOMMIT_RATIO=16, /* percent of RAM to allow overcommit in */ VM_PAGEBUF=17, /* struct: Control pagebuf parameters */ VM_HUGETLB_PAGES=18, /* int: Number of available Huge Pages */ - VM_SWAPPINESS=19, /* Tendency to steal mapped memory */ + VM_UNUSED19=19, /* was: Tendency to steal mapped memory */ VM_LOWMEM_RESERVE_RATIO=20,/* reservation ratio for lower memory zones */ VM_MIN_FREE_KBYTES=21, /* Minimum free kilobytes to maintain */ VM_MAX_MAP_COUNT=22, /* int: Maximum number of mmaps/address-space */ Index: linux-2.6.22-ck1/include/linux/mmzone.h =================================================================== --- linux-2.6.22-ck1.orig/include/linux/mmzone.h 2007-07-10 14:54:59.000000000 +1000 +++ linux-2.6.22-ck1/include/linux/mmzone.h 2007-07-10 14:55:23.000000000 +1000 @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -181,7 +182,7 @@ struct zone { /* Fields commonly accessed by the page allocator */ - unsigned long pages_min, pages_low, pages_high; + unsigned long pages_min, pages_low, pages_high, pages_lots; /* * We don't know if the memory that we're going to allocate will be freeable * or/and it will be released eventually, so to avoid totally wasting several @@ -452,6 +453,7 @@ wait_queue_head_t kswapd_wait; struct task_struct *kswapd; int kswapd_max_order; + struct timer_list watermark_timer; } pg_data_t; #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) @@ -468,7 +470,7 @@ void get_zone_counts(unsigned long *active, unsigned long *inactive, unsigned long *free); void build_all_zonelists(void); -void wakeup_kswapd(struct zone *zone, int order); +void wakeup_kswapd(struct zone *zone, int order, struct task_struct *p); int zone_watermark_ok(struct zone *z, int order, unsigned long mark, int classzone_idx, int alloc_flags); enum memmap_context { Index: linux-2.6.22-ck1/mm/page_alloc.c =================================================================== --- linux-2.6.22-ck1.orig/mm/page_alloc.c 2007-07-10 14:54:59.000000000 +1000 +++ linux-2.6.22-ck1/mm/page_alloc.c 2007-07-10 14:55:22.000000000 +1000 @@ -1250,7 +1250,7 @@ goto nopage; for (z = zonelist->zones; *z; z++) - wakeup_kswapd(*z, order); + wakeup_kswapd(*z, order, p); /* * OK, we're below the kswapd watermark and have kicked background @@ -1314,7 +1314,7 @@ reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; - did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask); + did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask, p); p->reclaim_state = NULL; p->flags &= ~PF_MEMALLOC; @@ -1570,6 +1570,7 @@ " min:%lukB" " low:%lukB" " high:%lukB" + " lots:%lukB" " active:%lukB" " inactive:%lukB" " present:%lukB" @@ -1581,6 +1582,7 @@ K(zone->pages_min), K(zone->pages_low), K(zone->pages_high), + K(zone->pages_lots), K(zone_page_state(zone, NR_ACTIVE)), K(zone_page_state(zone, NR_INACTIVE)), K(zone->present_pages), @@ -3142,6 +3144,7 @@ zone->pages_low = zone->pages_min + (tmp >> 2); zone->pages_high = zone->pages_min + (tmp >> 1); + zone->pages_lots = zone->pages_min + tmp; spin_unlock_irqrestore(&zone->lru_lock, flags); } Index: linux-2.6.22-ck1/fs/buffer.c =================================================================== --- linux-2.6.22-ck1.orig/fs/buffer.c 2007-07-10 14:54:59.000000000 +1000 +++ linux-2.6.22-ck1/fs/buffer.c 2007-07-10 14:55:22.000000000 +1000 @@ -356,7 +356,7 @@ for_each_online_pgdat(pgdat) { zones = 
pgdat->node_zonelists[gfp_zone(GFP_NOFS)].zones; if (*zones) - try_to_free_pages(zones, GFP_NOFS); + try_to_free_pages(zones, GFP_NOFS, NULL); } } Index: linux-2.6.22-ck1/mm/filemap.c =================================================================== --- linux-2.6.22-ck1.orig/mm/filemap.c 2007-07-10 14:54:59.000000000 +1000 +++ linux-2.6.22-ck1/mm/filemap.c 2007-07-10 14:55:23.000000000 +1000 @@ -466,6 +466,16 @@ return ret; } +int add_to_page_cache_lru_tail(struct page *page, + struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask) +{ + int ret = add_to_page_cache(page, mapping, offset, gfp_mask); + + if (ret == 0) + lru_cache_add_tail(page); + return ret; +} + #ifdef CONFIG_NUMA struct page *__page_cache_alloc(gfp_t gfp) { @@ -839,6 +849,34 @@ ra->ra_pages /= 4; } +/* + * Sysctl which determines whether we should read from large files to the + * tail of the inactive lru list. + */ +int vm_tail_largefiles __read_mostly = 1; + +static inline int nr_mapped(void) +{ + return global_page_state(NR_FILE_MAPPED) + + global_page_state(NR_ANON_PAGES); +} + +/* + * This examines how large in pages a file size is and returns 1 if it is + * more than half the unmapped ram. Avoid doing read_page_state which is + * expensive unless we already know it is likely to be large enough. + */ +static int large_isize(unsigned long nr_pages) +{ + if (nr_pages * 6 > vm_total_pages) { + unsigned long unmapped_ram = vm_total_pages - nr_mapped(); + + if (nr_pages * 2 > unmapped_ram) + return 1; + } + return 0; +} + /** * do_generic_mapping_read - generic file read routine * @mapping: address_space to be read @@ -1051,8 +1089,19 @@ goto out; } } - error = add_to_page_cache_lru(cached_page, mapping, - index, GFP_KERNEL); + + /* + * If we know the file is large we add the pages read to the + * end of the lru as we're unlikely to be able to cache the + * whole file in ram so make those pages the first to be + * dropped if not referenced soon. + */ + if (vm_tail_largefiles && large_isize(end_index)) + error = add_to_page_cache_lru_tail(cached_page, + mapping, index, GFP_KERNEL); + else + error = add_to_page_cache_lru(cached_page, mapping, + index, GFP_KERNEL); if (error) { if (error == -EEXIST) goto find_page; Index: linux-2.6.22-ck1/Documentation/filesystems/proc.txt =================================================================== --- linux-2.6.22-ck1.orig/Documentation/filesystems/proc.txt 2007-07-10 14:54:59.000000000 +1000 +++ linux-2.6.22-ck1/Documentation/filesystems/proc.txt 2007-07-10 14:55:23.000000000 +1000 @@ -1333,6 +1333,14 @@ As this is a non-destructive operation and dirty objects are not freeable, the user should run `sync' first. +tail_largefiles +--------------- + +When enabled reads from large files to the tail end of the inactive lru list. +This means that any cache from reading large files is dropped very quickly, +preventing loss of mapped ram and useful pagecache when large files are read. +This does, however, make caching less effective when working with large files. 
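Editor's note on the filemap.c hunk above: large_isize() treats a file as "large" when its size in pages exceeds half of the currently unmapped ram, guarded by a cheap pre-check that the file covers more than a sixth of all ram. The standalone program below is only an illustration of that arithmetic; total_pages, mapped_pages and file_pages are made-up sample values, not numbers taken from the patch.

#include <stdio.h>

int main(void)
{
	unsigned long total_pages = 262144;	/* assumed: 1GB of 4kB pages */
	unsigned long mapped_pages = 98304;	/* assumed: anon + file-mapped pages */
	unsigned long unmapped = total_pages - mapped_pages;
	unsigned long file_pages = 90000;	/* assumed: size of the file being read */

	/*
	 * Mirrors the large_isize() test: cheap pre-check that the file
	 * covers more than a sixth of all ram, then the real test that it
	 * covers more than half of the unmapped ram.
	 */
	if (file_pages * 6 > total_pages && file_pages * 2 > unmapped)
		printf("large file: read pages go to the tail of the inactive list\n");
	else
		printf("normal file: read pages go to the head of the inactive list\n");

	return 0;
}

With these sample numbers the file qualifies as large, so its pages would be added with add_to_page_cache_lru_tail() and become the first candidates for reclaim. In the kernel code the second comparison is nested inside the first so the global page-state counters are only summed when the file is plausibly large; here both values are just local constants.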
+ 2.5 /proc/sys/dev - Device specific parameters ---------------------------------------------- Index: linux-2.6.22-ck1/arch/i386/Kconfig =================================================================== --- linux-2.6.22-ck1.orig/arch/i386/Kconfig 2007-07-10 14:54:59.000000000 +1000 +++ linux-2.6.22-ck1/arch/i386/Kconfig 2007-07-10 14:55:23.000000000 +1000 @@ -550,7 +550,7 @@ choice depends on EXPERIMENTAL - prompt "Memory split" if EMBEDDED + prompt "Memory split" default VMSPLIT_3G help Select the desired split between kernel and user memory. @@ -569,17 +569,17 @@ option alone! config VMSPLIT_3G - bool "3G/1G user/kernel split" + bool "Default 896MB lowmem (3G/1G user/kernel split)" config VMSPLIT_3G_OPT depends on !HIGHMEM - bool "3G/1G user/kernel split (for full 1G low memory)" + bool "1GB lowmem (3G/1G user/kernel split)" config VMSPLIT_2G - bool "2G/2G user/kernel split" + bool "2GB lowmem (2G/2G user/kernel split)" config VMSPLIT_2G_OPT depends on !HIGHMEM - bool "2G/2G user/kernel split (for full 2G low memory)" + bool "2GB lowmem (2G/2G user/kernel split)" config VMSPLIT_1G - bool "1G/3G user/kernel split" + bool "3GB lowmem (1G/3G user/kernel split)" endchoice config PAGE_OFFSET Index: linux-2.6.22-ck1/kernel/Kconfig.hz =================================================================== --- linux-2.6.22-ck1.orig/kernel/Kconfig.hz 2007-07-10 14:54:59.000000000 +1000 +++ linux-2.6.22-ck1/kernel/Kconfig.hz 2007-07-10 14:55:24.000000000 +1000 @@ -4,7 +4,7 @@ choice prompt "Timer frequency" - default HZ_250 + default HZ_1000 help Allows the configuration of the timer frequency. It is customary to have the timer interrupt run at 1000 Hz but 100 Hz may be more @@ -13,8 +13,7 @@ contention and cacheline bounces as a result of timer interrupts. Note that the timer interrupt occurs on each processor in an SMP environment leading to NR_CPUS * HZ number of timer interrupts - per second. - + per second.Laptops may also show improved battery life. config HZ_100 bool "100 HZ" @@ -23,13 +22,14 @@ with lots of processors that may show reduced performance if too many timer interrupts are occurring. - config HZ_250 + config HZ_250_NODEFAULT bool "250 HZ" help - 250 Hz is a good compromise choice allowing server performance - while also showing good interactive responsiveness even - on SMP and NUMA systems. If you are going to be using NTSC video - or multimedia, selected 300Hz instead. + 250 HZ is a lousy compromise choice allowing server interactivity + while also showing desktop throughput and no extra power saving on + laptops. Good for when you can't make up your mind. + + Recommend 100 or 1000 instead. config HZ_300 bool "300 HZ" @@ -45,12 +45,76 @@ 1000 Hz is the preferred choice for desktop systems and other systems requiring fast interactive responses to events. + config HZ_1500 + bool "1500 HZ" + help + 1500 Hz is an insane value to use to run broken software that is Hz + limited. + + Being over 1000, driver breakage is likely. + + config HZ_2000 + bool "2000 HZ" + help + 2000 Hz is an insane value to use to run broken software that is Hz + limited. + + Being over 1000, driver breakage is likely. + + config HZ_3000 + bool "3000 HZ" + help + 3000 Hz is an insane value to use to run broken software that is Hz + limited. + + Being over 1000, driver breakage is likely. + + config HZ_4000 + bool "4000 HZ" + help + 4000 Hz is an insane value to use to run broken software that is Hz + limited. + + Being over 1000, driver breakage is likely. 
+ + config HZ_5000 + bool "5000 HZ" + help + 5000 Hz is an obscene value to use to run broken software that is Hz + limited. + + Being over 1000, driver breakage is likely. + + config HZ_7500 + bool "7500 HZ" + help + 7500 Hz is an obscene value to use to run broken software that is Hz + limited. + + Being over 1000, driver breakage is likely. + + config HZ_10000 + bool "10000 HZ" + help + 10000 Hz is an obscene value to use to run broken software that is Hz + limited. + + Being over 1000, driver breakage is likely. + + endchoice config HZ int default 100 if HZ_100 - default 250 if HZ_250 + default 250 if HZ_250_NODEFAULT default 300 if HZ_300 default 1000 if HZ_1000 + default 1500 if HZ_1500 + default 2000 if HZ_2000 + default 3000 if HZ_3000 + default 4000 if HZ_4000 + default 5000 if HZ_5000 + default 7500 if HZ_7500 + default 10000 if HZ_10000 Index: linux-2.6.22-ck1/arch/i386/defconfig =================================================================== --- linux-2.6.22-ck1.orig/arch/i386/defconfig 2007-07-10 14:54:59.000000000 +1000 +++ linux-2.6.22-ck1/arch/i386/defconfig 2007-07-10 14:55:23.000000000 +1000 @@ -226,10 +226,10 @@ # CONFIG_IRQBALANCE is not set CONFIG_SECCOMP=y # CONFIG_HZ_100 is not set -CONFIG_HZ_250=y +# CONFIG_HZ_250 is not set # CONFIG_HZ_300 is not set -# CONFIG_HZ_1000 is not set -CONFIG_HZ=250 +CONFIG_HZ_1000=y +CONFIG_HZ=1000 # CONFIG_KEXEC is not set # CONFIG_CRASH_DUMP is not set CONFIG_PHYSICAL_START=0x100000 Index: linux-2.6.22-ck1/arch/x86_64/defconfig =================================================================== --- linux-2.6.22-ck1.orig/arch/x86_64/defconfig 2007-07-10 14:54:59.000000000 +1000 +++ linux-2.6.22-ck1/arch/x86_64/defconfig 2007-07-10 14:55:23.000000000 +1000 @@ -185,10 +185,10 @@ CONFIG_SECCOMP=y # CONFIG_CC_STACKPROTECTOR is not set # CONFIG_HZ_100 is not set -CONFIG_HZ_250=y +# CONFIG_HZ_250 is not set # CONFIG_HZ_300 is not set -# CONFIG_HZ_1000 is not set -CONFIG_HZ=250 +CONFIG_HZ_1000=y +CONFIG_HZ=1000 CONFIG_K8_NB=y CONFIG_GENERIC_HARDIRQS=y CONFIG_GENERIC_IRQ_PROBE=y Index: linux-2.6.22-ck1/include/linux/jiffies.h =================================================================== --- linux-2.6.22-ck1.orig/include/linux/jiffies.h 2007-07-10 14:54:59.000000000 +1000 +++ linux-2.6.22-ck1/include/linux/jiffies.h 2007-07-10 14:55:24.000000000 +1000 @@ -29,6 +29,12 @@ # define SHIFT_HZ 9 #elif HZ >= 768 && HZ < 1536 # define SHIFT_HZ 10 +#elif HZ >= 1536 && HZ < 3072 +# define SHIFT_HZ 11 +#elif HZ >= 3072 && HZ < 6144 +# define SHIFT_HZ 12 +#elif HZ >= 6144 && HZ < 12288 +# define SHIFT_HZ 13 #else # error You lose. #endif Index: linux-2.6.22-ck1/include/net/inet_timewait_sock.h =================================================================== --- linux-2.6.22-ck1.orig/include/net/inet_timewait_sock.h 2007-07-10 14:54:59.000000000 +1000 +++ linux-2.6.22-ck1/include/net/inet_timewait_sock.h 2007-07-10 14:55:24.000000000 +1000 @@ -38,8 +38,8 @@ * If time > 4sec, it is "slow" path, no recycling is required, * so that we select tick to get range about 4 seconds. 
*/ -#if HZ <= 16 || HZ > 4096 -# error Unsupported: HZ <= 16 or HZ > 4096 +#if HZ <= 16 || HZ > 16384 +# error Unsupported: HZ <= 16 or HZ > 16384 #elif HZ <= 32 # define INET_TWDR_RECYCLE_TICK (5 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) #elif HZ <= 64 @@ -54,8 +54,12 @@ # define INET_TWDR_RECYCLE_TICK (10 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) #elif HZ <= 2048 # define INET_TWDR_RECYCLE_TICK (11 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) -#else +#elif HZ <= 4096 # define INET_TWDR_RECYCLE_TICK (12 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) +#elif HZ <= 8192 +# define INET_TWDR_RECYCLE_TICK (13 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) +#else +# define INET_TWDR_RECYCLE_TICK (14 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) #endif /* TIME_WAIT reaping mechanism. */ Index: linux-2.6.22-ck1/init/calibrate.c =================================================================== --- linux-2.6.22-ck1.orig/init/calibrate.c 2007-07-10 14:54:59.000000000 +1000 +++ linux-2.6.22-ck1/init/calibrate.c 2007-07-10 14:55:24.000000000 +1000 @@ -122,12 +122,12 @@ printk("Calibrating delay loop (skipped)... " "%lu.%02lu BogoMIPS preset\n", loops_per_jiffy/(500000/HZ), - (loops_per_jiffy/(5000/HZ)) % 100); + (loops_per_jiffy * 10/(50000/HZ)) % 100); } else if ((loops_per_jiffy = calibrate_delay_direct()) != 0) { printk("Calibrating delay using timer specific routine.. "); printk("%lu.%02lu BogoMIPS (lpj=%lu)\n", loops_per_jiffy/(500000/HZ), - (loops_per_jiffy/(5000/HZ)) % 100, + (loops_per_jiffy * 10/(50000/HZ)) % 100, loops_per_jiffy); } else { loops_per_jiffy = (1<<12); @@ -166,7 +166,7 @@ /* Round the value and print it */ printk("%lu.%02lu BogoMIPS (lpj=%lu)\n", loops_per_jiffy/(500000/HZ), - (loops_per_jiffy/(5000/HZ)) % 100, + (loops_per_jiffy * 10/(50000/HZ)) % 100, loops_per_jiffy); } Index: linux-2.6.22-ck1/arch/i386/kernel/cpu/proc.c =================================================================== --- linux-2.6.22-ck1.orig/arch/i386/kernel/cpu/proc.c 2007-07-10 14:54:59.000000000 +1000 +++ linux-2.6.22-ck1/arch/i386/kernel/cpu/proc.c 2007-07-10 14:55:24.000000000 +1000 @@ -157,7 +157,7 @@ seq_printf(m, "\nbogomips\t: %lu.%02lu\n", c->loops_per_jiffy/(500000/HZ), - (c->loops_per_jiffy/(5000/HZ)) % 100); + (c->loops_per_jiffy * 10/(50000/HZ)) % 100); seq_printf(m, "clflush size\t: %u\n\n", c->x86_clflush_size); return 0; Index: linux-2.6.22-ck1/arch/i386/kernel/smpboot.c =================================================================== --- linux-2.6.22-ck1.orig/arch/i386/kernel/smpboot.c 2007-07-10 14:54:59.000000000 +1000 +++ linux-2.6.22-ck1/arch/i386/kernel/smpboot.c 2007-07-10 14:55:24.000000000 +1000 @@ -1094,7 +1094,7 @@ "Total of %d processors activated (%lu.%02lu BogoMIPS).\n", cpucount+1, bogosum/(500000/HZ), - (bogosum/(5000/HZ))%100); + (bogosum * 10/(50000/HZ))%100); Dprintk("Before bogocount - setting activated=1.\n"); Index: linux-2.6.22-ck1/include/linux/nfsd/stats.h =================================================================== --- linux-2.6.22-ck1.orig/include/linux/nfsd/stats.h 2007-07-10 14:54:59.000000000 +1000 +++ linux-2.6.22-ck1/include/linux/nfsd/stats.h 2007-07-10 14:55:24.000000000 +1000 @@ -35,8 +35,8 @@ }; -/* thread usage wraps very million seconds (approx one fortnight) */ -#define NFSD_USAGE_WRAP (HZ*1000000) +/* thread usage wraps every one hundred thousand seconds (approx one day) */ +#define NFSD_USAGE_WRAP (HZ*100000) #ifdef __KERNEL__ Index: linux-2.6.22-ck1/arch/x86_64/kernel/setup.c =================================================================== --- 
linux-2.6.22-ck1.orig/arch/x86_64/kernel/setup.c 2007-07-10 14:54:59.000000000 +1000 +++ linux-2.6.22-ck1/arch/x86_64/kernel/setup.c 2007-07-10 14:55:24.000000000 +1000 @@ -1047,7 +1047,7 @@ seq_printf(m, "\nbogomips\t: %lu.%02lu\n", c->loops_per_jiffy/(500000/HZ), - (c->loops_per_jiffy/(5000/HZ)) % 100); + (c->loops_per_jiffy * 10/(50000/HZ)) % 100); if (c->x86_tlbsize > 0) seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize); Index: linux-2.6.22-ck1/Makefile =================================================================== --- linux-2.6.22-ck1.orig/Makefile 2007-07-10 14:54:59.000000000 +1000 +++ linux-2.6.22-ck1/Makefile 2007-07-10 14:55:24.000000000 +1000 @@ -1,8 +1,9 @@ VERSION = 2 PATCHLEVEL = 6 SUBLEVEL = 22 -EXTRAVERSION = -NAME = Holy Dancing Manatees, Batman! +EXTRAVERSION = -ck1 +NAME = So long, and thanks for all the fish +JANAME = さようなら、いままで魚をありがとう # *DOCUMENTATION* # To see a list of typical targets execute "make help"
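Editor's note on the repeated BogoMIPS change in calibrate.c, cpu/proc.c, smpboot.c and setup.c above: the motivation appears to be the new HZ choices above 5000, where the old constant expression 5000/HZ truncates to zero and the "% 100" term would divide by zero; rescaling to loops_per_jiffy * 10 / (50000/HZ) keeps the divisor non-zero for every offered HZ value (and is, if anything, slightly more accurate when HZ does not divide the constant evenly). A minimal standalone sketch, with made-up hz and lpj values:

#include <stdio.h>

int main(void)
{
	const unsigned long hz = 10000;		/* e.g. the new CONFIG_HZ_10000 */
	const unsigned long lpj = 1234567;	/* assumed sample loops_per_jiffy */

	/*
	 * Old form: 5000 / hz == 0 here, so this would divide by zero:
	 *	unsigned long frac = (lpj / (5000 / hz)) % 100;
	 */
	unsigned long whole = lpj / (500000 / hz);
	unsigned long frac = (lpj * 10 / (50000 / hz)) % 100;

	printf("%lu.%02lu BogoMIPS (lpj=%lu)\n", whole, frac, lpj);
	return 0;
}

For hz = 10000 and lpj = 1234567 this prints 24691.34 BogoMIPS, matching the intended lpj * HZ / 500000 scaling.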