diff --git a/pkgs/os-specific/linux/kernel/linux-2.6.21_ck.nix b/pkgs/os-specific/linux/kernel/linux-2.6.21_ck.nix new file mode 100644 index 00000000000..55748d3e9f5 --- /dev/null +++ b/pkgs/os-specific/linux/kernel/linux-2.6.21_ck.nix @@ -0,0 +1,84 @@ +{ stdenv, fetchurl, perl, mktemp, module_init_tools + + # A list of patches to apply to the kernel. Each element of this list + # should be an attribute set {name, patch} where `name' is a + # symbolic name and `patch' is the actual patch. The patch may + # optionally be compressed with gzip or bzip2. +, kernelPatches ? [] + +, # Whether to build a User-Mode Linux kernel. + userModeLinux ? false + +, # Allows you to set your own kernel version suffix (e.g., + # "-my-kernel"). + localVersion ? "" + +, # Your own kernel configuration file, if you don't want to use the + # default. + kernelConfig ? null + +, # A list of additional statements to be appended to the + # configuration file. + extraConfig ? [] +}: + +assert stdenv.system == "i686-linux" || stdenv.system == "x86_64-linux"; + +let + + lib = import ../../../lib; + + version = "2.6.21"; + +in + +stdenv.mkDerivation { + name = if userModeLinux then "user-mode-linux-${version}" else "linux-${version}"; + builder = ./builder.sh; + + src = fetchurl { + url = "http://www.kernel.org/pub/linux/kernel/v2.6/linux-2.6.21.tar.bz2"; + sha256 = "f187b12d70e0a48ce81f0472dfe9504fb5f0f966be339ac9d57dd2b991a74942"; + }; + + patches = map (p: p.patch) kernelPatches; + extraConfig = + let addNewlines = map (s: "\n" + s + "\n"); + configFromPatches = + map (p: if p ? 
extraConfig then p.extraConfig else "") kernelPatches; + in lib.concatStrings (addNewlines (configFromPatches ++ extraConfig)); + + config = + if kernelConfig != null then kernelConfig else + if userModeLinux then ./config-2.6.21-uml else + if stdenv.system == "i686-linux" then ./config-2.6.21-i686-smp else + if stdenv.system == "x86_64-linux" then ./config-2.6.21-x86_64-smp else + abort "No kernel configuration for your platform!"; + + buildInputs = [perl mktemp]; + + arch = + if userModeLinux then "um" else + if stdenv.system == "i686-linux" then "i386" else + if stdenv.system == "x86_64-linux" then "x86_64" else + abort "Platform ${stdenv.system} is not supported."; + + makeFlags = if userModeLinux then "ARCH=um SHELL=bash" else ""; + + inherit module_init_tools; + + allowLocalVersion = false; # don't allow patches to set a suffix + inherit localVersion; # but do allow the user to set one. + + meta = { + description = + (if userModeLinux then + "User-Mode Linux" + else + "The Linux kernel") + + (if kernelPatches == [] then "" else + " (with patches: " + + lib.concatStrings (lib.intersperse ", " (map (x: x.name) kernelPatches)) + + ")"); + }; +} diff --git a/pkgs/os-specific/linux/kernel/patch-2.6.21-ck1 b/pkgs/os-specific/linux/kernel/patch-2.6.21-ck1 new file mode 100644 index 00000000000..0bf63f5aca3 --- /dev/null +++ b/pkgs/os-specific/linux/kernel/patch-2.6.21-ck1 @@ -0,0 +1,5040 @@ +Index: linux-2.6.21-ck1/Makefile +=================================================================== +--- linux-2.6.21-ck1.orig/Makefile 2007-05-04 12:10:52.000000000 +1000 ++++ linux-2.6.21-ck1/Makefile 2007-05-04 12:21:37.000000000 +1000 +@@ -1,7 +1,7 @@ + VERSION = 2 + PATCHLEVEL = 6 + SUBLEVEL = 21 +-EXTRAVERSION = ++EXTRAVERSION = -ck1 + NAME = Nocturnal Monster Puppy + + # *DOCUMENTATION* +Index: linux-2.6.21-ck1/kernel/workqueue.c +=================================================================== +--- linux-2.6.21-ck1.orig/kernel/workqueue.c 2007-05-04 
12:10:52.000000000 +1000 ++++ linux-2.6.21-ck1/kernel/workqueue.c 2007-05-04 12:10:54.000000000 +1000 +@@ -355,8 +355,6 @@ static int worker_thread(void *__cwq) + if (!cwq->freezeable) + current->flags |= PF_NOFREEZE; + +- set_user_nice(current, -5); +- + /* Block and flush all signals */ + sigfillset(&blocked); + sigprocmask(SIG_BLOCK, &blocked, NULL); +Index: linux-2.6.21-ck1/fs/proc/array.c +=================================================================== +--- linux-2.6.21-ck1.orig/fs/proc/array.c 2007-05-04 12:10:52.000000000 +1000 ++++ linux-2.6.21-ck1/fs/proc/array.c 2007-05-04 12:10:54.000000000 +1000 +@@ -165,7 +165,6 @@ static inline char * task_state(struct t + rcu_read_lock(); + buffer += sprintf(buffer, + "State:\t%s\n" +- "SleepAVG:\t%lu%%\n" + "Tgid:\t%d\n" + "Pid:\t%d\n" + "PPid:\t%d\n" +@@ -173,7 +172,6 @@ static inline char * task_state(struct t + "Uid:\t%d\t%d\t%d\t%d\n" + "Gid:\t%d\t%d\t%d\t%d\n", + get_task_state(p), +- (p->sleep_avg/1024)*100/(1020000000/1024), + p->tgid, p->pid, + pid_alive(p) ? rcu_dereference(p->real_parent)->tgid : 0, + pid_alive(p) && p->ptrace ? 
rcu_dereference(p->parent)->pid : 0, +Index: linux-2.6.21-ck1/include/linux/init_task.h +=================================================================== +--- linux-2.6.21-ck1.orig/include/linux/init_task.h 2007-05-04 12:10:52.000000000 +1000 ++++ linux-2.6.21-ck1/include/linux/init_task.h 2007-05-04 12:24:19.000000000 +1000 +@@ -102,13 +102,15 @@ extern struct group_info init_groups; + .prio = MAX_PRIO-20, \ + .static_prio = MAX_PRIO-20, \ + .normal_prio = MAX_PRIO-20, \ ++ .rotation = 0, \ + .policy = SCHED_NORMAL, \ + .cpus_allowed = CPU_MASK_ALL, \ + .mm = NULL, \ + .active_mm = &init_mm, \ + .run_list = LIST_HEAD_INIT(tsk.run_list), \ + .ioprio = 0, \ +- .time_slice = HZ, \ ++ .time_slice = 1000000000, \ ++ .quota = 1000000000, \ + .tasks = LIST_HEAD_INIT(tsk.tasks), \ + .ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children), \ + .ptrace_list = LIST_HEAD_INIT(tsk.ptrace_list), \ +@@ -135,6 +137,7 @@ extern struct group_info init_groups; + .signal = {{0}}}, \ + .blocked = {{0}}, \ + .alloc_lock = __SPIN_LOCK_UNLOCKED(tsk.alloc_lock), \ ++ .mutexes_held = 0, \ + .journal_info = NULL, \ + .cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \ + .fs_excl = ATOMIC_INIT(0), \ +Index: linux-2.6.21-ck1/include/linux/sched.h +=================================================================== +--- linux-2.6.21-ck1.orig/include/linux/sched.h 2007-05-04 12:10:52.000000000 +1000 ++++ linux-2.6.21-ck1/include/linux/sched.h 2007-05-04 12:24:19.000000000 +1000 +@@ -34,9 +34,14 @@ + #define SCHED_FIFO 1 + #define SCHED_RR 2 + #define SCHED_BATCH 3 ++#define SCHED_ISO 4 ++#define SCHED_IDLEPRIO 5 + + #ifdef __KERNEL__ + ++#define SCHED_MAX SCHED_IDLEPRIO ++#define SCHED_RANGE(policy) ((policy) <= SCHED_MAX) ++ + struct sched_param { + int sched_priority; + }; +@@ -149,8 +154,7 @@ extern unsigned long weighted_cpuload(co + #define EXIT_ZOMBIE 16 + #define EXIT_DEAD 32 + /* in tsk->state again */ +-#define TASK_NONINTERACTIVE 64 +-#define TASK_DEAD 128 ++#define TASK_DEAD 64 + + 
#define __set_task_state(tsk, state_value) \ + do { (tsk)->state = (state_value); } while (0) +@@ -522,14 +526,19 @@ struct signal_struct { + + #define MAX_USER_RT_PRIO 100 + #define MAX_RT_PRIO MAX_USER_RT_PRIO ++#define PRIO_RANGE (40) ++#define ISO_PRIO (MAX_RT_PRIO - 1) + +-#define MAX_PRIO (MAX_RT_PRIO + 40) ++#define MAX_PRIO (MAX_RT_PRIO + PRIO_RANGE) + +-#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO) ++#define rt_prio(prio) unlikely((prio) < ISO_PRIO) + #define rt_task(p) rt_prio((p)->prio) + #define batch_task(p) (unlikely((p)->policy == SCHED_BATCH)) +-#define is_rt_policy(p) ((p) != SCHED_NORMAL && (p) != SCHED_BATCH) ++#define is_rt_policy(policy) ((policy) == SCHED_FIFO || \ ++ (policy) == SCHED_RR) + #define has_rt_policy(p) unlikely(is_rt_policy((p)->policy)) ++#define iso_task(p) unlikely((p)->policy == SCHED_ISO) ++#define idleprio_task(p) unlikely((p)->policy == SCHED_IDLEPRIO) + + /* + * Some day this will be a full-fledged user tracking system.. +@@ -740,6 +749,22 @@ extern unsigned int max_cache_size; + + #endif /* CONFIG_SMP */ + ++/* ++ * A runqueue laden with a single nice 0 task scores a weighted_cpuload of ++ * SCHED_LOAD_SCALE. 
This function returns 1 if any cpu is laden with a ++ * task of nice 0 or enough lower priority tasks to bring up the ++ * weighted_cpuload ++ */ ++static inline int above_background_load(void) ++{ ++ unsigned long cpu; ++ ++ for_each_online_cpu(cpu) { ++ if (weighted_cpuload(cpu) >= SCHED_LOAD_SCALE) ++ return 1; ++ } ++ return 0; ++} + + struct io_context; /* See blkdev.h */ + struct cpuset; +@@ -788,13 +813,6 @@ struct mempolicy; + struct pipe_inode_info; + struct uts_namespace; + +-enum sleep_type { +- SLEEP_NORMAL, +- SLEEP_NONINTERACTIVE, +- SLEEP_INTERACTIVE, +- SLEEP_INTERRUPTED, +-}; +- + struct prio_array; + + struct task_struct { +@@ -814,20 +832,33 @@ struct task_struct { + int load_weight; /* for niceness load balancing purposes */ + int prio, static_prio, normal_prio; + struct list_head run_list; ++ /* ++ * This bitmap shows what priorities this task has received quota ++ * from for this major priority rotation on its current runqueue. ++ */ ++ DECLARE_BITMAP(bitmap, PRIO_RANGE + 1); + struct prio_array *array; ++ /* Which major runqueue rotation did this task run */ ++ unsigned long rotation; + + unsigned short ioprio; + #ifdef CONFIG_BLK_DEV_IO_TRACE + unsigned int btrace_seq; + #endif +- unsigned long sleep_avg; + unsigned long long timestamp, last_ran; + unsigned long long sched_time; /* sched_clock time spent running */ +- enum sleep_type sleep_type; + + unsigned long policy; + cpumask_t cpus_allowed; +- unsigned int time_slice, first_time_slice; ++ /* ++ * How much this task is entitled to run at the current priority ++ * before being requeued at a lower priority. ++ */ ++ int time_slice; ++ /* Is this the very first time_slice this task has ever run. 
*/ ++ unsigned int first_time_slice; ++ /* How much this task receives at each priority level */ ++ int quota; + + #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) + struct sched_info sched_info; +@@ -992,6 +1023,7 @@ struct task_struct { + struct held_lock held_locks[MAX_LOCK_DEPTH]; + unsigned int lockdep_recursion; + #endif ++ unsigned long mutexes_held; + + /* journalling filesystem info */ + void *journal_info; +@@ -1156,8 +1188,10 @@ static inline void put_task_struct(struc + #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ + #define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */ + #define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */ ++#define PF_ISOREF 0x04000000 /* SCHED_ISO task has used up quota */ + #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ + #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ ++#define PF_NONSLEEP 0x40000000 /* Waiting on in-kernel activity */ + + /* + * Only the _current_ task can read/write to tsk->flags, but other +Index: linux-2.6.21-ck1/kernel/sched.c +=================================================================== +--- linux-2.6.21-ck1.orig/kernel/sched.c 2007-05-04 12:10:52.000000000 +1000 ++++ linux-2.6.21-ck1/kernel/sched.c 2007-05-04 12:24:22.000000000 +1000 +@@ -16,6 +16,7 @@ + * by Davide Libenzi, preemptible kernel bits by Robert Love. + * 2003-09-03 Interactivity tuning by Con Kolivas. 
+ * 2004-04-02 Scheduler domains code by Nick Piggin ++ * 2007-03-02 Staircase deadline scheduling policy by Con Kolivas + */ + + #include +@@ -52,6 +53,7 @@ + #include + #include + #include ++#include + #include + + #include +@@ -83,126 +85,85 @@ unsigned long long __attribute__((weak)) + #define USER_PRIO(p) ((p)-MAX_RT_PRIO) + #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) + #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) ++#define SCHED_PRIO(p) ((p)+MAX_RT_PRIO) + +-/* +- * Some helpers for converting nanosecond timing to jiffy resolution +- */ +-#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) ++/* Some helpers for converting to/from various scales.*/ + #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) +- +-/* +- * These are the 'tuning knobs' of the scheduler: +- * +- * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger), +- * default timeslice is 100 msecs, maximum timeslice is 800 msecs. +- * Timeslices get refilled after they expire. +- */ +-#define MIN_TIMESLICE max(5 * HZ / 1000, 1) +-#define DEF_TIMESLICE (100 * HZ / 1000) +-#define ON_RUNQUEUE_WEIGHT 30 +-#define CHILD_PENALTY 95 +-#define PARENT_PENALTY 100 +-#define EXIT_WEIGHT 3 +-#define PRIO_BONUS_RATIO 25 +-#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100) +-#define INTERACTIVE_DELTA 2 +-#define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS) +-#define STARVATION_LIMIT (MAX_SLEEP_AVG) +-#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) +- +-/* +- * If a task is 'interactive' then we reinsert it in the active +- * array after it has expired its current timeslice. (it will not +- * continue to run immediately, it will still roundrobin with +- * other interactive tasks.) +- * +- * This part scales the interactivity limit depending on niceness. +- * +- * We scale it linearly, offset by the INTERACTIVE_DELTA delta. 
+- * Here are a few examples of different nice levels: +- * +- * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0] +- * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0] +- * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0] +- * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0] +- * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0] +- * +- * (the X axis represents the possible -5 ... 0 ... +5 dynamic +- * priority range a task can explore, a value of '1' means the +- * task is rated interactive.) +- * +- * Ie. nice +19 tasks can never get 'interactive' enough to be +- * reinserted into the active array. And only heavily CPU-hog nice -20 +- * tasks will be expired. Default nice 0 tasks are somewhere between, +- * it takes some effort for them to get interactive, but it's not +- * too hard. +- */ +- +-#define CURRENT_BONUS(p) \ +- (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \ +- MAX_SLEEP_AVG) +- +-#define GRANULARITY (10 * HZ / 1000 ? : 1) +- +-#ifdef CONFIG_SMP +-#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ +- (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \ +- num_online_cpus()) +-#else +-#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ +- (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? 
: 1) - 1))) +-#endif +- +-#define SCALE(v1,v1_max,v2_max) \ +- (v1) * (v2_max) / (v1_max) +- +-#define DELTA(p) \ +- (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \ +- INTERACTIVE_DELTA) +- +-#define TASK_INTERACTIVE(p) \ +- ((p)->prio <= (p)->static_prio - DELTA(p)) +- +-#define INTERACTIVE_SLEEP(p) \ +- (JIFFIES_TO_NS(MAX_SLEEP_AVG * \ +- (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) +- +-#define TASK_PREEMPTS_CURR(p, rq) \ +- ((p)->prio < (rq)->curr->prio) +- +-#define SCALE_PRIO(x, prio) \ +- max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) +- +-static unsigned int static_prio_timeslice(int static_prio) +-{ +- if (static_prio < NICE_TO_PRIO(0)) +- return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio); +- else +- return SCALE_PRIO(DEF_TIMESLICE, static_prio); +-} +- +-/* +- * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] +- * to time slice values: [800ms ... 100ms ... 5ms] +- * +- * The higher a thread's priority, the bigger timeslices +- * it gets during one round of execution. But even the lowest +- * priority thread gets MIN_TIMESLICE worth of execution time. ++#define MS_TO_NS(TIME) ((TIME) * 1000000) ++#define MS_TO_US(TIME) ((TIME) * 1000) ++#define US_TO_MS(TIME) ((TIME) / 1000) ++ ++#define TASK_PREEMPTS_CURR(p, curr) ((p)->prio < (curr)->prio) ++ ++/* ++ * This is the time all tasks within the same priority round robin. ++ * Value is in ms and set to a minimum of 8ms. Scales with number of cpus. ++ * Tunable via /proc interface. ++ */ ++int rr_interval __read_mostly = 6; ++int sched_interactive __read_mostly = 1; ++ ++/* ++ * sched_iso_cpu - sysctl which determines the cpu percentage SCHED_ISO tasks ++ * are allowed to run (over ISO_PERIOD seconds) as real time tasks. ++ * sched_iso_period - sysctl which determines the number of seconds over ++ * which cpu usage of SCHED_ISO tasks is averaged to determine if they are ++ * exceeding their allowable bandwidth. 
++*/ ++int sched_iso_cpu __read_mostly = 80; ++int sched_iso_period __read_mostly = 5; ++ ++#define ISO_PERIOD ((sched_iso_period * HZ) + 1) ++ ++/* ++ * This contains a bitmap for each dynamic priority level with empty slots ++ * for the valid priorities each different nice level can have. It allows ++ * us to stagger the slots where differing priorities run in a way that ++ * keeps latency differences between different nice levels at a minimum. ++ * The purpose of a pre-generated matrix is for rapid lookup of next slot in ++ * O(1) time without having to recalculate every time priority gets demoted. ++ * All nice levels use priority slot 39 as this allows less niced tasks to ++ * get all priority slots better than that before expiration is forced. ++ * ie, where 0 means a slot for that priority, priority running from left to ++ * right is from prio 0 to prio 39: ++ * nice -20 0000000000000000000000000000000000000000 ++ * nice -10 1000100010001000100010001000100010010000 ++ * nice 0 1010101010101010101010101010101010101010 ++ * nice 5 1011010110110101101101011011010110110110 ++ * nice 10 1110111011101110111011101110111011101110 ++ * nice 15 1111111011111110111111101111111011111110 ++ * nice 19 1111111111111111111111111111111111111110 + */ ++static unsigned long prio_matrix[PRIO_RANGE][BITS_TO_LONGS(PRIO_RANGE)] ++ __read_mostly; + +-static inline unsigned int task_timeslice(struct task_struct *p) +-{ +- return static_prio_timeslice(p->static_prio); +-} ++struct rq; + + /* + * These are the runqueue data structures: + */ +- + struct prio_array { +- unsigned int nr_active; +- DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */ +- struct list_head queue[MAX_PRIO]; ++ /* Tasks queued at each priority */ ++ struct list_head queue[MAX_PRIO + 1]; ++ ++ /* ++ * The bitmap of priorities queued for this array. While the expired ++ * array will never have realtime tasks on it, it is simpler to have ++ * equal sized bitmaps for a cheap array swap. 
Include 1 bit for ++ * delimiter. ++ */ ++ DECLARE_BITMAP(prio_bitmap, MAX_PRIO + 1); ++ ++ /* ++ * The best static priority (of the dynamic priority tasks) queued ++ * this array. ++ */ ++ int best_static_prio; ++ ++#ifdef CONFIG_SMP ++ /* For convenience looks back at rq */ ++ struct rq *rq; ++#endif + }; + + /* +@@ -234,14 +195,28 @@ struct rq { + */ + unsigned long nr_uninterruptible; + +- unsigned long expired_timestamp; + /* Cached timestamp set by update_cpu_clock() */ + unsigned long long most_recent_timestamp; + struct task_struct *curr, *idle; + unsigned long next_balance; + struct mm_struct *prev_mm; +- struct prio_array *active, *expired, arrays[2]; +- int best_expired_prio; ++ ++ struct prio_array *active, *expired, *idleprio, arrays[2]; ++ unsigned long *dyn_bitmap, *exp_bitmap; ++ ++ /* ++ * The current dynamic priority level this runqueue is at per static ++ * priority level. ++ */ ++ int prio_level[PRIO_RANGE]; ++ ++ /* How many times we have rotated the priority queue */ ++ unsigned long prio_rotation; ++ unsigned long iso_ticks; ++ unsigned short iso_refractory; ++ ++ /* Number of idleprio tasks running */ ++ unsigned long nr_idleprio; + atomic_t nr_iowait; + + #ifdef CONFIG_SMP +@@ -579,12 +554,9 @@ static inline struct rq *this_rq_lock(vo + #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) + /* + * Called when a process is dequeued from the active array and given +- * the cpu. We should note that with the exception of interactive +- * tasks, the expired queue will become the active queue after the active +- * queue is empty, without explicitly dequeuing and requeuing tasks in the +- * expired queue. (Interactive tasks may be requeued directly to the +- * active queue, thus delaying tasks in the expired queue from running; +- * see scheduler_tick()). ++ * the cpu. 
We should note that the expired queue will become the active ++ * queue after the active queue is empty, without explicitly dequeuing and ++ * requeuing tasks in the expired queue. + * + * This function is only called from sched_info_arrive(), rather than + * dequeue_task(). Even though a task may be queued and dequeued multiple +@@ -682,71 +654,304 @@ sched_info_switch(struct task_struct *pr + #define sched_info_switch(t, next) do { } while (0) + #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ + ++static int idleprio_suitable(struct task_struct *p) ++{ ++ return (!p->mutexes_held && !freezing(p) && !signal_pending(p) && ++ !(p->flags & (PF_NONSLEEP | PF_EXITING))); ++} ++ ++static int idleprio(const struct task_struct *p) ++{ ++ return (p->prio == MAX_PRIO); ++} ++ ++static inline int task_queued(struct task_struct *task) ++{ ++ return !list_empty(&task->run_list); ++} ++ ++static inline void set_dynamic_bit(struct task_struct *p, struct rq *rq) ++{ ++ __set_bit(p->prio, p->array->prio_bitmap); ++} ++ + /* +- * Adding/removing a task to/from a priority array: ++ * Removing from a runqueue. 
+ */ +-static void dequeue_task(struct task_struct *p, struct prio_array *array) ++static void dequeue_task(struct task_struct *p, struct rq *rq) + { +- array->nr_active--; +- list_del(&p->run_list); +- if (list_empty(array->queue + p->prio)) +- __clear_bit(p->prio, array->bitmap); ++ list_del_init(&p->run_list); ++ if (idleprio_task(p) && idleprio(p)) ++ rq->nr_idleprio--; ++ else if (list_empty(p->array->queue + p->prio)) ++ __clear_bit(p->prio, p->array->prio_bitmap); + } + +-static void enqueue_task(struct task_struct *p, struct prio_array *array) ++static void reset_first_time_slice(struct task_struct *p) + { +- sched_info_queued(p); +- list_add_tail(&p->run_list, array->queue + p->prio); +- __set_bit(p->prio, array->bitmap); +- array->nr_active++; ++ if (unlikely(p->first_time_slice)) ++ p->first_time_slice = 0; ++} ++ ++/* ++ * The task is being queued on a fresh array so it has its entitlement ++ * bitmap cleared. ++ */ ++static void task_new_array(struct task_struct *p, struct rq *rq, ++ struct prio_array *array) ++{ ++ bitmap_zero(p->bitmap, PRIO_RANGE); ++ p->rotation = rq->prio_rotation; ++ p->time_slice = p->quota; + p->array = array; ++ reset_first_time_slice(p); ++} ++ ++/* Find the first slot from the relevant prio_matrix entry */ ++static int first_prio_slot(struct task_struct *p) ++{ ++ if (unlikely(p->policy == SCHED_BATCH)) ++ return p->static_prio; ++ return SCHED_PRIO(find_first_zero_bit( ++ prio_matrix[USER_PRIO(p->static_prio)], PRIO_RANGE)); + } + + /* +- * Put task to the end of the run list without the overhead of dequeue +- * followed by enqueue. ++ * In sched_interactive mode priority allocation occurs per process per rq ++ * array swap. In !sched_interactive mode all waking tasks must obey the ++ * current prio level of all other tasks running per array swap. 
+ */ +-static void requeue_task(struct task_struct *p, struct prio_array *array) ++static int minprio(struct rq *rq, int uprio) + { +- list_move_tail(&p->run_list, array->queue + p->prio); ++ if (sched_interactive) ++ return MAX_RT_PRIO; ++ return rq->prio_level[uprio]; + } + +-static inline void +-enqueue_task_head(struct task_struct *p, struct prio_array *array) ++/* ++ * Find the first unused slot by this task that is also in its prio_matrix ++ * level. SCHED_BATCH tasks do not use the priority matrix. They only take ++ * priority slots from their static_prio and above. ++ */ ++static int next_entitled_slot(struct task_struct *p, struct rq *rq) + { +- list_add(&p->run_list, array->queue + p->prio); +- __set_bit(p->prio, array->bitmap); +- array->nr_active++; +- p->array = array; ++ int search_prio = MAX_RT_PRIO, uprio = USER_PRIO(p->static_prio); ++ struct prio_array *array = rq->active; ++ DECLARE_BITMAP(tmp, PRIO_RANGE); ++ ++ /* ++ * Go straight to expiration if there are higher priority tasks ++ * already expired. ++ */ ++ if (p->static_prio > rq->expired->best_static_prio) ++ return MAX_PRIO; ++ if (!rq->prio_level[uprio]) ++ rq->prio_level[uprio] = MAX_RT_PRIO; ++ /* ++ * Only priorities equal to the prio_level and above for their ++ * static_prio are acceptable, and only if it's not better than ++ * a queued better static_prio's prio_level. 
++ */ ++ if (p->static_prio < array->best_static_prio) { ++ if (likely(p->policy != SCHED_BATCH)) ++ array->best_static_prio = p->static_prio; ++ } else if (p->static_prio == array->best_static_prio) { ++ search_prio = minprio(rq, uprio); ++ } else { ++ int i; ++ ++ search_prio = minprio(rq, uprio); ++ /* A bound O(n) function, worst case n is 40 */ ++ for (i = array->best_static_prio; i <= p->static_prio ; i++) { ++ if (!rq->prio_level[USER_PRIO(i)]) ++ rq->prio_level[USER_PRIO(i)] = MAX_RT_PRIO; ++ search_prio = max(search_prio, ++ rq->prio_level[USER_PRIO(i)]); ++ } ++ } ++ if (unlikely(p->policy == SCHED_BATCH)) { ++ search_prio = max(search_prio, p->static_prio); ++ return SCHED_PRIO(find_next_zero_bit(p->bitmap, PRIO_RANGE, ++ USER_PRIO(search_prio))); ++ } ++ bitmap_or(tmp, p->bitmap, prio_matrix[uprio], PRIO_RANGE); ++ return SCHED_PRIO(find_next_zero_bit(tmp, PRIO_RANGE, ++ USER_PRIO(search_prio))); ++} ++ ++static void queue_expired(struct task_struct *p, struct rq *rq) ++{ ++ task_new_array(p, rq, rq->expired); ++ p->prio = p->normal_prio = first_prio_slot(p); ++ if (p->static_prio < rq->expired->best_static_prio) ++ rq->expired->best_static_prio = p->static_prio; ++ reset_first_time_slice(p); + } + ++#ifdef CONFIG_SMP + /* +- * __normal_prio - return the priority that is based on the static +- * priority but is modified by bonuses/penalties. +- * +- * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] +- * into the -5 ... 0 ... +5 bonus/penalty range. +- * +- * We use 25% of the full 0...39 priority range so that: +- * +- * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. +- * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. +- * +- * Both properties are important to certain workloads. ++ * If we're waking up a task that was previously on a different runqueue, ++ * update its data appropriately. Note we may be reading data from src_rq-> ++ * outside of lock, but the occasional inaccurate result should be harmless. 
+ */ ++ static void update_if_moved(struct task_struct *p, struct rq *rq) ++{ ++ struct rq *src_rq = p->array->rq; + +-static inline int __normal_prio(struct task_struct *p) ++ if (src_rq == rq) ++ return; ++ /* ++ * Only need to set p->array when p->rotation == rq->prio_rotation as ++ * they will be set in recalc_task_prio when != rq->prio_rotation. ++ */ ++ if (p->rotation == src_rq->prio_rotation) { ++ p->rotation = rq->prio_rotation; ++ if (p->array == src_rq->expired) ++ p->array = rq->expired; ++ else ++ p->array = rq->active; ++ } else ++ p->rotation = 0; ++} ++#else ++static inline void update_if_moved(struct task_struct *p, struct rq *rq) ++{ ++} ++#endif ++ ++static inline int isoprio_suitable(struct task_struct *p) + { +- int bonus, prio; ++ return !(p->flags & PF_ISOREF); ++} + +- bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; ++static int task_timeslice(struct task_struct *p); + +- prio = p->static_prio - bonus; +- if (prio < MAX_RT_PRIO) +- prio = MAX_RT_PRIO; +- if (prio > MAX_PRIO-1) +- prio = MAX_PRIO-1; +- return prio; ++/* ++ * recalc_task_prio determines what priority a non rt_task will be ++ * queued at. If the task has already been running during this runqueue's ++ * major rotation (rq->prio_rotation) then it continues at the same ++ * priority if it has tick entitlement left. If it does not have entitlement ++ * left, it finds the next priority slot according to its nice value that it ++ * has not extracted quota from. If it has not run during this major ++ * rotation, it starts at the next_entitled_slot and has its bitmap quota ++ * cleared. If it does not have any slots left it has all its slots reset and ++ * is queued on the expired at its first_prio_slot. 
++ */ ++static void recalc_task_prio(struct task_struct *p, struct rq *rq) ++{ ++ struct prio_array *array = rq->active; ++ int queue_prio; ++ ++ if (iso_task(p)) { ++ if (isoprio_suitable(p)) { ++ /* ++ * If SCHED_ISO tasks have not used up their real time ++ * quota they have run just better than highest ++ * SCHED_NORMAL priority. Otherwise they run as ++ * SCHED_NORMAL. ++ */ ++ p->prio = p->normal_prio = ISO_PRIO; ++ p->array = rq->active; ++ if (p->time_slice <= 0) ++ p->time_slice = p->quota; ++ return; ++ } else if (p->prio == ISO_PRIO) { ++ /* Just about to be demoted to SCHED_NORMAL */ ++ p->time_slice = 0; ++ } ++ } else if (idleprio_task(p)) { ++ if (idleprio_suitable(p)) { ++ /* ++ * If suitable idleprio_tasks are queued at MAX_PRIO ++ * only on the idleprio array. Their time_slice is ++ * their full task_timeslice as they cooperatively ++ * multitask. ++ */ ++ p->prio = p->normal_prio = MAX_PRIO; ++ p->array = rq->idleprio; ++ if (p->time_slice <= 0) ++ p->time_slice = task_timeslice(p); ++ return; ++ } ++ /* ++ * If unsuitable idleprio_tasks are queued equivalent to ++ * nice 19 tasks on the expired array. ++ */ ++ p->flags &= ~PF_NONSLEEP; ++ p->prio = p->normal_prio = MAX_PRIO - 1; ++ p->array = rq->expired; ++ if (p->time_slice <= 0 || p->time_slice > p->quota) ++ p->time_slice = p->quota; ++ return; ++ } ++ ++ update_if_moved(p, rq); ++ if (p->rotation == rq->prio_rotation) { ++ if (p->array == array) { ++ if (p->time_slice > 0) ++ return; ++ p->time_slice = p->quota; ++ } else if (p->array == rq->expired) { ++ queue_expired(p, rq); ++ return; ++ } else ++ task_new_array(p, rq, array); ++ } else ++ task_new_array(p, rq, array); ++ ++ queue_prio = next_entitled_slot(p, rq); ++ if (queue_prio >= MAX_PRIO) { ++ queue_expired(p, rq); ++ return; ++ } ++ p->prio = p->normal_prio = queue_prio; ++ __set_bit(USER_PRIO(p->prio), p->bitmap); ++} ++ ++/* ++ * Adding to a runqueue. 
The dynamic priority queue that it is added to is ++ * determined by recalc_task_prio() above. ++ */ ++static inline void __enqueue_task(struct task_struct *p, struct rq *rq) ++{ ++ if (rt_task(p)) ++ p->array = rq->active; ++ else ++ recalc_task_prio(p, rq); ++ ++ if (idleprio_task(p) && idleprio(p)) ++ rq->nr_idleprio++; ++ sched_info_queued(p); ++ set_dynamic_bit(p, rq); ++} ++ ++static void enqueue_task(struct task_struct *p, struct rq *rq) ++{ ++ __enqueue_task(p, rq); ++ list_add_tail(&p->run_list, p->array->queue + p->prio); ++} ++ ++static inline void enqueue_task_head(struct task_struct *p, struct rq *rq) ++{ ++ __enqueue_task(p, rq); ++ list_add(&p->run_list, p->array->queue + p->prio); ++} ++ ++/* ++ * requeue_task is only called when p->static_prio does not change. p->prio ++ * can change with dynamic tasks. ++ */ ++static void requeue_task(struct task_struct *p, struct rq *rq, ++ struct prio_array *old_array, int old_prio) ++{ ++ if (p->array == rq->expired) ++ queue_expired(p, rq); ++ list_move_tail(&p->run_list, p->array->queue + p->prio); ++ if (!rt_task(p)) { ++ if (list_empty(old_array->queue + old_prio)) ++ __clear_bit(old_prio, old_array->prio_bitmap); ++ set_dynamic_bit(p, rq); ++ } + } + + /* +@@ -759,20 +964,29 @@ static inline int __normal_prio(struct t + */ + + /* +- * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE +- * If static_prio_timeslice() is ever changed to break this assumption then +- * this code will need modification +- */ +-#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE +-#define LOAD_WEIGHT(lp) \ +- (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO) +-#define PRIO_TO_LOAD_WEIGHT(prio) \ +- LOAD_WEIGHT(static_prio_timeslice(prio)) +-#define RTPRIO_TO_LOAD_WEIGHT(rp) \ +- (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp)) ++ * task_timeslice - the total duration a task can run during one major ++ * rotation. Returns value in milliseconds as the smallest value can be 1. 
++ */ ++static int task_timeslice(struct task_struct *p) ++{ ++ int slice = p->quota; /* quota is in us */ ++ ++ if (!rt_task(p)) ++ slice += (PRIO_RANGE - 1 - TASK_USER_PRIO(p)) * slice; ++ return US_TO_MS(slice); ++} ++ ++/* ++ * The load weight is basically the task_timeslice in ms. Realtime tasks are ++ * special cased to be proportionately larger than nice -20 by their ++ * rt_priority. The weight for rt tasks can only be arbitrary at best. ++ */ ++#define RTPRIO_TO_LOAD_WEIGHT(rp) (rr_interval * 20 * (40 + rp)) + + static void set_load_weight(struct task_struct *p) + { ++ int load_weight; ++ + if (has_rt_policy(p)) { + #ifdef CONFIG_SMP + if (p == task_rq(p)->migration_thread) +@@ -781,12 +995,19 @@ static void set_load_weight(struct task_ + * Giving its load any weight will skew balancing + * adversely. + */ +- p->load_weight = 0; ++ load_weight = 0; + else + #endif +- p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority); ++ load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority); + } else +- p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio); ++ load_weight = task_timeslice(p); ++ /* ++ * idleprio tasks have much lower weight than SCHED_NORMAL tasks but ++ * still need to be weighted to allow balancing to occur. ++ */ ++ if (likely(!idleprio_task(p))) ++ load_weight *= PRIO_RANGE; ++ p->load_weight = load_weight; + } + + static inline void +@@ -814,28 +1035,38 @@ static inline void dec_nr_running(struct + } + + /* +- * Calculate the expected normal priority: i.e. priority +- * without taking RT-inheritance into account. Might be +- * boosted by interactivity modifiers. Changes upon fork, +- * setprio syscalls, and whenever the interactivity +- * estimator recalculates. ++ * __activate_task - move a task to the runqueue. 
+ */ +-static inline int normal_prio(struct task_struct *p) ++static inline void __activate_task(struct task_struct *p, struct rq *rq) + { +- int prio; ++ enqueue_task(p, rq); ++ inc_nr_running(p, rq); ++} + ++/* ++ * __activate_idle_task - move idle task to the _front_ of runqueue. ++ */ ++static inline void __activate_idle_task(struct task_struct *p, struct rq *rq) ++{ ++ enqueue_task_head(p, rq); ++ inc_nr_running(p, rq); ++} ++ ++static inline int normal_prio(struct task_struct *p) ++{ + if (has_rt_policy(p)) +- prio = MAX_RT_PRIO-1 - p->rt_priority; ++ return MAX_RT_PRIO-1 - p->rt_priority; ++ /* Other tasks all have normal_prio set in recalc_task_prio */ ++ if (likely(p->prio >= MAX_RT_PRIO && p->prio < MAX_PRIO)) ++ return p->prio; + else +- prio = __normal_prio(p); +- return prio; ++ return p->static_prio; + } + + /* + * Calculate the current priority, i.e. the priority + * taken into account by the scheduler. This value might +- * be boosted by RT tasks, or might be boosted by +- * interactivity modifiers. Will be RT if the task got ++ * be boosted by RT tasks as it will be RT if the task got + * RT-boosted. If not then it returns p->normal_prio. + */ + static int effective_prio(struct task_struct *p) +@@ -852,111 +1083,41 @@ static int effective_prio(struct task_st + } + + /* +- * __activate_task - move a task to the runqueue. ++ * All tasks have quotas based on rr_interval. RT tasks all get rr_interval. ++ * From nice 1 to 19 they are smaller than it only if they are at least one ++ * tick still. Below nice 0 they get progressively larger. ++ * ie nice -6..0 = rr_interval. nice -10 = 2.5 * rr_interval ++ * nice -20 = 10 * rr_interval. nice 1-19 = rr_interval / 2. ++ * Value returned is in microseconds. 
+ */ +-static void __activate_task(struct task_struct *p, struct rq *rq) ++static inline unsigned int rr_quota(struct task_struct *p) + { +- struct prio_array *target = rq->active; ++ int nice = TASK_NICE(p), rr = rr_interval; + +- if (batch_task(p)) +- target = rq->expired; +- enqueue_task(p, target); +- inc_nr_running(p, rq); +-} +- +-/* +- * __activate_idle_task - move idle task to the _front_ of runqueue. +- */ +-static inline void __activate_idle_task(struct task_struct *p, struct rq *rq) +-{ +- enqueue_task_head(p, rq->active); +- inc_nr_running(p, rq); ++ if (!rt_task(p)) { ++ if (nice < -6) { ++ rr *= nice * nice; ++ rr /= 40; ++ } else if (nice > 0) ++ rr = rr / 2 ? : 1; ++ } ++ return MS_TO_US(rr); + } + +-/* +- * Recalculate p->normal_prio and p->prio after having slept, +- * updating the sleep-average too: +- */ +-static int recalc_task_prio(struct task_struct *p, unsigned long long now) ++/* Every time we set the quota we need to set the load weight */ ++static void set_quota(struct task_struct *p) + { +- /* Caller must always ensure 'now >= p->timestamp' */ +- unsigned long sleep_time = now - p->timestamp; +- +- if (batch_task(p)) +- sleep_time = 0; +- +- if (likely(sleep_time > 0)) { +- /* +- * This ceiling is set to the lowest priority that would allow +- * a task to be reinserted into the active array on timeslice +- * completion. +- */ +- unsigned long ceiling = INTERACTIVE_SLEEP(p); +- +- if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) { +- /* +- * Prevents user tasks from achieving best priority +- * with one single large enough sleep. +- */ +- p->sleep_avg = ceiling; +- /* +- * Using INTERACTIVE_SLEEP() as a ceiling places a +- * nice(0) task 1ms sleep away from promotion, and +- * gives it 700ms to round-robin with no chance of +- * being demoted. This is more than generous, so +- * mark this sleep as non-interactive to prevent the +- * on-runqueue bonus logic from intervening should +- * this task not receive cpu immediately. 
+- */ +- p->sleep_type = SLEEP_NONINTERACTIVE; +- } else { +- /* +- * Tasks waking from uninterruptible sleep are +- * limited in their sleep_avg rise as they +- * are likely to be waiting on I/O +- */ +- if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) { +- if (p->sleep_avg >= ceiling) +- sleep_time = 0; +- else if (p->sleep_avg + sleep_time >= +- ceiling) { +- p->sleep_avg = ceiling; +- sleep_time = 0; +- } +- } +- +- /* +- * This code gives a bonus to interactive tasks. +- * +- * The boost works by updating the 'average sleep time' +- * value here, based on ->timestamp. The more time a +- * task spends sleeping, the higher the average gets - +- * and the higher the priority boost gets as well. +- */ +- p->sleep_avg += sleep_time; +- +- } +- if (p->sleep_avg > NS_MAX_SLEEP_AVG) +- p->sleep_avg = NS_MAX_SLEEP_AVG; +- } +- +- return effective_prio(p); ++ p->quota = rr_quota(p); ++ set_load_weight(p); + } + + /* + * activate_task - move a task to the runqueue and do priority recalculation +- * +- * Update all the scheduling statistics stuff. (sleep average +- * calculation, priority modifiers, etc.) + */ + static void activate_task(struct task_struct *p, struct rq *rq, int local) + { +- unsigned long long now; +- +- if (rt_task(p)) +- goto out; ++ unsigned long long now = sched_clock(); + +- now = sched_clock(); + #ifdef CONFIG_SMP + if (!local) { + /* Compensate for drifting sched_clock */ +@@ -977,32 +1138,9 @@ static void activate_task(struct task_st + (now - p->timestamp) >> 20); + } + +- p->prio = recalc_task_prio(p, now); +- +- /* +- * This checks to make sure it's not an uninterruptible task +- * that is now waking up. +- */ +- if (p->sleep_type == SLEEP_NORMAL) { +- /* +- * Tasks which were woken up by interrupts (ie. hw events) +- * are most likely of interactive nature. 
So we give them +- * the credit of extending their sleep time to the period +- * of time they spend on the runqueue, waiting for execution +- * on a CPU, first time around: +- */ +- if (in_interrupt()) +- p->sleep_type = SLEEP_INTERRUPTED; +- else { +- /* +- * Normal first-time wakeups get a credit too for +- * on-runqueue time, but it will be weighted down: +- */ +- p->sleep_type = SLEEP_INTERACTIVE; +- } +- } ++ set_quota(p); ++ p->prio = effective_prio(p); + p->timestamp = now; +-out: + __activate_task(p, rq); + } + +@@ -1012,8 +1150,7 @@ out: + static void deactivate_task(struct task_struct *p, struct rq *rq) + { + dec_nr_running(p, rq); +- dequeue_task(p, p->array); +- p->array = NULL; ++ dequeue_task(p, rq); + } + + /* +@@ -1095,7 +1232,7 @@ migrate_task(struct task_struct *p, int + * If the task is not on a runqueue (and not running), then + * it is sufficient to simply update the task's cpu field. + */ +- if (!p->array && !task_running(rq, p)) { ++ if (!task_queued(p) && !task_running(rq, p)) { + set_task_cpu(p, dest_cpu); + return 0; + } +@@ -1126,7 +1263,7 @@ void wait_task_inactive(struct task_stru + repeat: + rq = task_rq_lock(p, &flags); + /* Must be off runqueue entirely, not preempted. */ +- if (unlikely(p->array || task_running(rq, p))) { ++ if (unlikely(task_queued(p) || task_running(rq, p))) { + /* If it's preempted, we yield. It could be a while. */ + preempted = !task_running(rq, p); + task_rq_unlock(rq, &flags); +@@ -1391,6 +1528,31 @@ static inline int wake_idle(int cpu, str + } + #endif + ++/* ++ * We need to have a special definition for an idle runqueue when testing ++ * for preemption on CONFIG_HOTPLUG_CPU as the idle task may be scheduled as ++ * a realtime task in sched_idle_next. 
++ */ ++#ifdef CONFIG_HOTPLUG_CPU ++#define rq_idle(rq) ((rq)->curr == (rq)->idle && !rt_task((rq)->curr)) ++#else ++#define rq_idle(rq) ((rq)->curr == (rq)->idle) ++#endif ++ ++static inline int task_preempts_curr(struct task_struct *p, struct rq *rq) ++{ ++ struct task_struct *curr = rq->curr; ++ ++ return ((p->array == task_rq(p)->active && ++ TASK_PREEMPTS_CURR(p, curr)) || rq_idle(rq)); ++} ++ ++static inline void try_preempt(struct task_struct *p, struct rq *rq) ++{ ++ if (task_preempts_curr(p, rq)) ++ resched_task(rq->curr); ++} ++ + /*** + * try_to_wake_up - wake up a thread + * @p: the to-be-woken-up thread +@@ -1422,7 +1584,7 @@ static int try_to_wake_up(struct task_st + if (!(old_state & state)) + goto out; + +- if (p->array) ++ if (task_queued(p)) + goto out_running; + + cpu = task_cpu(p); +@@ -1515,7 +1677,7 @@ out_set_cpu: + old_state = p->state; + if (!(old_state & state)) + goto out; +- if (p->array) ++ if (task_queued(p)) + goto out_running; + + this_cpu = smp_processor_id(); +@@ -1524,25 +1686,9 @@ out_set_cpu: + + out_activate: + #endif /* CONFIG_SMP */ +- if (old_state == TASK_UNINTERRUPTIBLE) { ++ if (old_state == TASK_UNINTERRUPTIBLE) + rq->nr_uninterruptible--; +- /* +- * Tasks on involuntary sleep don't earn +- * sleep_avg beyond just interactive state. +- */ +- p->sleep_type = SLEEP_NONINTERACTIVE; +- } else +- +- /* +- * Tasks that have marked their sleep as noninteractive get +- * woken up with their sleep average not weighted in an +- * interactive way. +- */ +- if (old_state & TASK_NONINTERACTIVE) +- p->sleep_type = SLEEP_NONINTERACTIVE; + +- +- activate_task(p, rq, cpu == this_cpu); + /* + * Sync wakeups (i.e. those types of wakeups where the waker + * has indicated that it will leave the CPU in short order) +@@ -1551,15 +1697,22 @@ out_activate: + * the waker guarantees that the freshly woken up task is going + * to be considered on this CPU.) 
+ */ +- if (!sync || cpu != this_cpu) { +- if (TASK_PREEMPTS_CURR(p, rq)) +- resched_task(rq->curr); +- } ++ activate_task(p, rq, cpu == this_cpu); ++ if (!sync || cpu != this_cpu) ++ try_preempt(p, rq); + success = 1; + + out_running: + p->state = TASK_RUNNING; + out: ++ /* ++ * Special case when freezing we need to reschedule idleprio tasks ++ * as SCHED_NORMAL or else they'll never freeze ++ */ ++ if (idleprio_task(p) && freezing(p) && idleprio(p)) { ++ dequeue_task(p, rq); ++ enqueue_task(p, rq); ++ } + task_rq_unlock(rq, &flags); + + return success; +@@ -1577,7 +1730,6 @@ int fastcall wake_up_state(struct task_s + return try_to_wake_up(p, state, 0); + } + +-static void task_running_tick(struct rq *rq, struct task_struct *p); + /* + * Perform scheduler related setup for a newly forked process p. + * p is forked by current. +@@ -1605,7 +1757,6 @@ void fastcall sched_fork(struct task_str + p->prio = current->normal_prio; + + INIT_LIST_HEAD(&p->run_list); +- p->array = NULL; + #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) + if (unlikely(sched_info_on())) + memset(&p->sched_info, 0, sizeof(p->sched_info)); +@@ -1617,30 +1768,31 @@ void fastcall sched_fork(struct task_str + /* Want to start with kernel preemption disabled. */ + task_thread_info(p)->preempt_count = 1; + #endif ++ if (unlikely(p->policy == SCHED_FIFO)) ++ goto out; + /* + * Share the timeslice between parent and child, thus the + * total amount of pending timeslices in the system doesn't change, + * resulting in more scheduling fairness. + */ + local_irq_disable(); +- p->time_slice = (current->time_slice + 1) >> 1; +- /* +- * The remainder of the first timeslice might be recovered by +- * the parent if the child exits early enough. 
+- */ +- p->first_time_slice = 1; +- current->time_slice >>= 1; +- p->timestamp = sched_clock(); +- if (unlikely(!current->time_slice)) { ++ if (current->time_slice > 0) { ++ current->time_slice /= 2; ++ if (current->time_slice) ++ p->time_slice = current->time_slice; ++ else ++ p->time_slice = 1; + /* +- * This case is rare, it happens when the parent has only +- * a single jiffy left from its timeslice. Taking the +- * runqueue lock is not a problem. ++ * The remainder of the first timeslice might be recovered by ++ * the parent if the child exits early enough. + */ +- current->time_slice = 1; +- task_running_tick(cpu_rq(cpu), current); +- } ++ p->first_time_slice = 1; ++ } else ++ p->time_slice = 0; ++ ++ p->timestamp = sched_clock(); + local_irq_enable(); ++out: + put_cpu(); + } + +@@ -1662,38 +1814,16 @@ void fastcall wake_up_new_task(struct ta + this_cpu = smp_processor_id(); + cpu = task_cpu(p); + +- /* +- * We decrease the sleep average of forking parents +- * and children as well, to keep max-interactive tasks +- * from forking tasks that are max-interactive. The parent +- * (current) is done further down, under its lock. +- */ +- p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * +- CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); +- +- p->prio = effective_prio(p); +- + if (likely(cpu == this_cpu)) { ++ activate_task(p, rq, 1); + if (!(clone_flags & CLONE_VM)) { + /* + * The VM isn't cloned, so we're in a good position to + * do child-runs-first in anticipation of an exec. This + * usually avoids a lot of COW overhead. 
+ */ +- if (unlikely(!current->array)) +- __activate_task(p, rq); +- else { +- p->prio = current->prio; +- p->normal_prio = current->normal_prio; +- list_add_tail(&p->run_list, &current->run_list); +- p->array = current->array; +- p->array->nr_active++; +- inc_nr_running(p, rq); +- } + set_need_resched(); +- } else +- /* Run child last */ +- __activate_task(p, rq); ++ } + /* + * We skip the following code due to cpu == this_cpu + * +@@ -1710,19 +1840,16 @@ void fastcall wake_up_new_task(struct ta + */ + p->timestamp = (p->timestamp - this_rq->most_recent_timestamp) + + rq->most_recent_timestamp; +- __activate_task(p, rq); +- if (TASK_PREEMPTS_CURR(p, rq)) +- resched_task(rq->curr); ++ activate_task(p, rq, 0); ++ try_preempt(p, rq); + + /* + * Parent and child are on different CPUs, now get the +- * parent runqueue to update the parent's ->sleep_avg: ++ * parent runqueue to update the parent's ->flags: + */ + task_rq_unlock(rq, &flags); + this_rq = task_rq_lock(current, &flags); + } +- current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * +- PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); + task_rq_unlock(this_rq, &flags); + } + +@@ -1737,23 +1864,17 @@ void fastcall wake_up_new_task(struct ta + */ + void fastcall sched_exit(struct task_struct *p) + { ++ struct task_struct *parent; + unsigned long flags; + struct rq *rq; + +- /* +- * If the child was a (relative-) CPU hog then decrease +- * the sleep_avg of the parent as well.
+- */ +- rq = task_rq_lock(p->parent, &flags); +- if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) { +- p->parent->time_slice += p->time_slice; +- if (unlikely(p->parent->time_slice > task_timeslice(p))) +- p->parent->time_slice = task_timeslice(p); +- } +- if (p->sleep_avg < p->parent->sleep_avg) +- p->parent->sleep_avg = p->parent->sleep_avg / +- (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg / +- (EXIT_WEIGHT + 1); ++ parent = p->parent; ++ rq = task_rq_lock(parent, &flags); ++ if (p->first_time_slice > 0 && task_cpu(p) == task_cpu(parent)) { ++ parent->time_slice += p->time_slice; ++ if (unlikely(parent->time_slice > parent->quota)) ++ parent->time_slice = parent->quota; ++ } + task_rq_unlock(rq, &flags); + } + +@@ -2085,23 +2206,17 @@ void sched_exec(void) + * pull_task - move a task from a remote runqueue to the local runqueue. + * Both runqueues must be locked. + */ +-static void pull_task(struct rq *src_rq, struct prio_array *src_array, +- struct task_struct *p, struct rq *this_rq, +- struct prio_array *this_array, int this_cpu) ++static void pull_task(struct rq *src_rq, struct task_struct *p, ++ struct rq *this_rq, int this_cpu) + { +- dequeue_task(p, src_array); ++ dequeue_task(p, src_rq); + dec_nr_running(p, src_rq); + set_task_cpu(p, this_cpu); + inc_nr_running(p, this_rq); +- enqueue_task(p, this_array); ++ enqueue_task(p, this_rq); + p->timestamp = (p->timestamp - src_rq->most_recent_timestamp) + + this_rq->most_recent_timestamp; +- /* +- * Note that idle threads have a prio of MAX_PRIO, for this test +- * to be always true for them. 
+- */ +- if (TASK_PREEMPTS_CURR(p, this_rq)) +- resched_task(this_rq->curr); ++ try_preempt(p, this_rq); + } + + /* +@@ -2144,7 +2259,16 @@ int can_migrate_task(struct task_struct + return 1; + } + +-#define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio) ++static inline int rq_best_prio(struct rq *rq) ++{ ++ int best_prio, exp_prio; ++ ++ best_prio = sched_find_first_bit(rq->dyn_bitmap); ++ exp_prio = find_next_bit(rq->exp_bitmap, MAX_PRIO, MAX_RT_PRIO); ++ if (unlikely(best_prio > exp_prio)) ++ best_prio = exp_prio; ++ return best_prio; ++} + + /* + * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted +@@ -2160,7 +2284,7 @@ static int move_tasks(struct rq *this_rq + { + int idx, pulled = 0, pinned = 0, this_best_prio, best_prio, + best_prio_seen, skip_for_load; +- struct prio_array *array, *dst_array; ++ struct prio_array *array; + struct list_head *head, *curr; + struct task_struct *tmp; + long rem_load_move; +@@ -2187,31 +2311,29 @@ static int move_tasks(struct rq *this_rq + * be cache-cold, thus switching CPUs has the least effect + * on them. 
+ */ +- if (busiest->expired->nr_active) { +- array = busiest->expired; +- dst_array = this_rq->expired; +- } else { +- array = busiest->active; +- dst_array = this_rq->active; +- } +- ++ array = busiest->expired; + new_array: +- /* Start searching at priority 0: */ +- idx = 0; ++ /* Expired arrays don't have RT tasks so they're always MAX_RT_PRIO+ */ ++ if (array == busiest->expired) ++ idx = MAX_RT_PRIO; ++ else ++ idx = 0; + skip_bitmap: + if (!idx) +- idx = sched_find_first_bit(array->bitmap); ++ idx = sched_find_first_bit(array->prio_bitmap); + else +- idx = find_next_bit(array->bitmap, MAX_PRIO, idx); +- if (idx >= MAX_PRIO) { +- if (array == busiest->expired && busiest->active->nr_active) { ++ idx = find_next_bit(array->prio_bitmap, MAX_PRIO, idx); ++ if (idx == MAX_PRIO) { ++ if (array == busiest->idleprio && busiest->nr_idleprio) ++ goto found_idleprio; ++ if (array == busiest->expired) { + array = busiest->active; +- dst_array = this_rq->active; + goto new_array; + } + goto out; + } + ++found_idleprio: + head = array->queue + idx; + curr = head->prev; + skip_queue: +@@ -2233,11 +2355,22 @@ skip_queue: + best_prio_seen |= idx == best_prio; + if (curr != head) + goto skip_queue; ++ if (idx == MAX_PRIO) { ++ /* ++ * Occurs either when balancing idleprio tasks or ++ * there really are no more tasks to find. 
++ */ ++ if (array == busiest->expired) { ++ array = busiest->active; ++ goto new_array; ++ } ++ goto out; ++ } + idx++; + goto skip_bitmap; + } + +- pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); ++ pull_task(busiest, tmp, this_rq, this_cpu); + pulled++; + rem_load_move -= tmp->load_weight; + +@@ -2250,6 +2383,13 @@ skip_queue: + this_best_prio = idx; + if (curr != head) + goto skip_queue; ++ if (idx == MAX_PRIO) { ++ if (array == busiest->expired) { ++ array = busiest->active; ++ goto new_array; ++ } ++ goto out; ++ } + idx++; + goto skip_bitmap; + } +@@ -3013,11 +3153,36 @@ EXPORT_PER_CPU_SYMBOL(kstat); + /* + * This is called on clock ticks and on context switches. + * Bank in p->sched_time the ns elapsed since the last tick or switch. ++ * CPU scheduler quota accounting is also performed here in microseconds. ++ * The value returned from sched_clock() occasionally gives bogus values so ++ * some sanity checking is required. + */ +-static inline void +-update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now) ++static void ++update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now, ++ int tick) + { +- p->sched_time += now - p->last_ran; ++ long time_diff = now - p->last_ran; ++ ++ if (tick) { ++ /* ++ * Called from scheduler_tick() there should be less than two ++ * jiffies worth, and not negative/overflow. ++ */ ++ if (time_diff > JIFFIES_TO_NS(2) || time_diff < 0) ++ time_diff = JIFFIES_TO_NS(1); ++ } else { ++ /* ++ * Called from context_switch there should be less than one ++ * jiffy worth, and not negative/overflow. There should be ++ * some time banked here so use a nominal 1us. 
++ */ ++ if (time_diff > JIFFIES_TO_NS(1) || time_diff < 1) ++ time_diff = 1000; ++ } ++ /* time_slice accounting is done in usecs to avoid overflow on 32bit */ ++ if (p != rq->idle && p->policy != SCHED_FIFO) ++ p->time_slice -= time_diff / 1000; ++ p->sched_time += time_diff; + p->last_ran = rq->most_recent_timestamp = now; + } + +@@ -3038,27 +3203,6 @@ unsigned long long current_sched_time(co + } + + /* +- * We place interactive tasks back into the active array, if possible. +- * +- * To guarantee that this does not starve expired tasks we ignore the +- * interactivity of a task if the first expired task had to wait more +- * than a 'reasonable' amount of time. This deadline timeout is +- * load-dependent, as the frequency of array switched decreases with +- * increasing number of running tasks. We also ignore the interactivity +- * if a better static_prio task has expired: +- */ +-static inline int expired_starving(struct rq *rq) +-{ +- if (rq->curr->static_prio > rq->best_expired_prio) +- return 1; +- if (!STARVATION_LIMIT || !rq->expired_timestamp) +- return 0; +- if (jiffies - rq->expired_timestamp > STARVATION_LIMIT * rq->nr_running) +- return 1; +- return 0; +-} +- +-/* + * Account user cpu time to a process. + * @p: the process that the cpu time gets accounted to + * @hardirq_offset: the offset to subtract from hardirq_count() +@@ -3073,7 +3217,7 @@ void account_user_time(struct task_struc + + /* Add user time to cpustat. 
*/ + tmp = cputime_to_cputime64(cputime); +- if (TASK_NICE(p) > 0) ++ if (TASK_NICE(p) > 0 || idleprio_task(p)) + cpustat->nice = cputime64_add(cpustat->nice, tmp); + else + cpustat->user = cputime64_add(cpustat->user, tmp); +@@ -3131,87 +3275,94 @@ void account_steal_time(struct task_stru + cpustat->steal = cputime64_add(cpustat->steal, tmp); + } + +-static void task_running_tick(struct rq *rq, struct task_struct *p) ++/* ++ * The task has used up its quota of running in this prio_level so it must be ++ * dropped a priority level, all managed by recalc_task_prio(). ++ */ ++static void task_expired_entitlement(struct rq *rq, struct task_struct *p) + { +- if (p->array != rq->active) { +- /* Task has expired but was not scheduled yet */ +- set_tsk_need_resched(p); ++ int overrun; ++ ++ reset_first_time_slice(p); ++ if (rt_task(p)) { ++ p->time_slice += p->quota; ++ list_move_tail(&p->run_list, p->array->queue + p->prio); + return; + } +- spin_lock(&rq->lock); ++ overrun = p->time_slice; ++ dequeue_task(p, rq); ++ enqueue_task(p, rq); + /* +- * The task was running during this tick - update the +- * time slice counter. Note: we do not update a thread's +- * priority until it either goes to sleep or uses up its +- * timeslice. This makes it possible for interactive tasks +- * to use up their timeslices at their highest priority levels. ++ * Subtract any extra time this task ran over its time_slice; ie ++ * overrun will either be 0 or negative. + */ +- if (rt_task(p)) { +- /* +- * RR tasks need a special form of timeslice management. +- * FIFO tasks have no timeslices. 
+- */ +- if ((p->policy == SCHED_RR) && !--p->time_slice) { +- p->time_slice = task_timeslice(p); +- p->first_time_slice = 0; +- set_tsk_need_resched(p); ++ p->time_slice += overrun; ++} + +- /* put it at the end of the queue: */ +- requeue_task(p, rq->active); +- } +- goto out_unlock; ++/* ++ * Test if SCHED_ISO tasks have run longer than their allotted period as RT ++ * tasks and set the refractory flag if necessary. There is 10% hysteresis ++ * for unsetting the flag. ++ */ ++static unsigned int test_ret_isorefractory(struct rq *rq) ++{ ++ if (likely(!rq->iso_refractory)) { ++ if (rq->iso_ticks / ISO_PERIOD > sched_iso_cpu) ++ rq->iso_refractory = 1; ++ } else { ++ if (rq->iso_ticks / ISO_PERIOD < (sched_iso_cpu * 90 / 100)) ++ rq->iso_refractory = 0; + } +- if (!--p->time_slice) { +- dequeue_task(p, rq->active); +- set_tsk_need_resched(p); +- p->prio = effective_prio(p); +- p->time_slice = task_timeslice(p); +- p->first_time_slice = 0; ++ return rq->iso_refractory; ++} + +- if (!rq->expired_timestamp) +- rq->expired_timestamp = jiffies; +- if (!TASK_INTERACTIVE(p) || expired_starving(rq)) { +- enqueue_task(p, rq->expired); +- if (p->static_prio < rq->best_expired_prio) +- rq->best_expired_prio = p->static_prio; +- } else +- enqueue_task(p, rq->active); +- } else { +- /* +- * Prevent a too long timeslice allowing a task to monopolize +- * the CPU. We do this by splitting up the timeslice into +- * smaller pieces. +- * +- * Note: this does not mean the task's timeslices expire or +- * get lost in any way, they just might be preempted by +- * another task of equal priority. (one with higher +- * priority would have preempted this task already.) We +- * requeue this task to the end of the list on this priority +- * level, which is in essence a round-robin of tasks with +- * equal priority. +- * +- * This only applies to tasks in the interactive +- * delta range with at least TIMESLICE_GRANULARITY to requeue.
+- */ +- if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - +- p->time_slice) % TIMESLICE_GRANULARITY(p)) && +- (p->time_slice >= TIMESLICE_GRANULARITY(p)) && +- (p->array == rq->active)) { ++/* No SCHED_ISO task was running so decrease rq->iso_ticks */ ++static inline void no_iso_tick(struct rq *rq) ++{ ++ rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - 1) / ISO_PERIOD; ++} + +- requeue_task(p, rq->active); +- set_tsk_need_resched(p); +- } ++/* This manages tasks that have run out of timeslice during a scheduler_tick */ ++static void task_running_tick(struct rq *rq, struct task_struct *p) ++{ ++ /* ++ * If a SCHED_ISO task is running we increment the iso_ticks. In ++ * order to prevent SCHED_ISO tasks from causing starvation in the ++ * presence of true RT tasks we account those as iso_ticks as well. ++ */ ++ if ((rt_task(p) || (iso_task(p) && !rq->iso_refractory))) { ++ if (rq->iso_ticks <= (ISO_PERIOD * 100) - 100) ++ rq->iso_ticks += 100; ++ } else ++ no_iso_tick(rq); ++ ++ if (iso_task(p)) { ++ if (unlikely(test_ret_isorefractory(rq))) { ++ if (isoprio_suitable(p)) { ++ /* ++ * SCHED_ISO task is running as RT and limit ++ * has been hit. Set the PF_ISOREF flag and ++ * force it to reschedule as SCHED_NORMAL ++ * by zeroing its time_slice ++ */ ++ p->flags |= PF_ISOREF; ++ p->time_slice = 0; ++ } ++ } else ++ p->flags &= ~PF_ISOREF; + } +-out_unlock: +- spin_unlock(&rq->lock); ++ /* SCHED_FIFO tasks never run out of timeslice. */ ++ if (p->time_slice > 0 || p->policy == SCHED_FIFO) ++ return; ++ /* p->time_slice <= 0 */ ++ set_tsk_need_resched(p); ++ if (likely(task_queued(p))) ++ task_expired_entitlement(rq, p); + } + + /* + * This function gets called by the timer code, with HZ frequency. + * We call it with interrupts disabled. +- * +- * It also gets called by the fork code, when changing the parent's +- * timeslices. 
+ */ + void scheduler_tick(void) + { +@@ -3220,10 +3371,14 @@ void scheduler_tick(void) + int cpu = smp_processor_id(); + struct rq *rq = cpu_rq(cpu); + +- update_cpu_clock(p, rq, now); ++ update_cpu_clock(p, rq, now, 1); + ++ spin_lock(&rq->lock); + if (p != rq->idle) + task_running_tick(rq, p); ++ else ++ no_iso_tick(rq); ++ spin_unlock(&rq->lock); + #ifdef CONFIG_SMP + update_load(rq); + if (time_after_eq(jiffies, rq->next_balance)) +@@ -3269,10 +3424,80 @@ EXPORT_SYMBOL(sub_preempt_count); + + #endif + +-static inline int interactive_sleep(enum sleep_type sleep_type) ++static void reset_prio_levels(struct rq *rq) ++{ ++ rq->active->best_static_prio = MAX_PRIO - 1; ++ rq->expired->best_static_prio = MAX_PRIO - 1; ++ memset(rq->prio_level, 0, sizeof(int) * PRIO_RANGE); ++} ++ ++/* ++ * The only tasks running are SCHED_IDLEPRIO. Make the idleprio array the ++ * active array if it isn't already active. ++ */ ++static struct task_struct *next_idleprio_task(struct rq *rq) + { +- return (sleep_type == SLEEP_INTERACTIVE || +- sleep_type == SLEEP_INTERRUPTED); ++ struct prio_array *array = rq->active; ++ struct list_head *queue; ++ ++ if (array != rq->idleprio) { ++ rq->active = rq->idleprio; ++ rq->expired = array; ++ array = rq->active; ++ rq->exp_bitmap = rq->expired->prio_bitmap; ++ rq->dyn_bitmap = rq->active->prio_bitmap; ++ } ++ rq->prio_rotation++; ++ reset_prio_levels(rq); ++ queue = array->queue + MAX_PRIO; ++ return list_entry(queue->next, struct task_struct, run_list); ++} ++ ++/* ++ * next_dynamic_task finds the next suitable dynamic task. ++ */ ++static inline struct task_struct *next_dynamic_task(struct rq *rq, int idx) ++{ ++ struct prio_array *array = rq->active; ++ struct task_struct *next; ++ struct list_head *queue; ++ int nstatic; ++ ++retry: ++ if (unlikely(rq->nr_running == rq->nr_idleprio)) ++ return next_idleprio_task(rq); ++ if (idx >= MAX_PRIO) { ++ /* There are no more tasks in the active array.
Swap arrays */ ++ array = rq->expired; ++ rq->expired = rq->active; ++ rq->active = array; ++ rq->exp_bitmap = rq->expired->prio_bitmap; ++ rq->dyn_bitmap = rq->active->prio_bitmap; ++ rq->prio_rotation++; ++ idx = find_next_bit(rq->dyn_bitmap, MAX_PRIO, MAX_RT_PRIO); ++ reset_prio_levels(rq); ++ } ++ queue = array->queue + idx; ++ next = list_entry(queue->next, struct task_struct, run_list); ++ if (unlikely(next->time_slice <= 0 && !(iso_task(next) && ++ isoprio_suitable(next)))) { ++ /* ++ * Unlucky enough that this task ran out of time_slice ++ * before it hit a scheduler_tick so it should have its ++ * priority reassessed and choose another task (possibly ++ * the same one) ++ */ ++ task_expired_entitlement(rq, next); ++ idx = find_next_bit(rq->dyn_bitmap, MAX_PRIO, MAX_RT_PRIO); ++ goto retry; ++ } ++ next->rotation = rq->prio_rotation; ++ nstatic = next->static_prio; ++ if (nstatic < array->best_static_prio) ++ array->best_static_prio = nstatic; ++ if (idx > rq->prio_level[USER_PRIO(nstatic)]) ++ rq->prio_level[USER_PRIO(nstatic)] = idx; ++ return next; + } + + /* +@@ -3281,13 +3506,11 @@ static inline int interactive_sleep(enum + asmlinkage void __sched schedule(void) + { + struct task_struct *prev, *next; +- struct prio_array *array; + struct list_head *queue; + unsigned long long now; +- unsigned long run_time; +- int cpu, idx, new_prio; + long *switch_count; + struct rq *rq; ++ int cpu, idx; + + /* + * Test if we are atomic. Since do_exit() needs to call into +@@ -3323,18 +3546,6 @@ need_resched_nonpreemptible: + + schedstat_inc(rq, sched_cnt); + now = sched_clock(); +- if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) { +- run_time = now - prev->timestamp; +- if (unlikely((long long)(now - prev->timestamp) < 0)) +- run_time = 0; +- } else +- run_time = NS_MAX_SLEEP_AVG; +- +- /* +- * Tasks charged proportionately less run_time at high sleep_avg to +- * delay them losing their interactive status +- */ +- run_time /= (CURRENT_BONUS(prev) ? 
: 1); + + spin_lock_irq(&rq->lock); + +@@ -3345,8 +3556,10 @@ need_resched_nonpreemptible: + unlikely(signal_pending(prev)))) + prev->state = TASK_RUNNING; + else { +- if (prev->state == TASK_UNINTERRUPTIBLE) ++ if (prev->state == TASK_UNINTERRUPTIBLE) { ++ prev->flags |= PF_NONSLEEP; + rq->nr_uninterruptible++; ++ } + deactivate_task(prev, rq); + } + } +@@ -3356,59 +3569,29 @@ need_resched_nonpreemptible: + idle_balance(cpu, rq); + if (!rq->nr_running) { + next = rq->idle; +- rq->expired_timestamp = 0; + goto switch_tasks; + } + } + +- array = rq->active; +- if (unlikely(!array->nr_active)) { +- /* +- * Switch the active and expired arrays. +- */ +- schedstat_inc(rq, sched_switch); +- rq->active = rq->expired; +- rq->expired = array; +- array = rq->active; +- rq->expired_timestamp = 0; +- rq->best_expired_prio = MAX_PRIO; ++ idx = sched_find_first_bit(rq->dyn_bitmap); ++ if (likely(idx > ISO_PRIO)) ++ next = next_dynamic_task(rq, idx); ++ else { ++ queue = rq->active->queue + idx; ++ next = list_entry(queue->next, struct task_struct, run_list); + } +- +- idx = sched_find_first_bit(array->bitmap); +- queue = array->queue + idx; +- next = list_entry(queue->next, struct task_struct, run_list); +- +- if (!rt_task(next) && interactive_sleep(next->sleep_type)) { +- unsigned long long delta = now - next->timestamp; +- if (unlikely((long long)(now - next->timestamp) < 0)) +- delta = 0; +- +- if (next->sleep_type == SLEEP_INTERACTIVE) +- delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; +- +- array = next->array; +- new_prio = recalc_task_prio(next, next->timestamp + delta); +- +- if (unlikely(next->prio != new_prio)) { +- dequeue_task(next, array); +- next->prio = new_prio; +- enqueue_task(next, array); +- } +- } +- next->sleep_type = SLEEP_NORMAL; + switch_tasks: +- if (next == rq->idle) ++ if (next == rq->idle) { ++ reset_prio_levels(rq); ++ rq->prio_rotation++; + schedstat_inc(rq, sched_goidle); ++ } + prefetch(next); + prefetch_stack(next); + 
clear_tsk_need_resched(prev); + rcu_qsctr_inc(task_cpu(prev)); + +- update_cpu_clock(prev, rq, now); +- +- prev->sleep_avg -= run_time; +- if ((long)prev->sleep_avg <= 0) +- prev->sleep_avg = 0; ++ update_cpu_clock(prev, rq, now, 0); + prev->timestamp = prev->last_ran = now; + + sched_info_switch(prev, next); +@@ -3844,29 +4027,22 @@ EXPORT_SYMBOL(sleep_on_timeout); + */ + void rt_mutex_setprio(struct task_struct *p, int prio) + { +- struct prio_array *array; + unsigned long flags; ++ int queued, oldprio; + struct rq *rq; +- int oldprio; + + BUG_ON(prio < 0 || prio > MAX_PRIO); + + rq = task_rq_lock(p, &flags); + + oldprio = p->prio; +- array = p->array; +- if (array) +- dequeue_task(p, array); ++ queued = task_queued(p); ++ if (queued) ++ dequeue_task(p, rq); + p->prio = prio; + +- if (array) { +- /* +- * If changing to an RT priority then queue it +- * in the active array! +- */ +- if (rt_task(p)) +- array = rq->active; +- enqueue_task(p, array); ++ if (queued) { ++ enqueue_task(p, rq); + /* + * Reschedule if we are currently running on this runqueue and + * our priority decreased, or if we are not currently running on +@@ -3875,8 +4051,8 @@ void rt_mutex_setprio(struct task_struct + if (task_running(rq, p)) { + if (p->prio > oldprio) + resched_task(rq->curr); +- } else if (TASK_PREEMPTS_CURR(p, rq)) +- resched_task(rq->curr); ++ } else ++ try_preempt(p, rq); + } + task_rq_unlock(rq, &flags); + } +@@ -3885,8 +4061,7 @@ void rt_mutex_setprio(struct task_struct + + void set_user_nice(struct task_struct *p, long nice) + { +- struct prio_array *array; +- int old_prio, delta; ++ int queued, old_prio,delta; + unsigned long flags; + struct rq *rq; + +@@ -3907,26 +4082,27 @@ void set_user_nice(struct task_struct *p + p->static_prio = NICE_TO_PRIO(nice); + goto out_unlock; + } +- array = p->array; +- if (array) { +- dequeue_task(p, array); ++ queued = task_queued(p); ++ if (queued) { ++ dequeue_task(p, rq); + dec_raw_weighted_load(rq, p); + } + + p->static_prio = 
NICE_TO_PRIO(nice); +- set_load_weight(p); + old_prio = p->prio; + p->prio = effective_prio(p); ++ set_quota(p); + delta = p->prio - old_prio; + +- if (array) { +- enqueue_task(p, array); ++ if (queued) { ++ enqueue_task(p, rq); + inc_raw_weighted_load(rq, p); + /* + * If the task increased its priority or is running and + * lowered its priority, then reschedule its CPU: + */ +- if (delta < 0 || (delta > 0 && task_running(rq, p))) ++ if (delta < 0 || ((delta > 0 || idleprio_task(p)) && ++ task_running(rq, p))) + resched_task(rq->curr); + } + out_unlock: +@@ -3996,7 +4172,7 @@ asmlinkage long sys_nice(int increment) + * + * This is the priority value as seen by users in /proc. + * RT tasks are offset by -200. Normal tasks are centered +- * around 0, value goes from -16 to +15. ++ * around 0, value goes from 0 to +39. + */ + int task_prio(const struct task_struct *p) + { +@@ -4043,19 +4219,14 @@ static inline struct task_struct *find_p + /* Actually do priority change: must hold rq lock. */ + static void __setscheduler(struct task_struct *p, int policy, int prio) + { +- BUG_ON(p->array); ++ BUG_ON(task_queued(p)); + + p->policy = policy; + p->rt_priority = prio; + p->normal_prio = normal_prio(p); + /* we are holding p->pi_lock already */ + p->prio = rt_mutex_getprio(p); +- /* +- * SCHED_BATCH tasks are treated as perpetual CPU hogs: +- */ +- if (policy == SCHED_BATCH) +- p->sleep_avg = 0; +- set_load_weight(p); ++ set_quota(p); + } + + /** +@@ -4069,19 +4240,27 @@ static void __setscheduler(struct task_s + int sched_setscheduler(struct task_struct *p, int policy, + struct sched_param *param) + { +- int retval, oldprio, oldpolicy = -1; +- struct prio_array *array; ++ struct sched_param zero_param = { .sched_priority = 0 }; ++ int queued, retval, oldprio, oldpolicy = -1; + unsigned long flags; + struct rq *rq; + + /* may grab non-irq protected spin_locks */ + BUG_ON(in_interrupt()); ++ if (is_rt_policy(policy) && !capable(CAP_SYS_NICE)) { ++ /* ++ * If the caller 
requested an RT policy without having the ++ * necessary rights, we downgrade the policy to SCHED_ISO. ++ * We also set the parameter to zero to pass the checks. ++ */ ++ policy = SCHED_ISO; ++ param = &zero_param; ++ } + recheck: + /* double check policy once rq lock held */ + if (policy < 0) + policy = oldpolicy = p->policy; +- else if (policy != SCHED_FIFO && policy != SCHED_RR && +- policy != SCHED_NORMAL && policy != SCHED_BATCH) ++ else if (!SCHED_RANGE(policy)) + return -EINVAL; + /* + * Valid priorities for SCHED_FIFO and SCHED_RR are +@@ -4116,6 +4295,31 @@ recheck: + if (param->sched_priority > p->rt_priority && + param->sched_priority > rlim_rtprio) + return -EPERM; ++ } else { ++ switch (p->policy) { ++ /* ++ * Can only downgrade policies but not back to ++ * SCHED_NORMAL ++ */ ++ case SCHED_ISO: ++ if (policy == SCHED_ISO) ++ goto out; ++ if (policy == SCHED_NORMAL) ++ return -EPERM; ++ break; ++ case SCHED_BATCH: ++ if (policy == SCHED_BATCH) ++ goto out; ++ if (policy != SCHED_IDLEPRIO) ++ return -EPERM; ++ break; ++ case SCHED_IDLEPRIO: ++ if (policy == SCHED_IDLEPRIO) ++ goto out; ++ return -EPERM; ++ default: ++ break; ++ } + } + + /* can't change other user's priorities */ +@@ -4124,6 +4328,11 @@ recheck: + return -EPERM; + } + ++ if (!(p->mm) && policy == SCHED_IDLEPRIO) { ++ /* Don't allow kernel threads to be SCHED_IDLEPRIO. 
*/ ++ return -EINVAL; ++ } ++ + retval = security_task_setscheduler(p, policy, param); + if (retval) + return retval; +@@ -4144,12 +4353,12 @@ recheck: + spin_unlock_irqrestore(&p->pi_lock, flags); + goto recheck; + } +- array = p->array; +- if (array) ++ queued = task_queued(p); ++ if (queued) + deactivate_task(p, rq); + oldprio = p->prio; + __setscheduler(p, policy, param->sched_priority); +- if (array) { ++ if (queued) { + __activate_task(p, rq); + /* + * Reschedule if we are currently running on this runqueue and +@@ -4159,14 +4368,15 @@ recheck: + if (task_running(rq, p)) { + if (p->prio > oldprio) + resched_task(rq->curr); +- } else if (TASK_PREEMPTS_CURR(p, rq)) +- resched_task(rq->curr); ++ } else ++ try_preempt(p, rq); + } + __task_rq_unlock(rq); + spin_unlock_irqrestore(&p->pi_lock, flags); + + rt_mutex_adjust_pi(p); + ++out: + return 0; + } + EXPORT_SYMBOL_GPL(sched_setscheduler); +@@ -4433,41 +4643,34 @@ asmlinkage long sys_sched_getaffinity(pi + * sys_sched_yield - yield the current processor to other threads. + * + * This function yields the current CPU by moving the calling thread +- * to the expired array. If there are no other threads running on this +- * CPU then this function will return. ++ * to the expired array if SCHED_NORMAL or the end of its current priority ++ * queue if a realtime task. If there are no other threads running on this ++ * cpu this function will return. + */ + asmlinkage long sys_sched_yield(void) + { + struct rq *rq = this_rq_lock(); +- struct prio_array *array = current->array, *target = rq->expired; ++ struct task_struct *p = current; + + schedstat_inc(rq, yld_cnt); +- /* +- * We implement yielding by moving the task into the expired +- * queue. +- * +- * (special rule: RT tasks will just roundrobin in the active +- * array.) 
+- */ +- if (rt_task(current)) +- target = rq->active; +- +- if (array->nr_active == 1) { +- schedstat_inc(rq, yld_act_empty); +- if (!rq->expired->nr_active) +- schedstat_inc(rq, yld_both_empty); +- } else if (!rq->expired->nr_active) +- schedstat_inc(rq, yld_exp_empty); +- +- if (array != target) { +- dequeue_task(current, array); +- enqueue_task(current, target); +- } else +- /* +- * requeue_task is cheaper so perform that if possible. +- */ +- requeue_task(current, array); ++ if (rq->nr_running == 1) ++ schedstat_inc(rq, yld_both_empty); ++ else { ++ struct prio_array *old_array = p->array; ++ int old_prio = p->prio; ++ ++ if (idleprio_task(p)) { ++ dequeue_task(p, rq); ++ enqueue_task(p, rq); ++ goto out_release; ++ } ++ /* p->prio will be updated in requeue_task via queue_expired */ ++ if (!rt_task(p)) ++ p->array = rq->expired; ++ requeue_task(p, rq, old_array, old_prio); ++ } + ++out_release: + /* + * Since we are going to call schedule() anyway, there's + * no need to preempt or enable interrupts: +@@ -4619,6 +4822,8 @@ asmlinkage long sys_sched_get_priority_m + break; + case SCHED_NORMAL: + case SCHED_BATCH: ++ case SCHED_ISO: ++ case SCHED_IDLEPRIO: + ret = 0; + break; + } +@@ -4643,6 +4848,8 @@ asmlinkage long sys_sched_get_priority_m + break; + case SCHED_NORMAL: + case SCHED_BATCH: ++ case SCHED_ISO: ++ case SCHED_IDLEPRIO: + ret = 0; + } + return ret; +@@ -4676,8 +4883,8 @@ long sys_sched_rr_get_interval(pid_t pid + if (retval) + goto out_unlock; + +- jiffies_to_timespec(p->policy == SCHED_FIFO ? +- 0 : task_timeslice(p), &t); ++ t = ns_to_timespec(p->policy == SCHED_FIFO ? 0 : ++ MS_TO_NS(task_timeslice(p))); + read_unlock(&tasklist_lock); + retval = copy_to_user(interval, &t, sizeof(t)) ? 
-EFAULT : 0; + out_nounlock: +@@ -4771,10 +4978,10 @@ void __cpuinit init_idle(struct task_str + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + +- idle->timestamp = sched_clock(); +- idle->sleep_avg = 0; +- idle->array = NULL; +- idle->prio = idle->normal_prio = MAX_PRIO; ++ bitmap_zero(idle->bitmap, PRIO_RANGE); ++ idle->timestamp = idle->last_ran = sched_clock(); ++ idle->array = rq->active; ++ idle->prio = idle->normal_prio = NICE_TO_PRIO(0); + idle->state = TASK_RUNNING; + idle->cpus_allowed = cpumask_of_cpu(cpu); + set_task_cpu(idle, cpu); +@@ -4893,7 +5100,7 @@ static int __migrate_task(struct task_st + goto out; + + set_task_cpu(p, dest_cpu); +- if (p->array) { ++ if (task_queued(p)) { + /* + * Sync timestamp with rq_dest's before activating. + * The same thing could be achieved by doing this step +@@ -4904,8 +5111,7 @@ static int __migrate_task(struct task_st + + rq_dest->most_recent_timestamp; + deactivate_task(p, rq_src); + __activate_task(p, rq_dest); +- if (TASK_PREEMPTS_CURR(p, rq_dest)) +- resched_task(rq_dest->curr); ++ try_preempt(p, rq_dest); + } + ret = 1; + out: +@@ -5194,7 +5400,7 @@ migration_call(struct notifier_block *nf + /* Idle task back to normal (off runqueue, low prio) */ + rq = task_rq_lock(rq->idle, &flags); + deactivate_task(rq->idle, rq); +- rq->idle->static_prio = MAX_PRIO; ++ rq->idle->static_prio = NICE_TO_PRIO(0); + __setscheduler(rq->idle, SCHED_NORMAL, 0); + migrate_dead_tasks(cpu); + task_rq_unlock(rq, &flags); +@@ -6706,6 +6912,13 @@ void __init sched_init_smp(void) + /* Move init over to a non-isolated CPU */ + if (set_cpus_allowed(current, non_isolated_cpus) < 0) + BUG(); ++ ++ /* ++ * Assume that every added cpu gives us slightly less overall latency ++ * allowing us to increase the base rr_interval, but in a non linear ++ * fashion. 
++ */ ++ rr_interval *= 1 + ilog2(num_online_cpus()); + } + #else + void __init sched_init_smp(void) +@@ -6727,6 +6940,16 @@ void __init sched_init(void) + { + int i, j, k; + ++ /* Generate the priority matrix */ ++ for (i = 0; i < PRIO_RANGE; i++) { ++ bitmap_fill(prio_matrix[i], PRIO_RANGE); ++ j = PRIO_RANGE * PRIO_RANGE / (PRIO_RANGE - i); ++ for (k = 0; k <= PRIO_RANGE * (PRIO_RANGE - 1); k += j) { ++ __clear_bit(PRIO_RANGE - 1 - (k / PRIO_RANGE), ++ prio_matrix[i]); ++ } ++ } ++ + for_each_possible_cpu(i) { + struct prio_array *array; + struct rq *rq; +@@ -6734,12 +6957,20 @@ void __init sched_init(void) + rq = cpu_rq(i); + spin_lock_init(&rq->lock); + lockdep_set_class(&rq->lock, &rq->rq_lock_key); ++ rq->iso_ticks = 0; + rq->nr_running = 0; ++ rq->nr_idleprio = 0; ++ rq->prio_rotation = 0; + rq->active = rq->arrays; ++ rq->idleprio = rq->active; + rq->expired = rq->arrays + 1; +- rq->best_expired_prio = MAX_PRIO; ++ reset_prio_levels(rq); ++ rq->dyn_bitmap = rq->active->prio_bitmap; ++ rq->exp_bitmap = rq->expired->prio_bitmap; + + #ifdef CONFIG_SMP ++ rq->active->rq = rq; ++ rq->expired->rq = rq; + rq->sd = NULL; + for (j = 1; j < 3; j++) + rq->cpu_load[j] = 0; +@@ -6752,16 +6983,16 @@ void __init sched_init(void) + atomic_set(&rq->nr_iowait, 0); + + for (j = 0; j < 2; j++) { ++ + array = rq->arrays + j; +- for (k = 0; k < MAX_PRIO; k++) { ++ for (k = 0; k <= MAX_PRIO; k++) + INIT_LIST_HEAD(array->queue + k); +- __clear_bit(k, array->bitmap); +- } +- // delimiter for bitsearch +- __set_bit(MAX_PRIO, array->bitmap); ++ bitmap_zero(array->prio_bitmap, MAX_PRIO); ++ /* delimiter for bitsearch */ ++ __set_bit(MAX_PRIO, array->prio_bitmap); + } +- } + ++ } + set_load_weight(&init_task); + + #ifdef CONFIG_SMP +@@ -6815,24 +7046,24 @@ EXPORT_SYMBOL(__might_sleep); + #ifdef CONFIG_MAGIC_SYSRQ + void normalize_rt_tasks(void) + { +- struct prio_array *array; + struct task_struct *p; + unsigned long flags; + struct rq *rq; ++ int queued; + + 
read_lock_irq(&tasklist_lock); + for_each_process(p) { +- if (!rt_task(p)) ++ if (!rt_task(p) && !iso_task(p)) + continue; + + spin_lock_irqsave(&p->pi_lock, flags); + rq = __task_rq_lock(p); + +- array = p->array; +- if (array) ++ queued = task_queued(p); ++ if (queued) + deactivate_task(p, task_rq(p)); + __setscheduler(p, SCHED_NORMAL, 0); +- if (array) { ++ if (queued) { + __activate_task(p, task_rq(p)); + resched_task(rq->curr); + } +Index: linux-2.6.21-ck1/Documentation/sysctl/kernel.txt +=================================================================== +--- linux-2.6.21-ck1.orig/Documentation/sysctl/kernel.txt 2007-05-04 12:10:52.000000000 +1000 ++++ linux-2.6.21-ck1/Documentation/sysctl/kernel.txt 2007-05-04 12:10:55.000000000 +1000 +@@ -25,6 +25,9 @@ show up in /proc/sys/kernel: + - domainname + - hostname + - hotplug ++- interactive ++- iso_cpu ++- iso_period + - java-appletviewer [ binfmt_java, obsolete ] + - java-interpreter [ binfmt_java, obsolete ] + - kstack_depth_to_print [ X86 only ] +@@ -43,6 +46,7 @@ show up in /proc/sys/kernel: + - printk + - real-root-dev ==> Documentation/initrd.txt + - reboot-cmd [ SPARC only ] ++- rr_interval + - rtsig-max + - rtsig-nr + - sem +@@ -164,6 +168,40 @@ Default value is "/sbin/hotplug". + + ============================================================== + ++interactive: ++ ++The staircase-deadline cpu scheduler can be set in either purely ++forward-looking mode for absolutely rigid fairness and cpu distribution ++according to nice level, or it can allow a small per-process history ++to smooth out cpu usage perturbations common in interactive tasks by ++enabling this sysctl. While small fairness issues can arise with this ++enabled, overall fairness is usually still strongly maintained and ++starvation is never possible. Enabling this can significantly smooth ++out 3d graphics and games. ++ ++Default value is 1 (enabled). 
++ ++============================================================== ++ ++iso_cpu: ++ ++This sets the percentage cpu that the unprivileged SCHED_ISO tasks can ++run effectively at realtime priority, averaged over a rolling iso_period ++seconds. ++ ++Set to 80 (percent) by default. ++ ++============================================================== ++ ++iso_period: ++ ++This sets the number of seconds over which SCHED_ISO cpu usage is averaged ++to see if it exceeds its allocated cpu bandwidth. ++ ++Set to 5 (seconds) by default. ++ ++============================================================== ++ + l2cr: (PPC only) + + This flag controls the L2 cache of G3 processor boards. If +@@ -288,6 +326,19 @@ rebooting. ??? + + ============================================================== + ++rr_interval: ++ ++This is the smallest duration that any cpu process scheduling unit ++will run for. Increasing this value can increase throughput of cpu ++bound tasks substantially but at the expense of increased latencies ++overall. This value is in milliseconds and the default value chosen ++depends on the number of cpus available at scheduler initialisation ++with a minimum of 8. ++ ++Valid values are from 1-5000. 
++ ++============================================================== ++ + rtsig-max & rtsig-nr: + + The file rtsig-max can be used to tune the maximum number +Index: linux-2.6.21-ck1/kernel/sysctl.c +=================================================================== +--- linux-2.6.21-ck1.orig/kernel/sysctl.c 2007-05-04 12:10:52.000000000 +1000 ++++ linux-2.6.21-ck1/kernel/sysctl.c 2007-05-04 12:24:21.000000000 +1000 +@@ -22,6 +22,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -70,12 +71,17 @@ extern int suid_dumpable; + extern char core_pattern[]; + extern int pid_max; + extern int min_free_kbytes; ++extern int vm_tail_largefiles; + extern int printk_ratelimit_jiffies; + extern int printk_ratelimit_burst; + extern int pid_max_min, pid_max_max; + extern int sysctl_drop_caches; + extern int percpu_pagelist_fraction; + extern int compat_log; ++extern int rr_interval; ++extern int sched_interactive; ++extern int sched_iso_cpu; ++extern int sched_iso_period; + + /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ + static int maxolduid = 65535; +@@ -159,6 +165,14 @@ int sysctl_legacy_va_layout; + #endif + + ++/* Constants for minimum and maximum testing. ++ We use these as one-element integer vectors. 
*/ ++static int __read_mostly zero; ++static int __read_mostly one = 1; ++static int __read_mostly one_hundred = 100; ++static int __read_mostly five_thousand = 5000; ++ ++ + /* The default sysctl tables: */ + + static ctl_table root_table[] = { +@@ -499,6 +513,47 @@ static ctl_table kern_table[] = { + .mode = 0444, + .proc_handler = &proc_dointvec, + }, ++ { ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = "rr_interval", ++ .data = &rr_interval, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .strategy = &sysctl_intvec, ++ .extra1 = &one, ++ .extra2 = &five_thousand, ++ }, ++ { ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = "interactive", ++ .data = &sched_interactive, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, ++ { ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = "iso_cpu", ++ .data = &sched_iso_cpu, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .strategy = &sysctl_intvec, ++ .extra1 = &zero, ++ .extra2 = &one_hundred, ++ }, ++ { ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = "iso_period", ++ .data = &sched_iso_period, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .strategy = &sysctl_intvec, ++ .extra1 = &one, ++ .extra2 = &one_hundred, ++ }, + #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) + { + .ctl_name = KERN_UNKNOWN_NMI_PANIC, +@@ -607,12 +662,6 @@ static ctl_table kern_table[] = { + { .ctl_name = 0 } + }; + +-/* Constants for minimum and maximum testing in vm_table. +- We use these as one-element integer vectors. 
*/ +-static int zero; +-static int one_hundred = 100; +- +- + static ctl_table vm_table[] = { + { + .ctl_name = VM_OVERCOMMIT_MEMORY, +@@ -693,16 +742,32 @@ static ctl_table vm_table[] = { + .proc_handler = &proc_dointvec, + }, + { +- .ctl_name = VM_SWAPPINESS, +- .procname = "swappiness", +- .data = &vm_swappiness, +- .maxlen = sizeof(vm_swappiness), ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = "mapped", ++ .data = &vm_mapped, ++ .maxlen = sizeof(vm_mapped), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &one_hundred, + }, ++ { ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = "hardmaplimit", ++ .data = &vm_hardmaplimit, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, ++ { ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = "tail_largefiles", ++ .data = &vm_tail_largefiles, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, + #ifdef CONFIG_HUGETLB_PAGE + { + .ctl_name = VM_HUGETLB_PAGES, +@@ -859,6 +924,16 @@ static ctl_table vm_table[] = { + .extra1 = &zero, + }, + #endif ++#ifdef CONFIG_SWAP_PREFETCH ++ { ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = "swap_prefetch", ++ .data = &swap_prefetch, ++ .maxlen = sizeof(swap_prefetch), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, ++#endif + { .ctl_name = 0 } + }; + +Index: linux-2.6.21-ck1/fs/pipe.c +=================================================================== +--- linux-2.6.21-ck1.orig/fs/pipe.c 2007-05-04 12:10:52.000000000 +1000 ++++ linux-2.6.21-ck1/fs/pipe.c 2007-05-04 12:10:54.000000000 +1000 +@@ -41,12 +41,7 @@ void pipe_wait(struct pipe_inode_info *p + { + DEFINE_WAIT(wait); + +- /* +- * Pipes are system-local resources, so sleeping on them +- * is considered a noninteractive wait: +- */ +- prepare_to_wait(&pipe->wait, &wait, +- TASK_INTERRUPTIBLE | TASK_NONINTERACTIVE); ++ prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE); + if (pipe->inode) + 
mutex_unlock(&pipe->inode->i_mutex); + schedule(); +Index: linux-2.6.21-ck1/Documentation/sched-design.txt +=================================================================== +--- linux-2.6.21-ck1.orig/Documentation/sched-design.txt 2007-05-04 12:10:52.000000000 +1000 ++++ linux-2.6.21-ck1/Documentation/sched-design.txt 2007-05-04 12:10:54.000000000 +1000 +@@ -1,11 +1,14 @@ +- Goals, Design and Implementation of the +- new ultra-scalable O(1) scheduler ++ Goals, Design and Implementation of the ultra-scalable O(1) scheduler by ++ Ingo Molnar and the Staircase Deadline cpu scheduler policy designed by ++ Con Kolivas. + + +- This is an edited version of an email Ingo Molnar sent to +- lkml on 4 Jan 2002. It describes the goals, design, and +- implementation of Ingo's new ultra-scalable O(1) scheduler. +- Last Updated: 18 April 2002. ++ This was originally an edited version of an email Ingo Molnar sent to ++ lkml on 4 Jan 2002. It describes the goals, design, and implementation ++ of Ingo's ultra-scalable O(1) scheduler. It now contains a description ++ of the Staircase Deadline priority scheduler that was built on this ++ design. ++ Last Updated: Fri, 4 May 2007 + + + Goal +@@ -163,3 +166,222 @@ certain code paths and data constructs. + code is smaller than the old one. + + Ingo ++ ++ ++Staircase Deadline cpu scheduler policy ++================================================ ++ ++Design summary ++============== ++ ++A novel design which incorporates a foreground-background descending priority ++system (the staircase) via a bandwidth allocation matrix according to nice ++level. ++ ++ ++Features ++======== ++ ++A starvation free, strict fairness O(1) scalable design with interactivity ++as good as the above restrictions can provide. There is no interactivity ++estimator, no sleep/run measurements and only simple fixed accounting. 
++The design and its accounting are strict enough that task behaviour ++can be modelled and maximum scheduling latencies can be predicted by ++the virtual deadline mechanism that manages runqueues. The prime concern ++in this design is to maintain fairness at all costs determined by nice level, ++yet to maintain as good interactivity as can be allowed within the ++constraints of strict fairness. ++ ++ ++Design description ++================== ++ ++SD works off the principle of providing each task a quota of runtime that it is ++allowed to run at a number of priority levels determined by its static priority ++(ie. its nice level). If the task uses up its quota it has its priority ++decremented to the next level determined by a priority matrix. Once every ++runtime quota has been consumed of every priority level, a task is queued on the ++"expired" array. When no other tasks exist with quota, the expired array is ++activated and fresh quotas are handed out. This is all done in O(1). ++ ++Design details ++============== ++ ++Each task keeps a record of its own entitlement of cpu time. Most of the rest of ++these details apply to non-realtime tasks as rt task management is straight ++forward. ++ ++Each runqueue keeps a record of what major epoch it is up to in the ++rq->prio_rotation field which is incremented on each major epoch. It also ++keeps a record of the current prio_level for each static priority task. ++ ++Each task keeps a record of what major runqueue epoch it was last running ++on in p->rotation. It also keeps a record of what priority levels it has ++already been allocated quota from during this epoch in a bitmap p->bitmap. ++ ++The only tunable that determines all other details is the RR_INTERVAL. This ++is set to 8ms, and is scaled gently upwards with more cpus. This value is ++tunable via a /proc interface. ++ ++All tasks are initially given a quota based on RR_INTERVAL. 
This is equal to ++RR_INTERVAL between nice values of -6 and 0, half that size above nice 0, and ++progressively larger for nice values from -7 to -20. This is assigned to ++p->quota and only changes with changes in nice level. ++ ++As a task is first queued, it checks in recalc_task_prio to see if it has run at ++this runqueue's current priority rotation. If it has not, it will have its ++p->prio level set according to the first slot in a "priority matrix" and will be ++given a p->time_slice equal to the p->quota, and has its allocation bitmap bit ++set in p->bitmap for this prio level. It is then queued on the current active ++priority array. ++ ++If a task has already been running during this major epoch, and it has ++p->time_slice left and the rq->prio_quota for the task's p->prio still ++has quota, it will be placed back on the active array, but no more quota ++will be added. ++ ++If a task has been running during this major epoch, but does not have ++p->time_slice left, it will find the next lowest priority in its bitmap that it ++has not been allocated quota from. It then gets a full quota in ++p->time_slice. It is then queued on the current active priority array at the ++newly determined lower priority. ++ ++If a task has been running during this major epoch, and does not have ++any entitlement left in p->bitmap and no time_slice left, it will have its ++bitmap cleared, and be queued at its best prio again, but on the expired ++priority array. ++ ++When a task is queued, it has its relevant bit set in the array->prio_bitmap. ++ ++p->time_slice is stored in nanoseconds and is updated via update_cpu_clock on ++schedule() and scheduler_tick. If p->time_slice is below zero then the ++recalc_task_prio is readjusted and the task rescheduled. 
++ ++ ++Priority Matrix ++=============== ++ ++In order to minimise the latencies between tasks of different nice levels ++running concurrently, the dynamic priority slots where different nice levels ++are queued are dithered instead of being sequential. What this means is that ++there are 40 priority slots where a task may run during one major rotation, ++and the allocation of slots is dependant on nice level. In the ++following table, a zero represents a slot where the task may run. ++ ++PRIORITY:0..................20.................39 ++nice -20 0000000000000000000000000000000000000000 ++nice -10 1000100010001000100010001000100010010000 ++nice 0 1010101010101010101010101010101010101010 ++nice 5 1011010110110101101101011011010110110110 ++nice 10 1110111011101110111011101110111011101110 ++nice 15 1111111011111110111111101111111011111110 ++nice 19 1111111111111111111111111111111111111110 ++ ++As can be seen, a nice -20 task runs in every priority slot whereas a nice 19 ++task only runs one slot per major rotation. This dithered table allows for the ++smallest possible maximum latencies between tasks of varying nice levels, thus ++allowing vastly different nice levels to be used. ++ ++SCHED_BATCH tasks are managed slightly differently, receiving only the top ++slots from its priority bitmap giving it equal cpu as SCHED_NORMAL, but ++slightly higher latencies. ++ ++ ++Modelling deadline behaviour ++============================ ++ ++As the accounting in this design is hard and not modified by sleep average ++calculations or interactivity modifiers, it is possible to accurately ++predict the maximum latency that a task may experience under different ++conditions. This is a virtual deadline mechanism enforced by mandatory ++timeslice expiration and not outside bandwidth measurement. ++ ++The maximum duration a task can run during one major epoch is determined by its ++nice value. 
Nice 0 tasks can run at 19 different priority levels for RR_INTERVAL ++duration during each epoch. Nice 10 tasks can run at 9 priority levels for each ++epoch, and so on. The table in the priority matrix above demonstrates how this ++is enforced. ++ ++Therefore the maximum duration a runqueue epoch can take is determined by ++the number of tasks running, and their nice level. After that, the maximum ++duration a task can wait before it gets scheduled is ++determined by the position of its first slot on the matrix. ++ ++In the following examples, these are _worst case scenarios_ and would rarely ++occur, but can be modelled nonetheless to determine the maximum possible ++latency. ++ ++So for example, if two nice 0 tasks are running, and one has just expired as ++another is activated for the first time receiving a full quota for this ++runqueue rotation, the first task will wait: ++ ++nr_tasks * max_duration + nice_difference * rr_interval ++1 * 19 * RR_INTERVAL + 0 = 152ms ++ ++In the presence of a nice 10 task, a nice 0 task would wait a maximum of ++1 * 10 * RR_INTERVAL + 0 = 80ms ++ ++In the presence of a nice 0 task, a nice 10 task would wait a maximum of ++1 * 19 * RR_INTERVAL + 1 * RR_INTERVAL = 160ms ++ ++More useful than these values, though, are the average latencies which are ++a matter of determining the average distance between priority slots of ++different nice values and multiplying them by the tasks' quota. For example ++in the presence of a nice -10 task, a nice 0 task will wait either one or ++two slots. Given that nice -10 tasks have a quota 2.5 times the RR_INTERVAL, ++this means the latencies will alternate between 2.5 and 5 RR_INTERVALs or ++20 and 40ms respectively (on uniprocessor at 1000HZ). ++ ++ ++Achieving interactivity ++======================= ++ ++A requirement of this scheduler design was to achieve good interactivity ++despite being a completely fair deadline based design. 
The disadvantage of ++designs that try to achieve interactivity is that they usually do so at ++the expense of maintaining fairness. As cpu speeds increase, the requirement ++for some sort of metered unfairness towards interactive tasks becomes a less ++desirable phenomenon, but low latency and fairness remain mandatory to ++good interactive performance. ++ ++This design relies on the fact that interactive tasks, by their nature, ++sleep often. Most fair scheduling designs end up penalising such tasks ++indirectly giving them less than their fair possible share because of the ++sleep, and have to use a mechanism of bonusing their priority to offset ++this based on the duration they sleep. This becomes increasingly inaccurate ++as the number of running tasks rises and more tasks spend time waiting on ++runqueues rather than sleeping, and it is impossible to tell whether the ++task that's waiting on a runqueue only intends to run for a short period and ++then sleep again after that runqueue wait. Furthermore, all such designs rely ++on a period of time to pass to accumulate some form of statistic on the task ++before deciding on how much to give them preference. The shorter this period, ++the more rapidly bursts of cpu ruin the interactive tasks' behaviour. The ++longer this period, the longer it takes for interactive tasks to get low ++scheduling latencies and fair cpu. ++ ++This design does not measure sleep time at all. Interactive tasks that sleep ++often will wake up having consumed very little if any of their quota for ++the current major priority rotation. The longer they have slept, the less ++likely they are to even be on the current major priority rotation. Once ++woken up, though, they get to use up their full quota for that epoch, ++whether part of a quota remains or a full quota. Overall, however, they ++can still only run as much cpu time for that epoch as any other task of the ++same nice level. 
This means that two tasks behaving completely differently ++from fully cpu bound to waking/sleeping extremely frequently will still ++get the same quota of cpu, but the latter will be using its quota for that ++epoch in bursts rather than continuously. This guarantees that interactive ++tasks get the same amount of cpu as cpu bound ones. ++ ++The other requirement of interactive tasks is also to obtain low latencies ++for when they are scheduled. Unlike fully cpu bound tasks and the maximum ++latencies possible described in the modelling deadline behaviour section ++above, tasks that sleep will wake up with quota available usually at the ++current runqueue's priority_level or better. This means that the most latency ++they are likely to see is one RR_INTERVAL, and often they will preempt the ++current task if it is not of a sleeping nature. This then guarantees very ++low latency for interactive tasks, and the lowest latencies for the least ++cpu bound tasks. ++ ++ ++Fri, 4 May 2007 ++Con Kolivas +Index: linux-2.6.21-ck1/kernel/softirq.c +=================================================================== +--- linux-2.6.21-ck1.orig/kernel/softirq.c 2007-05-04 12:10:52.000000000 +1000 ++++ linux-2.6.21-ck1/kernel/softirq.c 2007-05-04 12:10:54.000000000 +1000 +@@ -488,7 +488,7 @@ void __init softirq_init(void) + + static int ksoftirqd(void * __bind_cpu) + { +- set_user_nice(current, 19); ++ set_user_nice(current, 15); + current->flags |= PF_NOFREEZE; + + set_current_state(TASK_INTERRUPTIBLE); +Index: linux-2.6.21-ck1/kernel/fork.c +=================================================================== +--- linux-2.6.21-ck1.orig/kernel/fork.c 2007-05-04 12:24:01.000000000 +1000 ++++ linux-2.6.21-ck1/kernel/fork.c 2007-05-04 12:24:19.000000000 +1000 +@@ -1060,6 +1060,7 @@ static struct task_struct *copy_process( + p->io_context = NULL; + p->io_wait = NULL; + p->audit_context = NULL; ++ p->mutexes_held = 0; + cpuset_fork(p); + #ifdef CONFIG_NUMA + p->mempolicy = 
mpol_copy(p->mempolicy); +Index: linux-2.6.21-ck1/kernel/mutex.c +=================================================================== +--- linux-2.6.21-ck1.orig/kernel/mutex.c 2007-05-04 12:24:01.000000000 +1000 ++++ linux-2.6.21-ck1/kernel/mutex.c 2007-05-04 12:24:19.000000000 +1000 +@@ -60,6 +60,16 @@ EXPORT_SYMBOL(__mutex_init); + static void fastcall noinline __sched + __mutex_lock_slowpath(atomic_t *lock_count); + ++static inline void inc_mutex_count(void) ++{ ++ current->mutexes_held++; ++} ++ ++static inline void dec_mutex_count(void) ++{ ++ current->mutexes_held--; ++} ++ + /*** + * mutex_lock - acquire the mutex + * @lock: the mutex to be acquired +@@ -89,6 +99,7 @@ void inline fastcall __sched mutex_lock( + * 'unlocked' into 'locked' state. + */ + __mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath); ++ inc_mutex_count(); + } + + EXPORT_SYMBOL(mutex_lock); +@@ -114,6 +125,7 @@ void fastcall __sched mutex_unlock(struc + * into 'unlocked' state: + */ + __mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath); ++ dec_mutex_count(); + } + + EXPORT_SYMBOL(mutex_unlock); +@@ -283,9 +295,14 @@ __mutex_lock_interruptible_slowpath(atom + */ + int fastcall __sched mutex_lock_interruptible(struct mutex *lock) + { ++ int ret; ++ + might_sleep(); +- return __mutex_fastpath_lock_retval ++ ret = __mutex_fastpath_lock_retval + (&lock->count, __mutex_lock_interruptible_slowpath); ++ if (likely(!ret)) ++ inc_mutex_count(); ++ return ret; + } + + EXPORT_SYMBOL(mutex_lock_interruptible); +@@ -340,8 +357,12 @@ static inline int __mutex_trylock_slowpa + */ + int fastcall __sched mutex_trylock(struct mutex *lock) + { +- return __mutex_fastpath_trylock(&lock->count, ++ int ret = __mutex_fastpath_trylock(&lock->count, + __mutex_trylock_slowpath); ++ ++ if (likely(ret)) ++ inc_mutex_count(); ++ return ret; + } + + EXPORT_SYMBOL(mutex_trylock); +Index: linux-2.6.21-ck1/block/cfq-iosched.c +=================================================================== +--- 
linux-2.6.21-ck1.orig/block/cfq-iosched.c 2007-05-04 12:24:01.000000000 +1000 ++++ linux-2.6.21-ck1/block/cfq-iosched.c 2007-05-04 12:24:19.000000000 +1000 +@@ -1258,10 +1258,12 @@ static void cfq_init_prio_data(struct cf + printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class); + case IOPRIO_CLASS_NONE: + /* +- * no prio set, place us in the middle of the BE classes ++ * Select class and ioprio according to policy and nice + */ ++ cfqq->ioprio_class = task_policy_ioprio_class(tsk); + cfqq->ioprio = task_nice_ioprio(tsk); +- cfqq->ioprio_class = IOPRIO_CLASS_BE; ++ if (cfqq->ioprio_class == IOPRIO_CLASS_IDLE) ++ cfq_clear_cfqq_idle_window(cfqq); + break; + case IOPRIO_CLASS_RT: + cfqq->ioprio = task_ioprio(tsk); +Index: linux-2.6.21-ck1/include/linux/ioprio.h +=================================================================== +--- linux-2.6.21-ck1.orig/include/linux/ioprio.h 2007-05-04 12:24:01.000000000 +1000 ++++ linux-2.6.21-ck1/include/linux/ioprio.h 2007-05-04 12:24:19.000000000 +1000 +@@ -22,7 +22,7 @@ + * class, the default for any process. IDLE is the idle scheduling class, it + * is only served when no one else is using the disk. 
+ */ +-enum { ++enum ioprio_class { + IOPRIO_CLASS_NONE, + IOPRIO_CLASS_RT, + IOPRIO_CLASS_BE, +@@ -51,8 +51,25 @@ static inline int task_ioprio(struct tas + return IOPRIO_PRIO_DATA(task->ioprio); + } + ++static inline enum ioprio_class ++ task_policy_ioprio_class(struct task_struct *task) ++{ ++ if (rt_task(task)) ++ return IOPRIO_CLASS_RT; ++ if (idleprio_task(task)) ++ return IOPRIO_CLASS_IDLE; ++ return IOPRIO_CLASS_BE; ++} ++ + static inline int task_nice_ioprio(struct task_struct *task) + { ++ if (rt_task(task)) ++ return (MAX_RT_PRIO - task->rt_priority) * IOPRIO_BE_NR / ++ (MAX_RT_PRIO + 1); ++ if (iso_task(task)) ++ return 0; ++ if (idleprio_task(task)) ++ return IOPRIO_BE_NR - 1; + return (task_nice(task) + 20) / 5; + } + +Index: linux-2.6.21-ck1/Documentation/sysctl/vm.txt +=================================================================== +--- linux-2.6.21-ck1.orig/Documentation/sysctl/vm.txt 2007-05-04 12:24:01.000000000 +1000 ++++ linux-2.6.21-ck1/Documentation/sysctl/vm.txt 2007-05-04 12:24:21.000000000 +1000 +@@ -22,6 +22,8 @@ Currently, these files are in /proc/sys/ + - dirty_background_ratio + - dirty_expire_centisecs + - dirty_writeback_centisecs ++- hardmaplimit ++- mapped + - max_map_count + - min_free_kbytes + - laptop_mode +@@ -31,12 +33,13 @@ Currently, these files are in /proc/sys/ + - min_unmapped_ratio + - min_slab_ratio + - panic_on_oom ++- swap_prefetch + + ============================================================== + + dirty_ratio, dirty_background_ratio, dirty_expire_centisecs, + dirty_writeback_centisecs, vfs_cache_pressure, laptop_mode, +-block_dump, swap_token_timeout, drop-caches: ++block_dump, swap_token_timeout, drop-caches, tail_largefiles: + + See Documentation/filesystems/proc.txt + +@@ -86,6 +89,27 @@ for swap because we only cluster swap da + + ============================================================== + ++hardmaplimit: ++ ++This flag makes the vm adhere to the mapped value as closely as possible ++except in the 
most extreme vm stress where doing so would provoke an out ++of memory condition (see mapped below). ++ ++Enabled by default. ++ ++============================================================== ++ ++mapped: ++ ++This is the percentage ram that is filled with mapped pages (applications) ++before the vm will start reclaiming mapped pages by moving them to swap. ++It is altered by the relative stress of the vm at the time so is not ++strictly adhered to to prevent provoking out of memory kills. ++ ++Set to 66 by default. ++ ++============================================================== ++ + max_map_count: + + This file contains the maximum number of memory map areas a process +@@ -205,3 +229,14 @@ rather than killing rogue processes, set + + The default value is 0. + ++============================================================== ++ ++swap_prefetch ++ ++This enables or disables the swap prefetching feature. When the virtual ++memory subsystem has been extremely idle for at least 5 seconds it will start ++copying back pages from swap into the swapcache and keep a copy in swap. In ++practice it can take many minutes before the vm is idle enough. ++ ++The default value is 1. 
++ +Index: linux-2.6.21-ck1/include/linux/swap.h +=================================================================== +--- linux-2.6.21-ck1.orig/include/linux/swap.h 2007-05-04 12:24:01.000000000 +1000 ++++ linux-2.6.21-ck1/include/linux/swap.h 2007-05-04 12:24:20.000000000 +1000 +@@ -180,6 +180,7 @@ extern unsigned int nr_free_pagecache_pa + /* linux/mm/swap.c */ + extern void FASTCALL(lru_cache_add(struct page *)); + extern void FASTCALL(lru_cache_add_active(struct page *)); ++extern void FASTCALL(lru_cache_add_tail(struct page *)); + extern void FASTCALL(activate_page(struct page *)); + extern void FASTCALL(mark_page_accessed(struct page *)); + extern void lru_add_drain(void); +@@ -188,9 +189,11 @@ extern int rotate_reclaimable_page(struc + extern void swap_setup(void); + + /* linux/mm/vmscan.c */ +-extern unsigned long try_to_free_pages(struct zone **, gfp_t); ++extern unsigned long try_to_free_pages(struct zone **, gfp_t, ++ struct task_struct *p); + extern unsigned long shrink_all_memory(unsigned long nr_pages); +-extern int vm_swappiness; ++extern int vm_mapped; ++extern int vm_hardmaplimit; + extern int remove_mapping(struct address_space *mapping, struct page *page); + extern long vm_total_pages; + +@@ -237,6 +240,7 @@ extern void free_pages_and_swap_cache(st + extern struct page * lookup_swap_cache(swp_entry_t); + extern struct page * read_swap_cache_async(swp_entry_t, struct vm_area_struct *vma, + unsigned long addr); ++extern int add_to_swap_cache(struct page *page, swp_entry_t entry); + /* linux/mm/swapfile.c */ + extern long total_swap_pages; + extern unsigned int nr_swapfiles; +Index: linux-2.6.21-ck1/init/Kconfig +=================================================================== +--- linux-2.6.21-ck1.orig/init/Kconfig 2007-05-04 12:24:01.000000000 +1000 ++++ linux-2.6.21-ck1/init/Kconfig 2007-05-04 12:24:20.000000000 +1000 +@@ -101,6 +101,28 @@ config SWAP + used to provide more virtual memory than the actual RAM present + in your computer. 
If unsure say Y. + ++config SWAP_PREFETCH ++ bool "Support for prefetching swapped memory" ++ depends on SWAP ++ default y ++ ---help--- ++ This option will allow the kernel to prefetch swapped memory pages ++ when idle. The pages will be kept on both swap and in swap_cache ++ thus avoiding the need for further I/O if either ram or swap space ++ is required. ++ ++ What this will do on workstations is slowly bring back applications ++ that have swapped out after memory intensive workloads back into ++ physical ram if you have free ram at a later stage and the machine ++ is relatively idle. This means that when you come back to your ++ computer after leaving it idle for a while, applications will come ++ to life faster. Note that your swap usage will appear to increase ++ but these are cached pages, can be dropped freely by the vm, and it ++ should stabilise around 50% swap usage maximum. ++ ++ Workstations and multiuser workstation servers will most likely want ++ to say Y. ++ + config SYSVIPC + bool "System V IPC" + ---help--- +Index: linux-2.6.21-ck1/mm/Makefile +=================================================================== +--- linux-2.6.21-ck1.orig/mm/Makefile 2007-05-04 12:24:01.000000000 +1000 ++++ linux-2.6.21-ck1/mm/Makefile 2007-05-04 12:24:20.000000000 +1000 +@@ -17,6 +17,7 @@ ifeq ($(CONFIG_MMU)$(CONFIG_BLOCK),yy) + obj-y += bounce.o + endif + obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o ++obj-$(CONFIG_SWAP_PREFETCH) += swap_prefetch.o + obj-$(CONFIG_HUGETLBFS) += hugetlb.o + obj-$(CONFIG_NUMA) += mempolicy.o + obj-$(CONFIG_SPARSEMEM) += sparse.o +Index: linux-2.6.21-ck1/mm/swap.c +=================================================================== +--- linux-2.6.21-ck1.orig/mm/swap.c 2007-05-04 12:24:01.000000000 +1000 ++++ linux-2.6.21-ck1/mm/swap.c 2007-05-04 12:24:21.000000000 +1000 +@@ -17,6 +17,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -176,6 +177,7 @@ 
EXPORT_SYMBOL(mark_page_accessed); + */ + static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, }; + static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, }; ++static DEFINE_PER_CPU(struct pagevec, lru_add_tail_pvecs) = { 0, }; + + void fastcall lru_cache_add(struct page *page) + { +@@ -197,6 +199,31 @@ void fastcall lru_cache_add_active(struc + put_cpu_var(lru_add_active_pvecs); + } + ++static void __pagevec_lru_add_tail(struct pagevec *pvec) ++{ ++ int i; ++ struct zone *zone = NULL; ++ ++ for (i = 0; i < pagevec_count(pvec); i++) { ++ struct page *page = pvec->pages[i]; ++ struct zone *pagezone = page_zone(page); ++ ++ if (pagezone != zone) { ++ if (zone) ++ spin_unlock_irq(&zone->lru_lock); ++ zone = pagezone; ++ spin_lock_irq(&zone->lru_lock); ++ } ++ BUG_ON(PageLRU(page)); ++ SetPageLRU(page); ++ add_page_to_inactive_list_tail(zone, page); ++ } ++ if (zone) ++ spin_unlock_irq(&zone->lru_lock); ++ release_pages(pvec->pages, pvec->nr, pvec->cold); ++ pagevec_reinit(pvec); ++} ++ + static void __lru_add_drain(int cpu) + { + struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu); +@@ -207,6 +234,9 @@ static void __lru_add_drain(int cpu) + pvec = &per_cpu(lru_add_active_pvecs, cpu); + if (pagevec_count(pvec)) + __pagevec_lru_add_active(pvec); ++ pvec = &per_cpu(lru_add_tail_pvecs, cpu); ++ if (pagevec_count(pvec)) ++ __pagevec_lru_add_tail(pvec); + } + + void lru_add_drain(void) +@@ -403,6 +433,20 @@ void __pagevec_lru_add_active(struct pag + } + + /* ++ * Function used uniquely to put pages back to the lru at the end of the ++ * inactive list to preserve the lru order. 
++ */ ++void fastcall lru_cache_add_tail(struct page *page) ++{ ++ struct pagevec *pvec = &get_cpu_var(lru_add_tail_pvecs); ++ ++ page_cache_get(page); ++ if (!pagevec_add(pvec, page)) ++ __pagevec_lru_add_tail(pvec); ++ put_cpu_var(lru_add_pvecs); ++} ++ ++/* + * Try to drop buffers from the pages in a pagevec + */ + void pagevec_strip(struct pagevec *pvec) +@@ -514,6 +558,9 @@ void __init swap_setup(void) + * Right now other parts of the system means that we + * _really_ don't want to cluster much more + */ ++ ++ prepare_swap_prefetch(); ++ + #ifdef CONFIG_HOTPLUG_CPU + hotcpu_notifier(cpu_swap_callback, 0); + #endif +Index: linux-2.6.21-ck1/mm/swap_prefetch.c +=================================================================== +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ linux-2.6.21-ck1/mm/swap_prefetch.c 2007-05-04 12:24:20.000000000 +1000 +@@ -0,0 +1,581 @@ ++/* ++ * linux/mm/swap_prefetch.c ++ * ++ * Copyright (C) 2005-2006 Con Kolivas ++ * ++ * Written by Con Kolivas ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * Time to delay prefetching if vm is busy or prefetching unsuccessful. 
There ++ * needs to be at least this duration of idle time meaning in practice it can ++ * be much longer ++ */ ++#define PREFETCH_DELAY (HZ * 5) ++ ++/* sysctl - enable/disable swap prefetching */ ++int swap_prefetch __read_mostly = 1; ++ ++struct swapped_root { ++ unsigned long busy; /* vm busy */ ++ spinlock_t lock; /* protects all data */ ++ struct list_head list; /* MRU list of swapped pages */ ++ struct radix_tree_root swap_tree; /* Lookup tree of pages */ ++ unsigned int count; /* Number of entries */ ++ unsigned int maxcount; /* Maximum entries allowed */ ++ struct kmem_cache *cache; /* Of struct swapped_entry */ ++}; ++ ++static struct swapped_root swapped = { ++ .lock = SPIN_LOCK_UNLOCKED, ++ .list = LIST_HEAD_INIT(swapped.list), ++ .swap_tree = RADIX_TREE_INIT(GFP_ATOMIC), ++}; ++ ++static struct task_struct *kprefetchd_task; ++ ++/* ++ * We check to see no part of the vm is busy. If it is this will interrupt ++ * trickle_swap and wait another PREFETCH_DELAY. Purposefully racy. ++ */ ++inline void delay_swap_prefetch(void) ++{ ++ if (!test_bit(0, &swapped.busy)) ++ __set_bit(0, &swapped.busy); ++} ++ ++/* ++ * Drop behind accounting which keeps a list of the most recently used swap ++ * entries. ++ */ ++void add_to_swapped_list(struct page *page) ++{ ++ struct swapped_entry *entry; ++ unsigned long index, flags; ++ int wakeup; ++ ++ if (!swap_prefetch) ++ return; ++ ++ wakeup = 0; ++ ++ spin_lock_irqsave(&swapped.lock, flags); ++ if (swapped.count >= swapped.maxcount) { ++ /* ++ * We limit the number of entries to 2/3 of physical ram. ++ * Once the number of entries exceeds this we start removing ++ * the least recently used entries. 
++ */ ++ entry = list_entry(swapped.list.next, ++ struct swapped_entry, swapped_list); ++ radix_tree_delete(&swapped.swap_tree, entry->swp_entry.val); ++ list_del(&entry->swapped_list); ++ swapped.count--; ++ } else { ++ entry = kmem_cache_alloc(swapped.cache, GFP_ATOMIC); ++ if (unlikely(!entry)) ++ /* bad, can't allocate more mem */ ++ goto out_locked; ++ } ++ ++ index = page_private(page); ++ entry->swp_entry.val = index; ++ /* ++ * On numa we need to store the node id to ensure that we prefetch to ++ * the same node it came from. ++ */ ++ store_swap_entry_node(entry, page); ++ ++ if (likely(!radix_tree_insert(&swapped.swap_tree, index, entry))) { ++ /* ++ * If this is the first entry, kprefetchd needs to be ++ * (re)started. ++ */ ++ if (!swapped.count) ++ wakeup = 1; ++ list_add(&entry->swapped_list, &swapped.list); ++ swapped.count++; ++ } ++ ++out_locked: ++ spin_unlock_irqrestore(&swapped.lock, flags); ++ ++ /* Do the wakeup outside the lock to shorten lock hold time. */ ++ if (wakeup) ++ wake_up_process(kprefetchd_task); ++ ++ return; ++} ++ ++/* ++ * Removes entries from the swapped_list. The radix tree allows us to quickly ++ * look up the entry from the index without having to iterate over the whole ++ * list. 
++ */ ++void remove_from_swapped_list(const unsigned long index) ++{ ++ struct swapped_entry *entry; ++ unsigned long flags; ++ ++ if (list_empty(&swapped.list)) ++ return; ++ ++ spin_lock_irqsave(&swapped.lock, flags); ++ entry = radix_tree_delete(&swapped.swap_tree, index); ++ if (likely(entry)) { ++ list_del_init(&entry->swapped_list); ++ swapped.count--; ++ kmem_cache_free(swapped.cache, entry); ++ } ++ spin_unlock_irqrestore(&swapped.lock, flags); ++} ++ ++enum trickle_return { ++ TRICKLE_SUCCESS, ++ TRICKLE_FAILED, ++ TRICKLE_DELAY, ++}; ++ ++struct node_stats { ++ unsigned long last_free; ++ /* Free ram after a cycle of prefetching */ ++ unsigned long current_free; ++ /* Free ram on this cycle of checking prefetch_suitable */ ++ unsigned long prefetch_watermark; ++ /* Maximum amount we will prefetch to */ ++ unsigned long highfree[MAX_NR_ZONES]; ++ /* The amount of free ram before we start prefetching */ ++ unsigned long lowfree[MAX_NR_ZONES]; ++ /* The amount of free ram where we will stop prefetching */ ++ unsigned long *pointfree[MAX_NR_ZONES]; ++ /* highfree or lowfree depending on whether we've hit a watermark */ ++}; ++ ++/* ++ * prefetch_stats stores the free ram data of each node and this is used to ++ * determine if a node is suitable for prefetching into. ++ */ ++struct prefetch_stats { ++ nodemask_t prefetch_nodes; ++ /* Which nodes are currently suited to prefetching */ ++ unsigned long prefetched_pages; ++ /* Total pages we've prefetched on this wakeup of kprefetchd */ ++ struct node_stats node[MAX_NUMNODES]; ++}; ++ ++static struct prefetch_stats sp_stat; ++ ++/* ++ * This tries to read a swp_entry_t into swap cache for swap prefetching. ++ * If it returns TRICKLE_DELAY we should delay further prefetching. 
++ */ ++static enum trickle_return trickle_swap_cache_async(const swp_entry_t entry, ++ const int node) ++{ ++ enum trickle_return ret = TRICKLE_FAILED; ++ struct page *page; ++ ++ read_lock_irq(&swapper_space.tree_lock); ++ /* Entry may already exist */ ++ page = radix_tree_lookup(&swapper_space.page_tree, entry.val); ++ read_unlock_irq(&swapper_space.tree_lock); ++ if (page) { ++ remove_from_swapped_list(entry.val); ++ goto out; ++ } ++ ++ /* ++ * Get a new page to read from swap. We have already checked the ++ * watermarks so __alloc_pages will not call on reclaim. ++ */ ++ page = alloc_pages_node(node, GFP_HIGHUSER & ~__GFP_WAIT, 0); ++ if (unlikely(!page)) { ++ ret = TRICKLE_DELAY; ++ goto out; ++ } ++ ++ if (add_to_swap_cache(page, entry)) { ++ /* Failed to add to swap cache */ ++ goto out_release; ++ } ++ ++ /* Add them to the tail of the inactive list to preserve LRU order */ ++ lru_cache_add_tail(page); ++ if (unlikely(swap_readpage(NULL, page))) { ++ ret = TRICKLE_DELAY; ++ goto out_release; ++ } ++ ++ sp_stat.prefetched_pages++; ++ sp_stat.node[node].last_free--; ++ ++ ret = TRICKLE_SUCCESS; ++out_release: ++ page_cache_release(page); ++out: ++ return ret; ++} ++ ++static void clear_last_prefetch_free(void) ++{ ++ int node; ++ ++ /* ++ * Reset the nodes suitable for prefetching to all nodes. We could ++ * update the data to take into account memory hotplug if desired.. ++ */ ++ sp_stat.prefetch_nodes = node_online_map; ++ for_each_node_mask(node, sp_stat.prefetch_nodes) { ++ struct node_stats *ns = &sp_stat.node[node]; ++ ++ ns->last_free = 0; ++ } ++} ++ ++static void clear_current_prefetch_free(void) ++{ ++ int node; ++ ++ sp_stat.prefetch_nodes = node_online_map; ++ for_each_node_mask(node, sp_stat.prefetch_nodes) { ++ struct node_stats *ns = &sp_stat.node[node]; ++ ++ ns->current_free = 0; ++ } ++} ++ ++/* ++ * This updates the high and low watermarks of amount of free ram in each ++ * node used to start and stop prefetching. 
We prefetch from pages_high * 4 ++ * down to pages_high * 3. ++ */ ++static void examine_free_limits(void) ++{ ++ struct zone *z; ++ ++ for_each_zone(z) { ++ struct node_stats *ns; ++ int idx; ++ ++ if (!populated_zone(z)) ++ continue; ++ ++ ns = &sp_stat.node[z->zone_pgdat->node_id]; ++ idx = zone_idx(z); ++ ns->lowfree[idx] = z->pages_high * 3; ++ ns->highfree[idx] = ns->lowfree[idx] + z->pages_high; ++ ++ if (zone_page_state(z, NR_FREE_PAGES) > ns->highfree[idx]) { ++ /* ++ * We've gotten above the high watermark of free pages ++ * so we can start prefetching till we get to the low ++ * watermark. ++ */ ++ ns->pointfree[idx] = &ns->lowfree[idx]; ++ } ++ } ++} ++ ++/* ++ * We want to be absolutely certain it's ok to start prefetching. ++ */ ++static int prefetch_suitable(void) ++{ ++ unsigned long limit; ++ struct zone *z; ++ int node, ret = 0, test_pagestate = 0; ++ ++ /* Purposefully racy */ ++ if (test_bit(0, &swapped.busy)) { ++ __clear_bit(0, &swapped.busy); ++ goto out; ++ } ++ ++ /* ++ * get_page_state and above_background_load are expensive so we only ++ * perform them every SWAP_CLUSTER_MAX prefetched_pages. ++ * We test to see if we're above_background_load as disk activity ++ * even at low priority can cause interrupt induced scheduling ++ * latencies. ++ */ ++ if (!(sp_stat.prefetched_pages % SWAP_CLUSTER_MAX)) { ++ if (above_background_load()) ++ goto out; ++ test_pagestate = 1; ++ } ++ ++ clear_current_prefetch_free(); ++ ++ /* ++ * Have some hysteresis between where page reclaiming and prefetching ++ * will occur to prevent ping-ponging between them. 
++ */ ++ for_each_zone(z) { ++ struct node_stats *ns; ++ unsigned long free; ++ int idx; ++ ++ if (!populated_zone(z)) ++ continue; ++ ++ node = z->zone_pgdat->node_id; ++ ns = &sp_stat.node[node]; ++ idx = zone_idx(z); ++ ++ free = zone_page_state(z, NR_FREE_PAGES); ++ if (free < *ns->pointfree[idx]) { ++ /* ++ * Free pages have dropped below the low watermark so ++ * we won't start prefetching again till we hit the ++ * high watermark of free pages. ++ */ ++ ns->pointfree[idx] = &ns->highfree[idx]; ++ node_clear(node, sp_stat.prefetch_nodes); ++ continue; ++ } ++ ns->current_free += free; ++ } ++ ++ /* ++ * We iterate over each node testing to see if it is suitable for ++ * prefetching and clear the nodemask if it is not. ++ */ ++ for_each_node_mask(node, sp_stat.prefetch_nodes) { ++ struct node_stats *ns = &sp_stat.node[node]; ++ ++ /* ++ * We check to see that pages are not being allocated ++ * elsewhere at any significant rate implying any ++ * degree of memory pressure (eg during file reads) ++ */ ++ if (ns->last_free) { ++ if (ns->current_free + SWAP_CLUSTER_MAX < ++ ns->last_free) { ++ ns->last_free = ns->current_free; ++ node_clear(node, ++ sp_stat.prefetch_nodes); ++ continue; ++ } ++ } else ++ ns->last_free = ns->current_free; ++ ++ if (!test_pagestate) ++ continue; ++ ++ /* We shouldn't prefetch when we are doing writeback */ ++ if (node_page_state(node, NR_WRITEBACK)) { ++ node_clear(node, sp_stat.prefetch_nodes); ++ continue; ++ } ++ ++ /* ++ * >2/3 of the ram on this node is mapped, slab, swapcache or ++ * dirty, we need to leave some free for pagecache. 
++ */ ++ limit = node_page_state(node, NR_FILE_PAGES); ++ limit += node_page_state(node, NR_SLAB_RECLAIMABLE); ++ limit += node_page_state(node, NR_SLAB_UNRECLAIMABLE); ++ limit += node_page_state(node, NR_FILE_DIRTY); ++ limit += node_page_state(node, NR_UNSTABLE_NFS); ++ limit += total_swapcache_pages; ++ if (limit > ns->prefetch_watermark) { ++ node_clear(node, sp_stat.prefetch_nodes); ++ continue; ++ } ++ } ++ ++ if (nodes_empty(sp_stat.prefetch_nodes)) ++ goto out; ++ ++ /* Survived all that? Hooray we can prefetch! */ ++ ret = 1; ++out: ++ return ret; ++} ++ ++/* ++ * Get previous swapped entry when iterating over all entries. swapped.lock ++ * should be held and we should already ensure that entry exists. ++ */ ++static inline struct swapped_entry *prev_swapped_entry ++ (struct swapped_entry *entry) ++{ ++ return list_entry(entry->swapped_list.prev->prev, ++ struct swapped_entry, swapped_list); ++} ++ ++/* ++ * trickle_swap is the main function that initiates the swap prefetching. It ++ * first checks to see if the busy flag is set, and does not prefetch if it ++ * is, as the flag implied we are low on memory or swapping in currently. 
++ * Otherwise it runs until prefetch_suitable fails which occurs when the ++ * vm is busy, we prefetch to the watermark, or the list is empty or we have ++ * iterated over all entries ++ */ ++static enum trickle_return trickle_swap(void) ++{ ++ enum trickle_return ret = TRICKLE_DELAY; ++ struct swapped_entry *entry; ++ unsigned long flags; ++ ++ /* ++ * If laptop_mode is enabled don't prefetch to avoid hard drives ++ * doing unnecessary spin-ups ++ */ ++ if (!swap_prefetch || laptop_mode) ++ return ret; ++ ++ examine_free_limits(); ++ entry = NULL; ++ ++ for ( ; ; ) { ++ swp_entry_t swp_entry; ++ int node; ++ ++ if (!prefetch_suitable()) ++ break; ++ ++ spin_lock_irqsave(&swapped.lock, flags); ++ if (list_empty(&swapped.list)) { ++ ret = TRICKLE_FAILED; ++ spin_unlock_irqrestore(&swapped.lock, flags); ++ break; ++ } ++ ++ if (!entry) { ++ /* ++ * This sets the entry for the first iteration. It ++ * also is a safeguard against the entry disappearing ++ * while the lock is not held. ++ */ ++ entry = list_entry(swapped.list.prev, ++ struct swapped_entry, swapped_list); ++ } else if (entry->swapped_list.prev == swapped.list.next) { ++ /* ++ * If we have iterated over all entries and there are ++ * still entries that weren't swapped out there may ++ * be a reason we could not swap them back in so ++ * delay attempting further prefetching. ++ */ ++ spin_unlock_irqrestore(&swapped.lock, flags); ++ break; ++ } ++ ++ node = get_swap_entry_node(entry); ++ if (!node_isset(node, sp_stat.prefetch_nodes)) { ++ /* ++ * We found an entry that belongs to a node that is ++ * not suitable for prefetching so skip it. 
++ */ ++ entry = prev_swapped_entry(entry); ++ spin_unlock_irqrestore(&swapped.lock, flags); ++ continue; ++ } ++ swp_entry = entry->swp_entry; ++ entry = prev_swapped_entry(entry); ++ spin_unlock_irqrestore(&swapped.lock, flags); ++ ++ if (trickle_swap_cache_async(swp_entry, node) == TRICKLE_DELAY) ++ break; ++ } ++ ++ if (sp_stat.prefetched_pages) { ++ lru_add_drain(); ++ sp_stat.prefetched_pages = 0; ++ } ++ return ret; ++} ++ ++static int kprefetchd(void *__unused) ++{ ++ struct sched_param param = { .sched_priority = 0 }; ++ ++ sched_setscheduler(current, SCHED_BATCH, ¶m); ++ set_user_nice(current, 19); ++ /* Set ioprio to lowest if supported by i/o scheduler */ ++ sys_ioprio_set(IOPRIO_WHO_PROCESS, 0, IOPRIO_CLASS_IDLE); ++ ++ /* kprefetchd has nothing to do until it is woken up the first time */ ++ set_current_state(TASK_INTERRUPTIBLE); ++ schedule(); ++ ++ do { ++ try_to_freeze(); ++ ++ /* ++ * TRICKLE_FAILED implies no entries left - we do not schedule ++ * a wakeup, and further delay the next one. ++ */ ++ if (trickle_swap() == TRICKLE_FAILED) { ++ set_current_state(TASK_INTERRUPTIBLE); ++ schedule(); ++ } ++ clear_last_prefetch_free(); ++ schedule_timeout_interruptible(PREFETCH_DELAY); ++ } while (!kthread_should_stop()); ++ ++ return 0; ++} ++ ++/* ++ * Create kmem cache for swapped entries ++ */ ++void __init prepare_swap_prefetch(void) ++{ ++ struct zone *zone; ++ ++ swapped.cache = kmem_cache_create("swapped_entry", ++ sizeof(struct swapped_entry), 0, SLAB_PANIC, NULL, NULL); ++ ++ /* ++ * Set max number of entries to 2/3 the size of physical ram as we ++ * only ever prefetch to consume 2/3 of the ram. 
++ */ ++ swapped.maxcount = nr_free_pagecache_pages() / 3 * 2; ++ ++ for_each_zone(zone) { ++ unsigned long present; ++ struct node_stats *ns; ++ int idx; ++ ++ present = zone->present_pages; ++ if (!present) ++ continue; ++ ++ ns = &sp_stat.node[zone->zone_pgdat->node_id]; ++ ns->prefetch_watermark += present / 3 * 2; ++ idx = zone_idx(zone); ++ ns->pointfree[idx] = &ns->highfree[idx]; ++ } ++} ++ ++static int __init kprefetchd_init(void) ++{ ++ kprefetchd_task = kthread_run(kprefetchd, NULL, "kprefetchd"); ++ ++ return 0; ++} ++ ++static void __exit kprefetchd_exit(void) ++{ ++ kthread_stop(kprefetchd_task); ++} ++ ++module_init(kprefetchd_init); ++module_exit(kprefetchd_exit); +Index: linux-2.6.21-ck1/mm/swap_state.c +=================================================================== +--- linux-2.6.21-ck1.orig/mm/swap_state.c 2007-05-04 12:24:01.000000000 +1000 ++++ linux-2.6.21-ck1/mm/swap_state.c 2007-05-04 12:24:20.000000000 +1000 +@@ -10,6 +10,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -82,6 +83,7 @@ static int __add_to_swap_cache(struct pa + error = radix_tree_insert(&swapper_space.page_tree, + entry.val, page); + if (!error) { ++ remove_from_swapped_list(entry.val); + page_cache_get(page); + SetPageLocked(page); + SetPageSwapCache(page); +@@ -95,11 +97,12 @@ static int __add_to_swap_cache(struct pa + return error; + } + +-static int add_to_swap_cache(struct page *page, swp_entry_t entry) ++int add_to_swap_cache(struct page *page, swp_entry_t entry) + { + int error; + + if (!swap_duplicate(entry)) { ++ remove_from_swapped_list(entry.val); + INC_CACHE_INFO(noent_race); + return -ENOENT; + } +@@ -148,6 +151,9 @@ int add_to_swap(struct page * page, gfp_ + swp_entry_t entry; + int err; + ++ /* Swap prefetching is delayed if we're swapping pages */ ++ delay_swap_prefetch(); ++ + BUG_ON(!PageLocked(page)); + + for (;;) { +@@ -320,6 +326,9 @@ struct page *read_swap_cache_async(swp_e + struct page *found_page, *new_page = 
NULL; + int err; + ++ /* Swap prefetching is delayed if we're already reading from swap */ ++ delay_swap_prefetch(); ++ + do { + /* + * First check the swap cache. Since this is normally +Index: linux-2.6.21-ck1/mm/vmscan.c +=================================================================== +--- linux-2.6.21-ck1.orig/mm/vmscan.c 2007-05-04 12:24:01.000000000 +1000 ++++ linux-2.6.21-ck1/mm/vmscan.c 2007-05-04 12:24:21.000000000 +1000 +@@ -16,6 +16,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -36,6 +37,7 @@ + #include + #include + #include ++#include + #include + + #include +@@ -63,7 +65,7 @@ struct scan_control { + * whole list at once. */ + int swap_cluster_max; + +- int swappiness; ++ int mapped; + + int all_unreclaimable; + }; +@@ -110,9 +112,10 @@ struct shrinker { + #endif + + /* +- * From 0 .. 100. Higher means more swappy. ++ * From 0 .. 100. Lower means more swappy. + */ +-int vm_swappiness = 60; ++int vm_mapped __read_mostly = 66; ++int vm_hardmaplimit __read_mostly = 1; + long vm_total_pages; /* The total number of pages which the VM controls */ + + static LIST_HEAD(shrinker_list); +@@ -424,6 +427,7 @@ int remove_mapping(struct address_space + + if (PageSwapCache(page)) { + swp_entry_t swap = { .val = page_private(page) }; ++ add_to_swapped_list(page); + __delete_from_swap_cache(page); + write_unlock_irq(&mapping->tree_lock); + swap_free(swap); +@@ -807,10 +811,14 @@ static void shrink_active_list(unsigned + * The distress ratio is important - we don't want to start + * going oom. + * +- * A 100% value of vm_swappiness overrides this algorithm +- * altogether. ++ * This distress value is ignored if we apply a hardmaplimit except ++ * in extreme distress. ++ * ++ * A 0% value of vm_mapped overrides this algorithm altogether. 
+ */ +- swap_tendency = mapped_ratio / 2 + distress + sc->swappiness; ++ swap_tendency = mapped_ratio * 100 / (sc->mapped + 1); ++ if (!vm_hardmaplimit || distress == 100) ++ swap_tendency += distress; + + /* + * Now use this metric to decide whether to start moving mapped +@@ -959,6 +967,41 @@ static unsigned long shrink_zone(int pri + } + + /* ++ * Helper functions to adjust nice level of kswapd, based on the priority of ++ * the task (p) that called it. If it is already higher priority we do not ++ * demote its nice level since it is still working on behalf of a higher ++ * priority task. With kernel threads we leave it at nice 0. ++ * ++ * We don't ever run kswapd real time, so if a real time task calls kswapd we ++ * set it to highest SCHED_NORMAL priority. ++ */ ++static int effective_sc_prio(struct task_struct *p) ++{ ++ if (likely(p->mm)) { ++ if (rt_task(p)) ++ return -20; ++ if (idleprio_task(p)) ++ return 19; ++ return task_nice(p); ++ } ++ return 0; ++} ++ ++static void set_kswapd_nice(struct task_struct *kswapd, struct task_struct *p, ++ int active) ++{ ++ long nice = effective_sc_prio(p); ++ ++ if (task_nice(kswapd) > nice || !active) ++ set_user_nice(kswapd, nice); ++} ++ ++static int sc_priority(struct task_struct *p) ++{ ++ return (DEF_PRIORITY + (DEF_PRIORITY * effective_sc_prio(p) / 40)); ++} ++ ++/* + * This is the direct reclaim path, for page-allocating processes. We only + * try to reclaim pages from zones which will satisfy the caller's allocation + * request. +@@ -1015,7 +1058,8 @@ static unsigned long shrink_zones(int pr + * holds filesystem locks which prevent writeout this might not work, and the + * allocation attempt will fail. 
+ */ +-unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask) ++unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask, ++ struct task_struct *p) + { + int priority; + int ret = 0; +@@ -1023,15 +1067,20 @@ unsigned long try_to_free_pages(struct z + unsigned long nr_reclaimed = 0; + struct reclaim_state *reclaim_state = current->reclaim_state; + unsigned long lru_pages = 0; +- int i; ++ int i, scan_priority = DEF_PRIORITY; + struct scan_control sc = { + .gfp_mask = gfp_mask, + .may_writepage = !laptop_mode, + .swap_cluster_max = SWAP_CLUSTER_MAX, + .may_swap = 1, +- .swappiness = vm_swappiness, ++ .mapped = vm_mapped, + }; + ++ if (p) ++ scan_priority = sc_priority(p); ++ ++ delay_swap_prefetch(); ++ + count_vm_event(ALLOCSTALL); + + for (i = 0; zones[i] != NULL; i++) { +@@ -1044,7 +1093,7 @@ unsigned long try_to_free_pages(struct z + + zone_page_state(zone, NR_INACTIVE); + } + +- for (priority = DEF_PRIORITY; priority >= 0; priority--) { ++ for (priority = scan_priority; priority >= 0; priority--) { + sc.nr_scanned = 0; + if (!priority) + disable_swap_token(); +@@ -1074,7 +1123,7 @@ unsigned long try_to_free_pages(struct z + } + + /* Take a nap, wait for some writeback to complete */ +- if (sc.nr_scanned && priority < DEF_PRIORITY - 2) ++ if (sc.nr_scanned && priority < scan_priority - 2) + congestion_wait(WRITE, HZ/10); + } + /* top priority shrink_caches still had more to do? 
don't OOM, then */ +@@ -1124,9 +1173,9 @@ out: + */ + static unsigned long balance_pgdat(pg_data_t *pgdat, int order) + { +- int all_zones_ok; ++ int all_zones_ok = 0; + int priority; +- int i; ++ int i, scan_priority; + unsigned long total_scanned; + unsigned long nr_reclaimed; + struct reclaim_state *reclaim_state = current->reclaim_state; +@@ -1134,7 +1183,7 @@ static unsigned long balance_pgdat(pg_da + .gfp_mask = GFP_KERNEL, + .may_swap = 1, + .swap_cluster_max = SWAP_CLUSTER_MAX, +- .swappiness = vm_swappiness, ++ .mapped = vm_mapped, + }; + /* + * temp_priority is used to remember the scanning priority at which +@@ -1142,6 +1191,8 @@ static unsigned long balance_pgdat(pg_da + */ + int temp_priority[MAX_NR_ZONES]; + ++ scan_priority = sc_priority(pgdat->kswapd); ++ + loop_again: + total_scanned = 0; + nr_reclaimed = 0; +@@ -1149,9 +1200,9 @@ loop_again: + count_vm_event(PAGEOUTRUN); + + for (i = 0; i < pgdat->nr_zones; i++) +- temp_priority[i] = DEF_PRIORITY; ++ temp_priority[i] = scan_priority; + +- for (priority = DEF_PRIORITY; priority >= 0; priority--) { ++ for (priority = scan_priority; priority >= 0; priority--) { + int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ + unsigned long lru_pages = 0; + +@@ -1167,15 +1218,22 @@ loop_again: + */ + for (i = pgdat->nr_zones - 1; i >= 0; i--) { + struct zone *zone = pgdat->node_zones + i; ++ unsigned long watermark; + + if (!populated_zone(zone)) + continue; + +- if (zone->all_unreclaimable && priority != DEF_PRIORITY) ++ if (zone->all_unreclaimable && priority != scan_priority) + continue; + +- if (!zone_watermark_ok(zone, order, zone->pages_high, +- 0, 0)) { ++ /* ++ * The watermark is relaxed depending on the ++ * level of "priority" till it drops to ++ * pages_high. 
++ */ ++ watermark = zone->pages_high + (zone->pages_high * ++ priority / scan_priority); ++ if (!zone_watermark_ok(zone, order, watermark, 0, 0)) { + end_zone = i; + break; + } +@@ -1202,14 +1260,18 @@ loop_again: + for (i = 0; i <= end_zone; i++) { + struct zone *zone = pgdat->node_zones + i; + int nr_slab; ++ unsigned long watermark; + + if (!populated_zone(zone)) + continue; + +- if (zone->all_unreclaimable && priority != DEF_PRIORITY) ++ if (zone->all_unreclaimable && priority != scan_priority) + continue; + +- if (!zone_watermark_ok(zone, order, zone->pages_high, ++ watermark = zone->pages_high + (zone->pages_high * ++ priority / scan_priority); ++ ++ if (!zone_watermark_ok(zone, order, watermark, + end_zone, 0)) + all_zones_ok = 0; + temp_priority[i] = priority; +@@ -1242,7 +1304,7 @@ loop_again: + * OK, kswapd is getting into trouble. Take a nap, then take + * another pass across the zones. + */ +- if (total_scanned && priority < DEF_PRIORITY - 2) ++ if (total_scanned && priority < scan_priority - 2) + congestion_wait(WRITE, HZ/10); + + /* +@@ -1276,6 +1338,8 @@ out: + return nr_reclaimed; + } + ++#define WT_EXPIRY (HZ * 5) /* Time to wakeup watermark_timer */ ++ + /* + * The background pageout daemon, started as a kernel thread + * from the init process. +@@ -1325,6 +1389,8 @@ static int kswapd(void *p) + + try_to_freeze(); + ++ /* kswapd has been busy so delay watermark_timer */ ++ mod_timer(&pgdat->watermark_timer, jiffies + WT_EXPIRY); + prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); + new_order = pgdat->kswapd_max_order; + pgdat->kswapd_max_order = 0; +@@ -1335,6 +1401,7 @@ static int kswapd(void *p) + */ + order = new_order; + } else { ++ set_user_nice(tsk, 0); + schedule(); + order = pgdat->kswapd_max_order; + } +@@ -1348,9 +1415,10 @@ static int kswapd(void *p) + /* + * A zone is low on free memory, so wake its kswapd task to service it. 
+ */ +-void wakeup_kswapd(struct zone *zone, int order) ++void wakeup_kswapd(struct zone *zone, int order, struct task_struct *p) + { + pg_data_t *pgdat; ++ int active; + + if (!populated_zone(zone)) + return; +@@ -1362,7 +1430,9 @@ void wakeup_kswapd(struct zone *zone, in + pgdat->kswapd_max_order = order; + if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) + return; +- if (!waitqueue_active(&pgdat->kswapd_wait)) ++ active = waitqueue_active(&pgdat->kswapd_wait); ++ set_kswapd_nice(pgdat->kswapd, p, active); ++ if (!active) + return; + wake_up_interruptible(&pgdat->kswapd_wait); + } +@@ -1381,6 +1451,8 @@ static unsigned long shrink_all_zones(un + struct zone *zone; + unsigned long nr_to_scan, ret = 0; + ++ delay_swap_prefetch(); ++ + for_each_zone(zone) { + + if (!populated_zone(zone)) +@@ -1440,7 +1512,7 @@ unsigned long shrink_all_memory(unsigned + .may_swap = 0, + .swap_cluster_max = nr_pages, + .may_writepage = 1, +- .swappiness = vm_swappiness, ++ .mapped = vm_mapped, + }; + + current->reclaim_state = &reclaim_state; +@@ -1475,7 +1547,7 @@ unsigned long shrink_all_memory(unsigned + /* Force reclaiming mapped pages in the passes #3 and #4 */ + if (pass > 2) { + sc.may_swap = 1; +- sc.swappiness = 100; ++ sc.mapped = 0; + } + + for (prio = DEF_PRIORITY; prio >= 0; prio--) { +@@ -1539,20 +1611,57 @@ static int __devinit cpu_callback(struct + } + + /* ++ * We wake up kswapd every WT_EXPIRY till free ram is above pages_lots ++ */ ++static void watermark_wakeup(unsigned long data) ++{ ++ pg_data_t *pgdat = (pg_data_t *)data; ++ struct timer_list *wt = &pgdat->watermark_timer; ++ int i; ++ ++ if (!waitqueue_active(&pgdat->kswapd_wait) || above_background_load()) ++ goto out; ++ for (i = pgdat->nr_zones - 1; i >= 0; i--) { ++ struct zone *z = pgdat->node_zones + i; ++ ++ if (!populated_zone(z) || is_highmem(z)) { ++ /* We are better off leaving highmem full */ ++ continue; ++ } ++ if (!zone_watermark_ok(z, 0, z->pages_lots, 0, 0)) { ++ 
wake_up_interruptible(&pgdat->kswapd_wait); ++ goto out; ++ } ++ } ++out: ++ mod_timer(wt, jiffies + WT_EXPIRY); ++ return; ++} ++ ++/* + * This kswapd start function will be called by init and node-hot-add. + * On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added. + */ + int kswapd_run(int nid) + { + pg_data_t *pgdat = NODE_DATA(nid); ++ struct timer_list *wt; + int ret = 0; + + if (pgdat->kswapd) + return 0; + ++ wt = &pgdat->watermark_timer; ++ init_timer(wt); ++ wt->data = (unsigned long)pgdat; ++ wt->function = watermark_wakeup; ++ wt->expires = jiffies + WT_EXPIRY; ++ add_timer(wt); ++ + pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid); + if (IS_ERR(pgdat->kswapd)) { + /* failure at boot is fatal */ ++ del_timer(wt); + BUG_ON(system_state == SYSTEM_BOOTING); + printk("Failed to start kswapd on node %d\n",nid); + ret = -1; +@@ -1623,7 +1732,7 @@ static int __zone_reclaim(struct zone *z + .swap_cluster_max = max_t(unsigned long, nr_pages, + SWAP_CLUSTER_MAX), + .gfp_mask = gfp_mask, +- .swappiness = vm_swappiness, ++ .mapped = vm_mapped, + }; + unsigned long slab_reclaimable; + +Index: linux-2.6.21-ck1/include/linux/mm_inline.h +=================================================================== +--- linux-2.6.21-ck1.orig/include/linux/mm_inline.h 2007-05-04 12:24:01.000000000 +1000 ++++ linux-2.6.21-ck1/include/linux/mm_inline.h 2007-05-04 12:24:20.000000000 +1000 +@@ -13,6 +13,13 @@ add_page_to_inactive_list(struct zone *z + } + + static inline void ++add_page_to_inactive_list_tail(struct zone *zone, struct page *page) ++{ ++ list_add_tail(&page->lru, &zone->inactive_list); ++ __inc_zone_state(zone, NR_INACTIVE); ++} ++ ++static inline void + del_page_from_active_list(struct zone *zone, struct page *page) + { + list_del(&page->lru); +Index: linux-2.6.21-ck1/include/linux/swap-prefetch.h +=================================================================== +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ 
linux-2.6.21-ck1/include/linux/swap-prefetch.h 2007-05-04 12:24:20.000000000 +1000 +@@ -0,0 +1,55 @@ ++#ifndef SWAP_PREFETCH_H_INCLUDED ++#define SWAP_PREFETCH_H_INCLUDED ++ ++#ifdef CONFIG_SWAP_PREFETCH ++/* mm/swap_prefetch.c */ ++extern int swap_prefetch; ++struct swapped_entry { ++ swp_entry_t swp_entry; /* The actual swap entry */ ++ struct list_head swapped_list; /* Linked list of entries */ ++#if MAX_NUMNODES > 1 ++ int node; /* Node id */ ++#endif ++} __attribute__((packed)); ++ ++static inline void store_swap_entry_node(struct swapped_entry *entry, ++ struct page *page) ++{ ++#if MAX_NUMNODES > 1 ++ entry->node = page_to_nid(page); ++#endif ++} ++ ++static inline int get_swap_entry_node(struct swapped_entry *entry) ++{ ++#if MAX_NUMNODES > 1 ++ return entry->node; ++#else ++ return 0; ++#endif ++} ++ ++extern void add_to_swapped_list(struct page *page); ++extern void remove_from_swapped_list(const unsigned long index); ++extern void delay_swap_prefetch(void); ++extern void prepare_swap_prefetch(void); ++ ++#else /* CONFIG_SWAP_PREFETCH */ ++static inline void add_to_swapped_list(struct page *__unused) ++{ ++} ++ ++static inline void prepare_swap_prefetch(void) ++{ ++} ++ ++static inline void remove_from_swapped_list(const unsigned long __unused) ++{ ++} ++ ++static inline void delay_swap_prefetch(void) ++{ ++} ++#endif /* CONFIG_SWAP_PREFETCH */ ++ ++#endif /* SWAP_PREFETCH_H_INCLUDED */ +Index: linux-2.6.21-ck1/include/linux/sysctl.h +=================================================================== +--- linux-2.6.21-ck1.orig/include/linux/sysctl.h 2007-05-04 12:24:01.000000000 +1000 ++++ linux-2.6.21-ck1/include/linux/sysctl.h 2007-05-04 12:24:20.000000000 +1000 +@@ -190,7 +190,7 @@ enum + VM_OVERCOMMIT_RATIO=16, /* percent of RAM to allow overcommit in */ + VM_PAGEBUF=17, /* struct: Control pagebuf parameters */ + VM_HUGETLB_PAGES=18, /* int: Number of available Huge Pages */ +- VM_SWAPPINESS=19, /* Tendency to steal mapped memory */ ++ VM_MAPPED=19, 
/* percent mapped min while evicting cache */ + VM_LOWMEM_RESERVE_RATIO=20,/* reservation ratio for lower memory zones */ + VM_MIN_FREE_KBYTES=21, /* Minimum free kilobytes to maintain */ + VM_MAX_MAP_COUNT=22, /* int: Maximum number of mmaps/address-space */ +Index: linux-2.6.21-ck1/include/linux/mmzone.h +=================================================================== +--- linux-2.6.21-ck1.orig/include/linux/mmzone.h 2007-05-04 12:24:01.000000000 +1000 ++++ linux-2.6.21-ck1/include/linux/mmzone.h 2007-05-04 12:24:21.000000000 +1000 +@@ -13,6 +13,7 @@ + #include + #include + #include ++#include + #include + #include + +@@ -178,7 +179,7 @@ enum zone_type { + + struct zone { + /* Fields commonly accessed by the page allocator */ +- unsigned long pages_min, pages_low, pages_high; ++ unsigned long pages_min, pages_low, pages_high, pages_lots; + /* + * We don't know if the memory that we're going to allocate will be freeable + * or/and it will be released eventually, so to avoid totally wasting several +@@ -449,6 +450,7 @@ typedef struct pglist_data { + wait_queue_head_t kswapd_wait; + struct task_struct *kswapd; + int kswapd_max_order; ++ struct timer_list watermark_timer; + } pg_data_t; + + #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) +@@ -465,7 +467,7 @@ typedef struct pglist_data { + void get_zone_counts(unsigned long *active, unsigned long *inactive, + unsigned long *free); + void build_all_zonelists(void); +-void wakeup_kswapd(struct zone *zone, int order); ++void wakeup_kswapd(struct zone *zone, int order, struct task_struct *p); + int zone_watermark_ok(struct zone *z, int order, unsigned long mark, + int classzone_idx, int alloc_flags); + enum memmap_context { +Index: linux-2.6.21-ck1/mm/page_alloc.c +=================================================================== +--- linux-2.6.21-ck1.orig/mm/page_alloc.c 2007-05-04 12:24:01.000000000 +1000 ++++ linux-2.6.21-ck1/mm/page_alloc.c 2007-05-04 12:24:20.000000000 +1000 +@@ -1277,7 
+1277,7 @@ restart: + goto nopage; + + for (z = zonelist->zones; *z; z++) +- wakeup_kswapd(*z, order); ++ wakeup_kswapd(*z, order, p); + + /* + * OK, we're below the kswapd watermark and have kicked background +@@ -1341,7 +1341,7 @@ nofail_alloc: + reclaim_state.reclaimed_slab = 0; + p->reclaim_state = &reclaim_state; + +- did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask); ++ did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask, p); + + p->reclaim_state = NULL; + p->flags &= ~PF_MEMALLOC; +@@ -1597,6 +1597,7 @@ void show_free_areas(void) + " min:%lukB" + " low:%lukB" + " high:%lukB" ++ " lots:%lukB" + " active:%lukB" + " inactive:%lukB" + " present:%lukB" +@@ -1608,6 +1609,7 @@ void show_free_areas(void) + K(zone->pages_min), + K(zone->pages_low), + K(zone->pages_high), ++ K(zone->pages_lots), + K(zone_page_state(zone, NR_ACTIVE)), + K(zone_page_state(zone, NR_INACTIVE)), + K(zone->present_pages), +@@ -3146,6 +3148,7 @@ void setup_per_zone_pages_min(void) + + zone->pages_low = zone->pages_min + (tmp >> 2); + zone->pages_high = zone->pages_min + (tmp >> 1); ++ zone->pages_lots = zone->pages_min + tmp; + spin_unlock_irqrestore(&zone->lru_lock, flags); + } + +Index: linux-2.6.21-ck1/fs/buffer.c +=================================================================== +--- linux-2.6.21-ck1.orig/fs/buffer.c 2007-05-04 12:24:01.000000000 +1000 ++++ linux-2.6.21-ck1/fs/buffer.c 2007-05-04 12:24:20.000000000 +1000 +@@ -363,7 +363,7 @@ static void free_more_memory(void) + for_each_online_pgdat(pgdat) { + zones = pgdat->node_zonelists[gfp_zone(GFP_NOFS)].zones; + if (*zones) +- try_to_free_pages(zones, GFP_NOFS); ++ try_to_free_pages(zones, GFP_NOFS, NULL); + } + } + +Index: linux-2.6.21-ck1/mm/filemap.c +=================================================================== +--- linux-2.6.21-ck1.orig/mm/filemap.c 2007-05-04 12:24:01.000000000 +1000 ++++ linux-2.6.21-ck1/mm/filemap.c 2007-05-04 12:24:21.000000000 +1000 +@@ -466,6 +466,16 @@ int 
add_to_page_cache_lru(struct page *p + return ret; + } + ++int add_to_page_cache_lru_tail(struct page *page, ++ struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask) ++{ ++ int ret = add_to_page_cache(page, mapping, offset, gfp_mask); ++ ++ if (ret == 0) ++ lru_cache_add_tail(page); ++ return ret; ++} ++ + #ifdef CONFIG_NUMA + struct page *__page_cache_alloc(gfp_t gfp) + { +@@ -836,6 +846,34 @@ static void shrink_readahead_size_eio(st + ra->ra_pages /= 4; + } + ++/* ++ * Sysctl which determines whether we should read from large files to the ++ * tail of the inactive lru list. ++ */ ++int vm_tail_largefiles __read_mostly = 1; ++ ++static inline int nr_mapped(void) ++{ ++ return global_page_state(NR_FILE_MAPPED) + ++ global_page_state(NR_ANON_PAGES); ++} ++ ++/* ++ * This examines how large in pages a file size is and returns 1 if it is ++ * more than half the unmapped ram. Avoid doing read_page_state which is ++ * expensive unless we already know it is likely to be large enough. ++ */ ++static int large_isize(unsigned long nr_pages) ++{ ++ if (nr_pages * 6 > vm_total_pages) { ++ unsigned long unmapped_ram = vm_total_pages - nr_mapped(); ++ ++ if (nr_pages * 2 > unmapped_ram) ++ return 1; ++ } ++ return 0; ++} ++ + /** + * do_generic_mapping_read - generic file read routine + * @mapping: address_space to be read +@@ -1044,8 +1082,19 @@ no_cached_page: + goto out; + } + } +- error = add_to_page_cache_lru(cached_page, mapping, +- index, GFP_KERNEL); ++ ++ /* ++ * If we know the file is large we add the pages read to the ++ * end of the lru as we're unlikely to be able to cache the ++ * whole file in ram so make those pages the first to be ++ * dropped if not referenced soon. 
++ */ ++ if (vm_tail_largefiles && large_isize(end_index)) ++ error = add_to_page_cache_lru_tail(cached_page, ++ mapping, index, GFP_KERNEL); ++ else ++ error = add_to_page_cache_lru(cached_page, mapping, ++ index, GFP_KERNEL); + if (error) { + if (error == -EEXIST) + goto find_page; +Index: linux-2.6.21-ck1/Documentation/filesystems/proc.txt +=================================================================== +--- linux-2.6.21-ck1.orig/Documentation/filesystems/proc.txt 2007-05-04 12:24:01.000000000 +1000 ++++ linux-2.6.21-ck1/Documentation/filesystems/proc.txt 2007-05-04 12:24:21.000000000 +1000 +@@ -1325,6 +1325,14 @@ To free pagecache, dentries and inodes: + As this is a non-destructive operation and dirty objects are not freeable, the + user should run `sync' first. + ++tail_largefiles ++--------------- ++ ++When enabled reads from large files to the tail end of the inactive lru list. ++This means that any cache from reading large files is dropped very quickly, ++preventing loss of mapped ram and useful pagecache when large files are read. ++This does, however, make caching less effective when working with large files. ++ + + 2.5 /proc/sys/dev - Device specific parameters + ---------------------------------------------- +Index: linux-2.6.21-ck1/arch/i386/Kconfig +=================================================================== +--- linux-2.6.21-ck1.orig/arch/i386/Kconfig 2007-05-04 12:24:00.000000000 +1000 ++++ linux-2.6.21-ck1/arch/i386/Kconfig 2007-05-04 12:24:21.000000000 +1000 +@@ -546,7 +546,7 @@ endchoice + + choice + depends on EXPERIMENTAL +- prompt "Memory split" if EMBEDDED ++ prompt "Memory split" + default VMSPLIT_3G + help + Select the desired split between kernel and user memory. +@@ -565,14 +565,14 @@ choice + option alone! 
+ + config VMSPLIT_3G +- bool "3G/1G user/kernel split" ++ bool "Default 896MB lowmem (3G/1G user/kernel split)" + config VMSPLIT_3G_OPT + depends on !HIGHMEM +- bool "3G/1G user/kernel split (for full 1G low memory)" ++ bool "1GB lowmem (3G/1G user/kernel split)" + config VMSPLIT_2G +- bool "2G/2G user/kernel split" ++ bool "2GB lowmem (2G/2G user/kernel split)" + config VMSPLIT_1G +- bool "1G/3G user/kernel split" ++ bool "3GB lowmem (1G/3G user/kernel split)" + endchoice + + config PAGE_OFFSET +Index: linux-2.6.21-ck1/kernel/Kconfig.hz +=================================================================== +--- linux-2.6.21-ck1.orig/kernel/Kconfig.hz 2007-05-04 12:24:00.000000000 +1000 ++++ linux-2.6.21-ck1/kernel/Kconfig.hz 2007-05-04 12:24:21.000000000 +1000 +@@ -4,7 +4,7 @@ + + choice + prompt "Timer frequency" +- default HZ_250 ++ default HZ_1000 + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -13,8 +13,7 @@ choice + contention and cacheline bounces as a result of timer interrupts. + Note that the timer interrupt occurs on each processor in an SMP + environment leading to NR_CPUS * HZ number of timer interrupts +- per second. +- ++ per second.Laptops may also show improved battery life. + + config HZ_100 + bool "100 HZ" +@@ -23,13 +22,14 @@ choice + with lots of processors that may show reduced performance if + too many timer interrupts are occurring. + +- config HZ_250 ++ config HZ_250_NODEFAULT + bool "250 HZ" + help +- 250 Hz is a good compromise choice allowing server performance +- while also showing good interactive responsiveness even +- on SMP and NUMA systems. If you are going to be using NTSC video +- or multimedia, selected 300Hz instead. ++ 250 HZ is a lousy compromise choice allowing server interactivity ++ while also showing desktop throughput and no extra power saving on ++ laptops. Good for when you can't make up your mind. 
++ ++ Recommend 100 or 1000 instead. + + config HZ_300 + bool "300 HZ" +@@ -45,12 +45,76 @@ choice + 1000 Hz is the preferred choice for desktop systems and other + systems requiring fast interactive responses to events. + ++ config HZ_1500 ++ bool "1500 HZ" ++ help ++ 1500 Hz is an insane value to use to run broken software that is Hz ++ limited. ++ ++ Being over 1000, driver breakage is likely. ++ ++ config HZ_2000 ++ bool "2000 HZ" ++ help ++ 2000 Hz is an insane value to use to run broken software that is Hz ++ limited. ++ ++ Being over 1000, driver breakage is likely. ++ ++ config HZ_3000 ++ bool "3000 HZ" ++ help ++ 3000 Hz is an insane value to use to run broken software that is Hz ++ limited. ++ ++ Being over 1000, driver breakage is likely. ++ ++ config HZ_4000 ++ bool "4000 HZ" ++ help ++ 4000 Hz is an insane value to use to run broken software that is Hz ++ limited. ++ ++ Being over 1000, driver breakage is likely. ++ ++ config HZ_5000 ++ bool "5000 HZ" ++ help ++ 5000 Hz is an obscene value to use to run broken software that is Hz ++ limited. ++ ++ Being over 1000, driver breakage is likely. ++ ++ config HZ_7500 ++ bool "7500 HZ" ++ help ++ 7500 Hz is an obscene value to use to run broken software that is Hz ++ limited. ++ ++ Being over 1000, driver breakage is likely. ++ ++ config HZ_10000 ++ bool "10000 HZ" ++ help ++ 10000 Hz is an obscene value to use to run broken software that is Hz ++ limited. ++ ++ Being over 1000, driver breakage is likely. 
++ ++ + endchoice + + config HZ + int + default 100 if HZ_100 +- default 250 if HZ_250 ++ default 250 if HZ_250_NODEFAULT + default 300 if HZ_300 + default 1000 if HZ_1000 ++ default 1500 if HZ_1500 ++ default 2000 if HZ_2000 ++ default 3000 if HZ_3000 ++ default 4000 if HZ_4000 ++ default 5000 if HZ_5000 ++ default 7500 if HZ_7500 ++ default 10000 if HZ_10000 + +Index: linux-2.6.21-ck1/arch/i386/defconfig +=================================================================== +--- linux-2.6.21-ck1.orig/arch/i386/defconfig 2007-05-04 12:24:00.000000000 +1000 ++++ linux-2.6.21-ck1/arch/i386/defconfig 2007-05-04 12:24:21.000000000 +1000 +@@ -214,10 +214,10 @@ CONFIG_MTRR=y + # CONFIG_IRQBALANCE is not set + CONFIG_SECCOMP=y + # CONFIG_HZ_100 is not set +-CONFIG_HZ_250=y ++# CONFIG_HZ_250 is not set + # CONFIG_HZ_300 is not set +-# CONFIG_HZ_1000 is not set +-CONFIG_HZ=250 ++CONFIG_HZ_1000=y ++CONFIG_HZ=1000 + # CONFIG_KEXEC is not set + # CONFIG_CRASH_DUMP is not set + CONFIG_PHYSICAL_START=0x100000 +Index: linux-2.6.21-ck1/arch/x86_64/defconfig +=================================================================== +--- linux-2.6.21-ck1.orig/arch/x86_64/defconfig 2007-05-04 12:24:00.000000000 +1000 ++++ linux-2.6.21-ck1/arch/x86_64/defconfig 2007-05-04 12:24:21.000000000 +1000 +@@ -178,10 +178,10 @@ CONFIG_PHYSICAL_START=0x200000 + CONFIG_SECCOMP=y + # CONFIG_CC_STACKPROTECTOR is not set + # CONFIG_HZ_100 is not set +-CONFIG_HZ_250=y ++# CONFIG_HZ_250 is not set + # CONFIG_HZ_300 is not set +-# CONFIG_HZ_1000 is not set +-CONFIG_HZ=250 ++CONFIG_HZ_1000=y ++CONFIG_HZ=1000 + # CONFIG_REORDER is not set + CONFIG_K8_NB=y + CONFIG_GENERIC_HARDIRQS=y +Index: linux-2.6.21-ck1/include/linux/jiffies.h +=================================================================== +--- linux-2.6.21-ck1.orig/include/linux/jiffies.h 2007-05-04 12:24:00.000000000 +1000 ++++ linux-2.6.21-ck1/include/linux/jiffies.h 2007-05-04 12:24:21.000000000 +1000 +@@ -29,6 +29,12 @@ + # define SHIFT_HZ 9 + 
#elif HZ >= 768 && HZ < 1536 + # define SHIFT_HZ 10 ++#elif HZ >= 1536 && HZ < 3072 ++# define SHIFT_HZ 11 ++#elif HZ >= 3072 && HZ < 6144 ++# define SHIFT_HZ 12 ++#elif HZ >= 6144 && HZ < 12288 ++# define SHIFT_HZ 13 + #else + # error You lose. + #endif +Index: linux-2.6.21-ck1/include/net/inet_timewait_sock.h +=================================================================== +--- linux-2.6.21-ck1.orig/include/net/inet_timewait_sock.h 2007-05-04 12:24:00.000000000 +1000 ++++ linux-2.6.21-ck1/include/net/inet_timewait_sock.h 2007-05-04 12:24:21.000000000 +1000 +@@ -38,8 +38,8 @@ struct inet_hashinfo; + * If time > 4sec, it is "slow" path, no recycling is required, + * so that we select tick to get range about 4 seconds. + */ +-#if HZ <= 16 || HZ > 4096 +-# error Unsupported: HZ <= 16 or HZ > 4096 ++#if HZ <= 16 || HZ > 16384 ++# error Unsupported: HZ <= 16 or HZ > 16384 + #elif HZ <= 32 + # define INET_TWDR_RECYCLE_TICK (5 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) + #elif HZ <= 64 +@@ -54,8 +54,12 @@ struct inet_hashinfo; + # define INET_TWDR_RECYCLE_TICK (10 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) + #elif HZ <= 2048 + # define INET_TWDR_RECYCLE_TICK (11 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) +-#else ++#elif HZ <= 4096 + # define INET_TWDR_RECYCLE_TICK (12 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) ++#elif HZ <= 8192 ++# define INET_TWDR_RECYCLE_TICK (13 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) ++#else ++# define INET_TWDR_RECYCLE_TICK (14 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) + #endif + + /* TIME_WAIT reaping mechanism. 
*/ +Index: linux-2.6.21-ck1/include/net/pkt_sched.h +=================================================================== +--- linux-2.6.21-ck1.orig/include/net/pkt_sched.h 2007-05-04 12:24:00.000000000 +1000 ++++ linux-2.6.21-ck1/include/net/pkt_sched.h 2007-05-04 12:24:21.000000000 +1000 +@@ -78,8 +78,14 @@ typedef long psched_tdiff_t; + #define PSCHED_JSCALE 12 + #elif HZ >= 384 && HZ < 768 + #define PSCHED_JSCALE 11 +-#elif HZ >= 768 ++#elif HZ >= 768 && HZ < 1536 + #define PSCHED_JSCALE 10 ++#elif HZ >= 1536 && HZ < 3072 ++#define PSCHED_JSCALE 9 ++#elif HZ >= 3072 && HZ < 6144 ++#define PSCHED_JSCALE 8 ++#else ++#define PSCHED_JSCALE 7 + #endif + + #define PSCHED_GET_TIME(stamp) ((stamp) = (get_jiffies_64()<loops_per_jiffy/(500000/HZ), +- (c->loops_per_jiffy/(5000/HZ)) % 100); ++ (c->loops_per_jiffy * 10/(50000/HZ)) % 100); + seq_printf(m, "clflush size\t: %u\n\n", c->x86_clflush_size); + + return 0; +Index: linux-2.6.21-ck1/arch/i386/kernel/smpboot.c +=================================================================== +--- linux-2.6.21-ck1.orig/arch/i386/kernel/smpboot.c 2007-05-04 12:24:00.000000000 +1000 ++++ linux-2.6.21-ck1/arch/i386/kernel/smpboot.c 2007-05-04 12:24:21.000000000 +1000 +@@ -1134,7 +1134,7 @@ static void __init smp_boot_cpus(unsigne + "Total of %d processors activated (%lu.%02lu BogoMIPS).\n", + cpucount+1, + bogosum/(500000/HZ), +- (bogosum/(5000/HZ))%100); ++ (bogosum * 10/(50000/HZ))%100); + + Dprintk("Before bogocount - setting activated=1.\n"); + +Index: linux-2.6.21-ck1/include/linux/nfsd/stats.h +=================================================================== +--- linux-2.6.21-ck1.orig/include/linux/nfsd/stats.h 2007-05-04 12:24:00.000000000 +1000 ++++ linux-2.6.21-ck1/include/linux/nfsd/stats.h 2007-05-04 12:24:21.000000000 +1000 +@@ -35,8 +35,8 @@ struct nfsd_stats { + + }; + +-/* thread usage wraps very million seconds (approx one fortnight) */ +-#define NFSD_USAGE_WRAP (HZ*1000000) ++/* thread usage wraps every one hundred 
thousand seconds (approx one day) */ ++#define NFSD_USAGE_WRAP (HZ*100000) + + #ifdef __KERNEL__ + +Index: linux-2.6.21-ck1/arch/x86_64/kernel/setup.c +=================================================================== +--- linux-2.6.21-ck1.orig/arch/x86_64/kernel/setup.c 2007-05-04 12:24:00.000000000 +1000 ++++ linux-2.6.21-ck1/arch/x86_64/kernel/setup.c 2007-05-04 12:24:22.000000000 +1000 +@@ -1053,7 +1053,7 @@ static int show_cpuinfo(struct seq_file + + seq_printf(m, "\nbogomips\t: %lu.%02lu\n", + c->loops_per_jiffy/(500000/HZ), +- (c->loops_per_jiffy/(5000/HZ)) % 100); ++ (c->loops_per_jiffy * 10/(50000/HZ)) % 100); + + if (c->x86_tlbsize > 0) + seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize); diff --git a/pkgs/os-specific/linux/kernel/patch-2.6.22-ck1 b/pkgs/os-specific/linux/kernel/patch-2.6.22-ck1 new file mode 100644 index 00000000000..81fa14e2abe --- /dev/null +++ b/pkgs/os-specific/linux/kernel/patch-2.6.22-ck1 @@ -0,0 +1,5167 @@ +Index: linux-2.6.22-ck1/include/linux/sched.h +=================================================================== +--- linux-2.6.22-ck1.orig/include/linux/sched.h 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/include/linux/sched.h 2007-07-10 14:55:21.000000000 +1000 +@@ -34,9 +34,14 @@ + #define SCHED_FIFO 1 + #define SCHED_RR 2 + #define SCHED_BATCH 3 ++#define SCHED_ISO 4 ++#define SCHED_IDLEPRIO 5 + + #ifdef __KERNEL__ + ++#define SCHED_MAX SCHED_IDLEPRIO ++#define SCHED_RANGE(policy) ((policy) <= SCHED_MAX) ++ + struct sched_param { + int sched_priority; + }; +@@ -129,7 +134,7 @@ + extern unsigned long nr_active(void); + extern unsigned long nr_iowait(void); + extern unsigned long weighted_cpuload(const int cpu); +- ++extern int above_background_load(void); + + /* + * Task state bitmask. NOTE! 
These bits are also +@@ -150,8 +155,7 @@ + #define EXIT_ZOMBIE 16 + #define EXIT_DEAD 32 + /* in tsk->state again */ +-#define TASK_NONINTERACTIVE 64 +-#define TASK_DEAD 128 ++#define TASK_DEAD 64 + + #define __set_task_state(tsk, state_value) \ + do { (tsk)->state = (state_value); } while (0) +@@ -537,14 +541,19 @@ + + #define MAX_USER_RT_PRIO 100 + #define MAX_RT_PRIO MAX_USER_RT_PRIO ++#define PRIO_RANGE (40) ++#define ISO_PRIO (MAX_RT_PRIO - 1) + +-#define MAX_PRIO (MAX_RT_PRIO + 40) ++#define MAX_PRIO (MAX_RT_PRIO + PRIO_RANGE) + +-#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO) ++#define rt_prio(prio) unlikely((prio) < ISO_PRIO) + #define rt_task(p) rt_prio((p)->prio) + #define batch_task(p) (unlikely((p)->policy == SCHED_BATCH)) +-#define is_rt_policy(p) ((p) != SCHED_NORMAL && (p) != SCHED_BATCH) ++#define is_rt_policy(policy) ((policy) == SCHED_FIFO || \ ++ (policy) == SCHED_RR) + #define has_rt_policy(p) unlikely(is_rt_policy((p)->policy)) ++#define iso_task(p) unlikely((p)->policy == SCHED_ISO) ++#define idleprio_task(p) unlikely((p)->policy == SCHED_IDLEPRIO) + + /* + * Some day this will be a full-fledged user tracking system.. +@@ -809,13 +818,6 @@ + struct pipe_inode_info; + struct uts_namespace; + +-enum sleep_type { +- SLEEP_NORMAL, +- SLEEP_NONINTERACTIVE, +- SLEEP_INTERACTIVE, +- SLEEP_INTERRUPTED, +-}; +- + struct prio_array; + + struct task_struct { +@@ -835,20 +837,33 @@ + int load_weight; /* for niceness load balancing purposes */ + int prio, static_prio, normal_prio; + struct list_head run_list; ++ /* ++ * This bitmap shows what priorities this task has received quota ++ * from for this major priority rotation on its current runqueue. 
++ */ ++ DECLARE_BITMAP(bitmap, PRIO_RANGE + 1); + struct prio_array *array; ++ /* Which major runqueue rotation did this task run */ ++ unsigned long rotation; + + unsigned short ioprio; + #ifdef CONFIG_BLK_DEV_IO_TRACE + unsigned int btrace_seq; + #endif +- unsigned long sleep_avg; + unsigned long long timestamp, last_ran; + unsigned long long sched_time; /* sched_clock time spent running */ +- enum sleep_type sleep_type; + + unsigned int policy; + cpumask_t cpus_allowed; +- unsigned int time_slice, first_time_slice; ++ /* ++ * How much this task is entitled to run at the current priority ++ * before being requeued at a lower priority. ++ */ ++ int time_slice; ++ /* Is this the very first time_slice this task has ever run. */ ++ unsigned int first_time_slice; ++ /* How much this task receives at each priority level */ ++ int quota; + + #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) + struct sched_info sched_info; +@@ -1013,6 +1028,7 @@ + struct held_lock held_locks[MAX_LOCK_DEPTH]; + unsigned int lockdep_recursion; + #endif ++ unsigned long mutexes_held; + + /* journalling filesystem info */ + void *journal_info; +@@ -1181,9 +1197,11 @@ + #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ + #define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */ + #define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */ ++#define PF_ISOREF 0x04000000 /* SCHED_ISO task has used up quota */ + #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ + #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ + #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezeable */ ++#define PF_NONSLEEP 0x80000000 /* Waiting on in-kernel activity */ + + /* + * Only the _current_ task can read/write to tsk->flags, but other +@@ -1253,7 +1271,7 @@ + #endif + + extern void set_user_nice(struct task_struct *p, long nice); +-extern int task_prio(const struct task_struct *p); ++extern 
int task_prio(struct task_struct *p); + extern int task_nice(const struct task_struct *p); + extern int can_nice(const struct task_struct *p, const int nice); + extern int task_curr(const struct task_struct *p); +Index: linux-2.6.22-ck1/kernel/sched.c +=================================================================== +--- linux-2.6.22-ck1.orig/kernel/sched.c 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/kernel/sched.c 2007-07-10 14:55:24.000000000 +1000 +@@ -16,6 +16,7 @@ + * by Davide Libenzi, preemptible kernel bits by Robert Love. + * 2003-09-03 Interactivity tuning by Con Kolivas. + * 2004-04-02 Scheduler domains code by Nick Piggin ++ * 2007-03-02 Staircase deadline scheduling policy by Con Kolivas + */ + + #include +@@ -53,8 +54,9 @@ + #include + #include + #include +- ++#include + #include ++ + #include + + /* +@@ -84,147 +86,85 @@ + #define USER_PRIO(p) ((p)-MAX_RT_PRIO) + #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) + #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) ++#define SCHED_PRIO(p) ((p)+MAX_RT_PRIO) + +-/* +- * Some helpers for converting nanosecond timing to jiffy resolution +- */ +-#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) ++/* Some helpers for converting to/from various scales.*/ + #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) +- +-/* +- * These are the 'tuning knobs' of the scheduler: +- * +- * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger), +- * default timeslice is 100 msecs, maximum timeslice is 800 msecs. +- * Timeslices get refilled after they expire. 
+- */ +-#define MIN_TIMESLICE max(5 * HZ / 1000, 1) +-#define DEF_TIMESLICE (100 * HZ / 1000) +-#define ON_RUNQUEUE_WEIGHT 30 +-#define CHILD_PENALTY 95 +-#define PARENT_PENALTY 100 +-#define EXIT_WEIGHT 3 +-#define PRIO_BONUS_RATIO 25 +-#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100) +-#define INTERACTIVE_DELTA 2 +-#define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS) +-#define STARVATION_LIMIT (MAX_SLEEP_AVG) +-#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) +- +-/* +- * If a task is 'interactive' then we reinsert it in the active +- * array after it has expired its current timeslice. (it will not +- * continue to run immediately, it will still roundrobin with +- * other interactive tasks.) +- * +- * This part scales the interactivity limit depending on niceness. +- * +- * We scale it linearly, offset by the INTERACTIVE_DELTA delta. +- * Here are a few examples of different nice levels: +- * +- * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0] +- * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0] +- * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0] +- * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0] +- * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0] +- * +- * (the X axis represents the possible -5 ... 0 ... +5 dynamic +- * priority range a task can explore, a value of '1' means the +- * task is rated interactive.) +- * +- * Ie. nice +19 tasks can never get 'interactive' enough to be +- * reinserted into the active array. And only heavily CPU-hog nice -20 +- * tasks will be expired. Default nice 0 tasks are somewhere between, +- * it takes some effort for them to get interactive, but it's not +- * too hard. +- */ +- +-#define CURRENT_BONUS(p) \ +- (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \ +- MAX_SLEEP_AVG) +- +-#define GRANULARITY (10 * HZ / 1000 ? : 1) +- +-#ifdef CONFIG_SMP +-#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ +- (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? 
: 1) - 1)) * \ +- num_online_cpus()) +-#else +-#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ +- (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1))) +-#endif +- +-#define SCALE(v1,v1_max,v2_max) \ +- (v1) * (v2_max) / (v1_max) +- +-#define DELTA(p) \ +- (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \ +- INTERACTIVE_DELTA) +- +-#define TASK_INTERACTIVE(p) \ +- ((p)->prio <= (p)->static_prio - DELTA(p)) +- +-#define INTERACTIVE_SLEEP(p) \ +- (JIFFIES_TO_NS(MAX_SLEEP_AVG * \ +- (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) +- +-#define TASK_PREEMPTS_CURR(p, rq) \ +- ((p)->prio < (rq)->curr->prio) +- +-#define SCALE_PRIO(x, prio) \ +- max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) +- +-static unsigned int static_prio_timeslice(int static_prio) +-{ +- if (static_prio < NICE_TO_PRIO(0)) +- return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio); +- else +- return SCALE_PRIO(DEF_TIMESLICE, static_prio); +-} +- +-#ifdef CONFIG_SMP +-/* +- * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) +- * Since cpu_power is a 'constant', we can use a reciprocal divide. ++#define MS_TO_NS(TIME) ((TIME) * 1000000) ++#define MS_TO_US(TIME) ((TIME) * 1000) ++#define US_TO_MS(TIME) ((TIME) / 1000) ++ ++#define TASK_PREEMPTS_CURR(p, curr) ((p)->prio < (curr)->prio) ++ ++/* ++ * This is the time all tasks within the same priority round robin. ++ * Value is in ms and set to a minimum of 10ms. Scales with number of cpus. ++ * Tunable via /proc interface. ++ */ ++int rr_interval __read_mostly = 6; ++int sched_interactive __read_mostly = 1; ++ ++/* ++ * sched_iso_cpu - sysctl which determines the cpu percentage SCHED_ISO tasks ++ * are allowed to run (over ISO_PERIOD seconds) as real time tasks. ++ * sched_iso_period - sysctl which determines the number of seconds over ++ * which cpu usage of SCHED_ISO tasks is averaged to determine if they are ++ * exceeding their allowable bandwidth. 
++*/ ++int sched_iso_cpu __read_mostly = 80; ++int sched_iso_period __read_mostly = 5; ++ ++#define ISO_PERIOD ((sched_iso_period * HZ) + 1) ++ ++/* ++ * This contains a bitmap for each dynamic priority level with empty slots ++ * for the valid priorities each different nice level can have. It allows ++ * us to stagger the slots where differing priorities run in a way that ++ * keeps latency differences between different nice levels at a minimum. ++ * The purpose of a pre-generated matrix is for rapid lookup of next slot in ++ * O(1) time without having to recalculate every time priority gets demoted. ++ * All nice levels use priority slot 39 as this allows less niced tasks to ++ * get all priority slots better than that before expiration is forced. ++ * ie, where 0 means a slot for that priority, priority running from left to ++ * right is from prio 0 to prio 39: ++ * nice -20 0000000000000000000000000000000000000000 ++ * nice -10 1000100010001000100010001000100010010000 ++ * nice 0 1010101010101010101010101010101010101010 ++ * nice 5 1011010110110101101101011011010110110110 ++ * nice 10 1110111011101110111011101110111011101110 ++ * nice 15 1111111011111110111111101111111011111110 ++ * nice 19 1111111111111111111111111111111111111110 + */ +-static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load) +-{ +- return reciprocal_divide(load, sg->reciprocal_cpu_power); +-} ++static unsigned long prio_matrix[PRIO_RANGE][BITS_TO_LONGS(PRIO_RANGE)] ++ __read_mostly; + +-/* +- * Each time a sched group cpu_power is changed, +- * we must compute its reciprocal value +- */ +-static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val) +-{ +- sg->__cpu_power += val; +- sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power); +-} +-#endif ++struct rq; + + /* +- * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] +- * to time slice values: [800ms ... 100ms ... 
5ms] +- * +- * The higher a thread's priority, the bigger timeslices +- * it gets during one round of execution. But even the lowest +- * priority thread gets MIN_TIMESLICE worth of execution time. ++ * These are the runqueue data structures: + */ ++struct prio_array { ++ /* Tasks queued at each priority */ ++ struct list_head queue[MAX_PRIO + 1]; + +-static inline unsigned int task_timeslice(struct task_struct *p) +-{ +- return static_prio_timeslice(p->static_prio); +-} ++ /* ++ * The bitmap of priorities queued for this array. While the expired ++ * array will never have realtime tasks on it, it is simpler to have ++ * equal sized bitmaps for a cheap array swap. Include 1 bit for ++ * delimiter. ++ */ ++ DECLARE_BITMAP(prio_bitmap, MAX_PRIO + 1); + +-/* +- * These are the runqueue data structures: +- */ ++ /* ++ * The best static priority (of the dynamic priority tasks) queued ++ * this array. ++ */ ++ int best_static_prio; + +-struct prio_array { +- unsigned int nr_active; +- DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */ +- struct list_head queue[MAX_PRIO]; ++#ifdef CONFIG_SMP ++ /* For convenience looks back at rq */ ++ struct rq *rq; ++#endif + }; + + /* +@@ -260,14 +200,28 @@ + */ + unsigned long nr_uninterruptible; + +- unsigned long expired_timestamp; + /* Cached timestamp set by update_cpu_clock() */ + unsigned long long most_recent_timestamp; + struct task_struct *curr, *idle; + unsigned long next_balance; + struct mm_struct *prev_mm; +- struct prio_array *active, *expired, arrays[2]; +- int best_expired_prio; ++ ++ struct prio_array *active, *expired, *idleprio, arrays[2]; ++ unsigned long *dyn_bitmap, *exp_bitmap; ++ ++ /* ++ * The current dynamic priority level this runqueue is at per static ++ * priority level. 
++ */ ++ int prio_level[PRIO_RANGE]; ++ ++ /* How many times we have rotated the priority queue */ ++ unsigned long prio_rotation; ++ unsigned long iso_ticks; ++ unsigned short iso_refractory; ++ ++ /* Number of idleprio tasks running */ ++ unsigned long nr_idleprio; + atomic_t nr_iowait; + + #ifdef CONFIG_SMP +@@ -606,12 +560,9 @@ + #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) + /* + * Called when a process is dequeued from the active array and given +- * the cpu. We should note that with the exception of interactive +- * tasks, the expired queue will become the active queue after the active +- * queue is empty, without explicitly dequeuing and requeuing tasks in the +- * expired queue. (Interactive tasks may be requeued directly to the +- * active queue, thus delaying tasks in the expired queue from running; +- * see scheduler_tick()). ++ * the cpu. We should note that the expired queue will become the active ++ * queue after the active queue is empty, without explicitly dequeuing and ++ * requeuing tasks in the expired queue. + * + * This function is only called from sched_info_arrive(), rather than + * dequeue_task(). Even though a task may be queued and dequeued multiple +@@ -709,71 +660,304 @@ + #define sched_info_switch(t, next) do { } while (0) + #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ + ++static int idleprio_suitable(struct task_struct *p) ++{ ++ return (!p->mutexes_held && !freezing(p) && !signal_pending(p) && ++ !(p->flags & (PF_NONSLEEP | PF_EXITING))); ++} ++ ++static int idleprio(const struct task_struct *p) ++{ ++ return (p->prio == MAX_PRIO); ++} ++ ++static inline int task_queued(struct task_struct *task) ++{ ++ return !list_empty(&task->run_list); ++} ++ ++static inline void set_dynamic_bit(struct task_struct *p, struct rq *rq) ++{ ++ __set_bit(p->prio, p->array->prio_bitmap); ++} ++ + /* +- * Adding/removing a task to/from a priority array: ++ * Removing from a runqueue. 
+ */ +-static void dequeue_task(struct task_struct *p, struct prio_array *array) ++static void dequeue_task(struct task_struct *p, struct rq *rq) + { +- array->nr_active--; +- list_del(&p->run_list); +- if (list_empty(array->queue + p->prio)) +- __clear_bit(p->prio, array->bitmap); ++ list_del_init(&p->run_list); ++ if (idleprio_task(p) && idleprio(p)) ++ rq->nr_idleprio--; ++ else if (list_empty(p->array->queue + p->prio)) ++ __clear_bit(p->prio, p->array->prio_bitmap); + } + +-static void enqueue_task(struct task_struct *p, struct prio_array *array) ++static void reset_first_time_slice(struct task_struct *p) + { +- sched_info_queued(p); +- list_add_tail(&p->run_list, array->queue + p->prio); +- __set_bit(p->prio, array->bitmap); +- array->nr_active++; ++ if (unlikely(p->first_time_slice)) ++ p->first_time_slice = 0; ++} ++ ++/* ++ * The task is being queued on a fresh array so it has its entitlement ++ * bitmap cleared. ++ */ ++static void task_new_array(struct task_struct *p, struct rq *rq, ++ struct prio_array *array) ++{ ++ bitmap_zero(p->bitmap, PRIO_RANGE); ++ p->rotation = rq->prio_rotation; ++ p->time_slice = p->quota; + p->array = array; ++ reset_first_time_slice(p); ++} ++ ++/* Find the first slot from the relevant prio_matrix entry */ ++static int first_prio_slot(struct task_struct *p) ++{ ++ if (unlikely(p->policy == SCHED_BATCH)) ++ return p->static_prio; ++ return SCHED_PRIO(find_first_zero_bit( ++ prio_matrix[USER_PRIO(p->static_prio)], PRIO_RANGE)); + } + + /* +- * Put task to the end of the run list without the overhead of dequeue +- * followed by enqueue. ++ * In sched_interactive mode priority allocation occurs per process per rq ++ * array swap. In !sched_interactive mode all waking tasks must obey the ++ * current prio level of all other tasks running per array swap. 
+ */ +-static void requeue_task(struct task_struct *p, struct prio_array *array) ++static int minprio(struct rq *rq, int uprio) + { +- list_move_tail(&p->run_list, array->queue + p->prio); ++ if (sched_interactive) ++ return MAX_RT_PRIO; ++ return rq->prio_level[uprio]; + } + +-static inline void +-enqueue_task_head(struct task_struct *p, struct prio_array *array) ++/* ++ * Find the first unused slot by this task that is also in its prio_matrix ++ * level. SCHED_BATCH tasks do not use the priority matrix. They only take ++ * priority slots from their static_prio and above. ++ */ ++static int next_entitled_slot(struct task_struct *p, struct rq *rq) + { +- list_add(&p->run_list, array->queue + p->prio); +- __set_bit(p->prio, array->bitmap); +- array->nr_active++; +- p->array = array; ++ int search_prio = MAX_RT_PRIO, uprio = USER_PRIO(p->static_prio); ++ struct prio_array *array = rq->active; ++ DECLARE_BITMAP(tmp, PRIO_RANGE); ++ ++ /* ++ * Go straight to expiration if there are higher priority tasks ++ * already expired. ++ */ ++ if (p->static_prio > rq->expired->best_static_prio) ++ return MAX_PRIO; ++ if (!rq->prio_level[uprio]) ++ rq->prio_level[uprio] = MAX_RT_PRIO; ++ /* ++ * Only priorities equal to the prio_level and above for their ++ * static_prio are acceptable, and only if it's not better than ++ * a queued better static_prio's prio_level. 
++ */ ++ if (p->static_prio < array->best_static_prio) { ++ if (likely(p->policy != SCHED_BATCH)) ++ array->best_static_prio = p->static_prio; ++ } else if (p->static_prio == array->best_static_prio) { ++ search_prio = minprio(rq, uprio); ++ } else { ++ int i; ++ ++ search_prio = minprio(rq, uprio); ++ /* A bound O(n) function, worst case n is 40 */ ++ for (i = array->best_static_prio; i <= p->static_prio ; i++) { ++ if (!rq->prio_level[USER_PRIO(i)]) ++ rq->prio_level[USER_PRIO(i)] = MAX_RT_PRIO; ++ search_prio = max(search_prio, ++ rq->prio_level[USER_PRIO(i)]); ++ } ++ } ++ if (unlikely(p->policy == SCHED_BATCH)) { ++ search_prio = max(search_prio, p->static_prio); ++ return SCHED_PRIO(find_next_zero_bit(p->bitmap, PRIO_RANGE, ++ USER_PRIO(search_prio))); ++ } ++ bitmap_or(tmp, p->bitmap, prio_matrix[uprio], PRIO_RANGE); ++ return SCHED_PRIO(find_next_zero_bit(tmp, PRIO_RANGE, ++ USER_PRIO(search_prio))); ++} ++ ++static void queue_expired(struct task_struct *p, struct rq *rq) ++{ ++ task_new_array(p, rq, rq->expired); ++ p->prio = p->normal_prio = first_prio_slot(p); ++ if (p->static_prio < rq->expired->best_static_prio) ++ rq->expired->best_static_prio = p->static_prio; ++ reset_first_time_slice(p); + } + ++#ifdef CONFIG_SMP + /* +- * __normal_prio - return the priority that is based on the static +- * priority but is modified by bonuses/penalties. +- * +- * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] +- * into the -5 ... 0 ... +5 bonus/penalty range. +- * +- * We use 25% of the full 0...39 priority range so that: +- * +- * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. +- * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. +- * +- * Both properties are important to certain workloads. ++ * If we're waking up a task that was previously on a different runqueue, ++ * update its data appropriately. Note we may be reading data from src_rq-> ++ * outside of lock, but the occasional inaccurate result should be harmless. 
+ */ ++ static void update_if_moved(struct task_struct *p, struct rq *rq) ++{ ++ struct rq *src_rq = p->array->rq; ++ ++ if (src_rq == rq) ++ return; ++ /* ++ * Only need to set p->array when p->rotation == rq->prio_rotation as ++ * they will be set in recalc_task_prio when != rq->prio_rotation. ++ */ ++ if (p->rotation == src_rq->prio_rotation) { ++ p->rotation = rq->prio_rotation; ++ if (p->array == src_rq->expired) ++ p->array = rq->expired; ++ else ++ p->array = rq->active; ++ } else ++ p->rotation = 0; ++} ++#else ++static inline void update_if_moved(struct task_struct *p, struct rq *rq) ++{ ++} ++#endif + +-static inline int __normal_prio(struct task_struct *p) ++static inline int isoprio_suitable(struct task_struct *p) + { +- int bonus, prio; ++ return !(p->flags & PF_ISOREF); ++} + +- bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; ++static int task_timeslice(struct task_struct *p); + +- prio = p->static_prio - bonus; +- if (prio < MAX_RT_PRIO) +- prio = MAX_RT_PRIO; +- if (prio > MAX_PRIO-1) +- prio = MAX_PRIO-1; +- return prio; ++/* ++ * recalc_task_prio determines what priority a non rt_task will be ++ * queued at. If the task has already been running during this runqueue's ++ * major rotation (rq->prio_rotation) then it continues at the same ++ * priority if it has tick entitlement left. If it does not have entitlement ++ * left, it finds the next priority slot according to its nice value that it ++ * has not extracted quota from. If it has not run during this major ++ * rotation, it starts at the next_entitled_slot and has its bitmap quota ++ * cleared. If it does not have any slots left it has all its slots reset and ++ * is queued on the expired at its first_prio_slot. 
++ */ ++static void recalc_task_prio(struct task_struct *p, struct rq *rq) ++{ ++ struct prio_array *array = rq->active; ++ int queue_prio; ++ ++ if (iso_task(p)) { ++ if (isoprio_suitable(p)) { ++ /* ++ * If SCHED_ISO tasks have not used up their real time ++ * quota they have run just better than highest ++ * SCHED_NORMAL priority. Otherwise they run as ++ * SCHED_NORMAL. ++ */ ++ p->prio = p->normal_prio = ISO_PRIO; ++ p->array = rq->active; ++ if (p->time_slice <= 0) ++ p->time_slice = p->quota; ++ return; ++ } else if (p->prio == ISO_PRIO) { ++ /* Just about to be demoted to SCHED_NORMAL */ ++ p->time_slice = 0; ++ } ++ } else if (idleprio_task(p)) { ++ if (idleprio_suitable(p)) { ++ /* ++ * If suitable idleprio_tasks are queued at MAX_PRIO ++ * only on the idleprio array. Their time_slice is ++ * their full task_timeslice as they cooperatively ++ * multitask. ++ */ ++ p->prio = p->normal_prio = MAX_PRIO; ++ p->array = rq->idleprio; ++ if (p->time_slice <= 0) ++ p->time_slice = task_timeslice(p); ++ return; ++ } ++ /* ++ * If unsuitable idleprio_tasks are queued equivalent to ++ * nice 19 tasks on the expired array. ++ */ ++ p->flags &= ~PF_NONSLEEP; ++ p->prio = p->normal_prio = MAX_PRIO - 1; ++ p->array = rq->expired; ++ if (p->time_slice <= 0 || p->time_slice > p->quota) ++ p->time_slice = p->quota; ++ return; ++ } ++ ++ update_if_moved(p, rq); ++ if (p->rotation == rq->prio_rotation) { ++ if (p->array == array) { ++ if (p->time_slice > 0) ++ return; ++ p->time_slice = p->quota; ++ } else if (p->array == rq->expired) { ++ queue_expired(p, rq); ++ return; ++ } else ++ task_new_array(p, rq, array); ++ } else ++ task_new_array(p, rq, array); ++ ++ queue_prio = next_entitled_slot(p, rq); ++ if (queue_prio >= MAX_PRIO) { ++ queue_expired(p, rq); ++ return; ++ } ++ p->prio = p->normal_prio = queue_prio; ++ __set_bit(USER_PRIO(p->prio), p->bitmap); ++} ++ ++/* ++ * Adding to a runqueue. 
The dynamic priority queue that it is added to is ++ * determined by recalc_task_prio() above. ++ */ ++static inline void __enqueue_task(struct task_struct *p, struct rq *rq) ++{ ++ if (rt_task(p)) ++ p->array = rq->active; ++ else ++ recalc_task_prio(p, rq); ++ ++ if (idleprio_task(p) && idleprio(p)) ++ rq->nr_idleprio++; ++ sched_info_queued(p); ++ set_dynamic_bit(p, rq); ++} ++ ++static void enqueue_task(struct task_struct *p, struct rq *rq) ++{ ++ __enqueue_task(p, rq); ++ list_add_tail(&p->run_list, p->array->queue + p->prio); ++} ++ ++static inline void enqueue_task_head(struct task_struct *p, struct rq *rq) ++{ ++ __enqueue_task(p, rq); ++ list_add(&p->run_list, p->array->queue + p->prio); ++} ++ ++/* ++ * requeue_task is only called when p->static_prio does not change. p->prio ++ * can change with dynamic tasks. ++ */ ++static void requeue_task(struct task_struct *p, struct rq *rq, ++ struct prio_array *old_array, int old_prio) ++{ ++ if (p->array == rq->expired) ++ queue_expired(p, rq); ++ list_move_tail(&p->run_list, p->array->queue + p->prio); ++ if (!rt_task(p)) { ++ if (list_empty(old_array->queue + old_prio)) ++ __clear_bit(old_prio, old_array->prio_bitmap); ++ set_dynamic_bit(p, rq); ++ } + } + + /* +@@ -786,20 +970,29 @@ + */ + + /* +- * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE +- * If static_prio_timeslice() is ever changed to break this assumption then +- * this code will need modification +- */ +-#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE +-#define LOAD_WEIGHT(lp) \ +- (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO) +-#define PRIO_TO_LOAD_WEIGHT(prio) \ +- LOAD_WEIGHT(static_prio_timeslice(prio)) +-#define RTPRIO_TO_LOAD_WEIGHT(rp) \ +- (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp)) ++ * task_timeslice - the total duration a task can run during one major ++ * rotation. Returns value in milliseconds as the smallest value can be 1. 
++ */ ++static int task_timeslice(struct task_struct *p) ++{ ++ int slice = p->quota; /* quota is in us */ ++ ++ if (!rt_task(p)) ++ slice += (PRIO_RANGE - 1 - TASK_USER_PRIO(p)) * slice; ++ return US_TO_MS(slice); ++} ++ ++/* ++ * The load weight is basically the task_timeslice in ms. Realtime tasks are ++ * special cased to be proportionately larger than nice -20 by their ++ * rt_priority. The weight for rt tasks can only be arbitrary at best. ++ */ ++#define RTPRIO_TO_LOAD_WEIGHT(rp) (rr_interval * 20 * (40 + rp)) + + static void set_load_weight(struct task_struct *p) + { ++ int load_weight; ++ + if (has_rt_policy(p)) { + #ifdef CONFIG_SMP + if (p == task_rq(p)->migration_thread) +@@ -808,12 +1001,19 @@ + * Giving its load any weight will skew balancing + * adversely. + */ +- p->load_weight = 0; ++ load_weight = 0; + else + #endif +- p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority); ++ load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority); + } else +- p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio); ++ load_weight = task_timeslice(p); ++ /* ++ * idleprio tasks have much lower weight than SCHED_NORMAL tasks but ++ * still need to be weighted to allow balancing to occur. ++ */ ++ if (likely(!idleprio_task(p))) ++ load_weight *= PRIO_RANGE; ++ p->load_weight = load_weight; + } + + static inline void +@@ -841,28 +1041,38 @@ + } + + /* +- * Calculate the expected normal priority: i.e. priority +- * without taking RT-inheritance into account. Might be +- * boosted by interactivity modifiers. Changes upon fork, +- * setprio syscalls, and whenever the interactivity +- * estimator recalculates. ++ * __activate_task - move a task to the runqueue. + */ +-static inline int normal_prio(struct task_struct *p) ++static inline void __activate_task(struct task_struct *p, struct rq *rq) + { +- int prio; ++ enqueue_task(p, rq); ++ inc_nr_running(p, rq); ++} + ++/* ++ * __activate_idle_task - move idle task to the _front_ of runqueue. 
++ */ ++static inline void __activate_idle_task(struct task_struct *p, struct rq *rq) ++{ ++ enqueue_task_head(p, rq); ++ inc_nr_running(p, rq); ++} ++ ++static inline int normal_prio(struct task_struct *p) ++{ + if (has_rt_policy(p)) +- prio = MAX_RT_PRIO-1 - p->rt_priority; ++ return MAX_RT_PRIO-1 - p->rt_priority; ++ /* Other tasks all have normal_prio set in recalc_task_prio */ ++ if (likely(p->prio >= MAX_RT_PRIO && p->prio < MAX_PRIO)) ++ return p->prio; + else +- prio = __normal_prio(p); +- return prio; ++ return p->static_prio; + } + + /* + * Calculate the current priority, i.e. the priority + * taken into account by the scheduler. This value might +- * be boosted by RT tasks, or might be boosted by +- * interactivity modifiers. Will be RT if the task got ++ * be boosted by RT tasks as it will be RT if the task got + * RT-boosted. If not then it returns p->normal_prio. + */ + static int effective_prio(struct task_struct *p) +@@ -878,112 +1088,70 @@ + return p->prio; + } + +-/* +- * __activate_task - move a task to the runqueue. +- */ +-static void __activate_task(struct task_struct *p, struct rq *rq) ++static inline unsigned int nice_quota_ms(int nice) + { +- struct prio_array *target = rq->active; ++ unsigned int rr = rr_interval; + +- if (batch_task(p)) +- target = rq->expired; +- enqueue_task(p, target); +- inc_nr_running(p, rq); ++ if (nice < -6) { ++ rr *= nice * nice; ++ rr /= 40; ++ } else if (nice > 0) ++ rr = rr / 2 ? : 1; ++ return rr; + } + ++#define DEFAULT_WEIGHT (nice_quota_ms(0) * 20 * PRIO_RANGE) ++ + /* +- * __activate_idle_task - move idle task to the _front_ of runqueue. ++ * A runqueue laden with a single nice 0 task scores a weighted_cpuload of ++ * SCHED_LOAD_SCALE. 
This function returns 1 if any cpu is laden with a ++ * task of nice 0 or enough lower priority tasks to bring up the ++ * weighted_cpuload + */ +-static inline void __activate_idle_task(struct task_struct *p, struct rq *rq) ++int above_background_load(void) + { +- enqueue_task_head(p, rq->active); +- inc_nr_running(p, rq); ++ unsigned long cpu; ++ ++ for_each_online_cpu(cpu) { ++ if (weighted_cpuload(cpu) >= DEFAULT_WEIGHT) ++ return 1; ++ } ++ return 0; + } + + /* +- * Recalculate p->normal_prio and p->prio after having slept, +- * updating the sleep-average too: ++ * All tasks have quotas based on rr_interval. RT tasks all get rr_interval. ++ * From nice 1 to 19 they are smaller than it only if they are at least one ++ * tick still. Below nice 0 they get progressively larger. ++ * ie nice -6..0 = rr_interval. nice -10 = 2.5 * rr_interval ++ * nice -20 = 10 * rr_interval. nice 1-19 = rr_interval / 2. ++ * Value returned is in microseconds. + */ +-static int recalc_task_prio(struct task_struct *p, unsigned long long now) ++static inline unsigned int rr_quota(struct task_struct *p) + { +- /* Caller must always ensure 'now >= p->timestamp' */ +- unsigned long sleep_time = now - p->timestamp; ++ unsigned int quota; + +- if (batch_task(p)) +- sleep_time = 0; +- +- if (likely(sleep_time > 0)) { +- /* +- * This ceiling is set to the lowest priority that would allow +- * a task to be reinserted into the active array on timeslice +- * completion. +- */ +- unsigned long ceiling = INTERACTIVE_SLEEP(p); +- +- if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) { +- /* +- * Prevents user tasks from achieving best priority +- * with one single large enough sleep. +- */ +- p->sleep_avg = ceiling; +- /* +- * Using INTERACTIVE_SLEEP() as a ceiling places a +- * nice(0) task 1ms sleep away from promotion, and +- * gives it 700ms to round-robin with no chance of +- * being demoted. 
This is more than generous, so +- * mark this sleep as non-interactive to prevent the +- * on-runqueue bonus logic from intervening should +- * this task not receive cpu immediately. +- */ +- p->sleep_type = SLEEP_NONINTERACTIVE; +- } else { +- /* +- * Tasks waking from uninterruptible sleep are +- * limited in their sleep_avg rise as they +- * are likely to be waiting on I/O +- */ +- if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) { +- if (p->sleep_avg >= ceiling) +- sleep_time = 0; +- else if (p->sleep_avg + sleep_time >= +- ceiling) { +- p->sleep_avg = ceiling; +- sleep_time = 0; +- } +- } +- +- /* +- * This code gives a bonus to interactive tasks. +- * +- * The boost works by updating the 'average sleep time' +- * value here, based on ->timestamp. The more time a +- * task spends sleeping, the higher the average gets - +- * and the higher the priority boost gets as well. +- */ +- p->sleep_avg += sleep_time; +- +- } +- if (p->sleep_avg > NS_MAX_SLEEP_AVG) +- p->sleep_avg = NS_MAX_SLEEP_AVG; +- } ++ if (rt_task(p)) ++ quota = rr_interval; ++ else ++ quota = nice_quota_ms(TASK_NICE(p)); ++ return MS_TO_US(quota); ++} + +- return effective_prio(p); ++/* Every time we set the quota we need to set the load weight */ ++static void set_quota(struct task_struct *p) ++{ ++ p->quota = rr_quota(p); ++ set_load_weight(p); + } + + /* + * activate_task - move a task to the runqueue and do priority recalculation +- * +- * Update all the scheduling statistics stuff. (sleep average +- * calculation, priority modifiers, etc.) 
+ */ + static void activate_task(struct task_struct *p, struct rq *rq, int local) + { +- unsigned long long now; +- +- if (rt_task(p)) +- goto out; ++ unsigned long long now = sched_clock(); + +- now = sched_clock(); + #ifdef CONFIG_SMP + if (!local) { + /* Compensate for drifting sched_clock */ +@@ -1004,32 +1172,9 @@ + (now - p->timestamp) >> 20); + } + +- p->prio = recalc_task_prio(p, now); +- +- /* +- * This checks to make sure it's not an uninterruptible task +- * that is now waking up. +- */ +- if (p->sleep_type == SLEEP_NORMAL) { +- /* +- * Tasks which were woken up by interrupts (ie. hw events) +- * are most likely of interactive nature. So we give them +- * the credit of extending their sleep time to the period +- * of time they spend on the runqueue, waiting for execution +- * on a CPU, first time around: +- */ +- if (in_interrupt()) +- p->sleep_type = SLEEP_INTERRUPTED; +- else { +- /* +- * Normal first-time wakeups get a credit too for +- * on-runqueue time, but it will be weighted down: +- */ +- p->sleep_type = SLEEP_INTERACTIVE; +- } +- } ++ set_quota(p); ++ p->prio = effective_prio(p); + p->timestamp = now; +-out: + __activate_task(p, rq); + } + +@@ -1039,8 +1184,7 @@ + static void deactivate_task(struct task_struct *p, struct rq *rq) + { + dec_nr_running(p, rq); +- dequeue_task(p, p->array); +- p->array = NULL; ++ dequeue_task(p, rq); + } + + /* +@@ -1133,7 +1277,7 @@ + * If the task is not on a runqueue (and not running), then + * it is sufficient to simply update the task's cpu field. 
+ */ +- if (!p->array && !task_running(rq, p)) { ++ if (!task_queued(p) && !task_running(rq, p)) { + set_task_cpu(p, dest_cpu); + return 0; + } +@@ -1159,7 +1303,6 @@ + { + unsigned long flags; + struct rq *rq; +- struct prio_array *array; + int running; + + repeat: +@@ -1192,7 +1335,6 @@ + */ + rq = task_rq_lock(p, &flags); + running = task_running(rq, p); +- array = p->array; + task_rq_unlock(rq, &flags); + + /* +@@ -1215,7 +1357,7 @@ + * running right now), it's preempted, and we should + * yield - it could be a while. + */ +- if (unlikely(array)) { ++ if (unlikely(task_queued(p))) { + yield(); + goto repeat; + } +@@ -1294,6 +1436,25 @@ + } + + /* ++ * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) ++ * Since cpu_power is a 'constant', we can use a reciprocal divide. ++ */ ++static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load) ++{ ++ return reciprocal_divide(load, sg->reciprocal_cpu_power); ++} ++ ++/* ++ * Each time a sched group cpu_power is changed, ++ * we must compute its reciprocal value ++ */ ++static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val) ++{ ++ sg->__cpu_power += val; ++ sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power); ++} ++ ++/* + * find_idlest_group finds and returns the least busy CPU group within the + * domain. + */ +@@ -1490,6 +1651,31 @@ + } + #endif + ++/* ++ * We need to have a special definition for an idle runqueue when testing ++ * for preemption on CONFIG_HOTPLUG_CPU as the idle task may be scheduled as ++ * a realtime task in sched_idle_next. 
++ */ ++#ifdef CONFIG_HOTPLUG_CPU ++#define rq_idle(rq) ((rq)->curr == (rq)->idle && !rt_task((rq)->curr)) ++#else ++#define rq_idle(rq) ((rq)->curr == (rq)->idle) ++#endif ++ ++static inline int task_preempts_curr(struct task_struct *p, struct rq *rq) ++{ ++ struct task_struct *curr = rq->curr; ++ ++ return ((p->array == task_rq(p)->active && ++ TASK_PREEMPTS_CURR(p, curr)) || rq_idle(rq)); ++} ++ ++static inline void try_preempt(struct task_struct *p, struct rq *rq) ++{ ++ if (task_preempts_curr(p, rq)) ++ resched_task(rq->curr); ++} ++ + /*** + * try_to_wake_up - wake up a thread + * @p: the to-be-woken-up thread +@@ -1521,7 +1707,7 @@ + if (!(old_state & state)) + goto out; + +- if (p->array) ++ if (task_queued(p)) + goto out_running; + + cpu = task_cpu(p); +@@ -1614,7 +1800,7 @@ + old_state = p->state; + if (!(old_state & state)) + goto out; +- if (p->array) ++ if (task_queued(p)) + goto out_running; + + this_cpu = smp_processor_id(); +@@ -1623,25 +1809,9 @@ + + out_activate: + #endif /* CONFIG_SMP */ +- if (old_state == TASK_UNINTERRUPTIBLE) { ++ if (old_state == TASK_UNINTERRUPTIBLE) + rq->nr_uninterruptible--; +- /* +- * Tasks on involuntary sleep don't earn +- * sleep_avg beyond just interactive state. +- */ +- p->sleep_type = SLEEP_NONINTERACTIVE; +- } else +- +- /* +- * Tasks that have marked their sleep as noninteractive get +- * woken up with their sleep average not weighted in an +- * interactive way. +- */ +- if (old_state & TASK_NONINTERACTIVE) +- p->sleep_type = SLEEP_NONINTERACTIVE; +- + +- activate_task(p, rq, cpu == this_cpu); + /* + * Sync wakeups (i.e. those types of wakeups where the waker + * has indicated that it will leave the CPU in short order) +@@ -1650,15 +1820,22 @@ + * the waker guarantees that the freshly woken up task is going + * to be considered on this CPU.) 
+ */ +- if (!sync || cpu != this_cpu) { +- if (TASK_PREEMPTS_CURR(p, rq)) +- resched_task(rq->curr); +- } ++ activate_task(p, rq, cpu == this_cpu); ++ if (!sync || cpu != this_cpu) ++ try_preempt(p, rq); + success = 1; + + out_running: + p->state = TASK_RUNNING; + out: ++ /* ++ * Special case when freezing we need to reschedule idleprio tasks ++ * as SCHED_NORMAL or else they'll never freeze ++ */ ++ if (idleprio_task(p) && freezing(p) && idleprio(p)) { ++ dequeue_task(p, rq); ++ enqueue_task(p, rq); ++ } + task_rq_unlock(rq, &flags); + + return success; +@@ -1676,7 +1853,6 @@ + return try_to_wake_up(p, state, 0); + } + +-static void task_running_tick(struct rq *rq, struct task_struct *p); + /* + * Perform scheduler related setup for a newly forked process p. + * p is forked by current. +@@ -1704,7 +1880,6 @@ + p->prio = current->normal_prio; + + INIT_LIST_HEAD(&p->run_list); +- p->array = NULL; + #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) + if (unlikely(sched_info_on())) + memset(&p->sched_info, 0, sizeof(p->sched_info)); +@@ -1716,30 +1891,31 @@ + /* Want to start with kernel preemption disabled. */ + task_thread_info(p)->preempt_count = 1; + #endif ++ if (unlikely(p->policy == SCHED_FIFO)) ++ goto out; + /* + * Share the timeslice between parent and child, thus the + * total amount of pending timeslices in the system doesn't change, + * resulting in more scheduling fairness. + */ + local_irq_disable(); +- p->time_slice = (current->time_slice + 1) >> 1; +- /* +- * The remainder of the first timeslice might be recovered by +- * the parent if the child exits early enough. 
+- */ +- p->first_time_slice = 1; +- current->time_slice >>= 1; +- p->timestamp = sched_clock(); +- if (unlikely(!current->time_slice)) { ++ if (current->time_slice > 0) { ++ current->time_slice /= 2; ++ if (current->time_slice) ++ p->time_slice = current->time_slice; ++ else ++ p->time_slice = 1; + /* +- * This case is rare, it happens when the parent has only +- * a single jiffy left from its timeslice. Taking the +- * runqueue lock is not a problem. ++ * The remainder of the first timeslice might be recovered by ++ * the parent if the child exits early enough. + */ +- current->time_slice = 1; +- task_running_tick(cpu_rq(cpu), current); +- } ++ p->first_time_slice = 1; ++ } else ++ p->time_slice = 0; ++ ++ p->timestamp = sched_clock(); + local_irq_enable(); ++out: + put_cpu(); + } + +@@ -1761,38 +1937,16 @@ + this_cpu = smp_processor_id(); + cpu = task_cpu(p); + +- /* +- * We decrease the sleep average of forking parents +- * and children as well, to keep max-interactive tasks +- * from forking tasks that are max-interactive. The parent +- * (current) is done further down, under its lock. +- */ +- p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * +- CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); +- +- p->prio = effective_prio(p); +- + if (likely(cpu == this_cpu)) { ++ activate_task(p, rq, 1); + if (!(clone_flags & CLONE_VM)) { + /* + * The VM isn't cloned, so we're in a good position to + * do child-runs-first in anticipation of an exec. This + * usually avoids a lot of COW overhead. 
+ */ +- if (unlikely(!current->array)) +- __activate_task(p, rq); +- else { +- p->prio = current->prio; +- p->normal_prio = current->normal_prio; +- list_add_tail(&p->run_list, &current->run_list); +- p->array = current->array; +- p->array->nr_active++; +- inc_nr_running(p, rq); +- } + set_need_resched(); +- } else +- /* Run child last */ +- __activate_task(p, rq); ++ } + /* + * We skip the following code due to cpu == this_cpu + * +@@ -1809,19 +1963,16 @@ + */ + p->timestamp = (p->timestamp - this_rq->most_recent_timestamp) + + rq->most_recent_timestamp; +- __activate_task(p, rq); +- if (TASK_PREEMPTS_CURR(p, rq)) +- resched_task(rq->curr); ++ activate_task(p, rq, 0); ++ try_preempt(p, rq); + + /* + * Parent and child are on different CPUs, now get the +- * parent runqueue to update the parent's ->sleep_avg: ++ * parent runqueue to update the parent's ->flags: + */ + task_rq_unlock(rq, &flags); + this_rq = task_rq_lock(current, &flags); + } +- current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * +- PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); + task_rq_unlock(this_rq, &flags); + } + +@@ -1836,23 +1987,17 @@ + */ + void fastcall sched_exit(struct task_struct *p) + { ++ struct task_struct *parent; + unsigned long flags; + struct rq *rq; + +- /* +- * If the child was a (relative-) CPU hog then decrease +- * the sleep_avg of the parent as well. 
+- */ +- rq = task_rq_lock(p->parent, &flags); +- if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) { +- p->parent->time_slice += p->time_slice; +- if (unlikely(p->parent->time_slice > task_timeslice(p))) +- p->parent->time_slice = task_timeslice(p); +- } +- if (p->sleep_avg < p->parent->sleep_avg) +- p->parent->sleep_avg = p->parent->sleep_avg / +- (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg / +- (EXIT_WEIGHT + 1); ++ parent = p->parent; ++ rq = task_rq_lock(parent, &flags); ++ if (p->first_time_slice > 0 && task_cpu(p) == task_cpu(parent)) { ++ parent->time_slice += p->time_slice; ++ if (unlikely(parent->time_slice > parent->quota)) ++ parent->time_slice = parent->quota; ++ } + task_rq_unlock(rq, &flags); + } + +@@ -2184,23 +2329,17 @@ + * pull_task - move a task from a remote runqueue to the local runqueue. + * Both runqueues must be locked. + */ +-static void pull_task(struct rq *src_rq, struct prio_array *src_array, +- struct task_struct *p, struct rq *this_rq, +- struct prio_array *this_array, int this_cpu) ++static void pull_task(struct rq *src_rq, struct task_struct *p, ++ struct rq *this_rq, int this_cpu) + { +- dequeue_task(p, src_array); ++ dequeue_task(p, src_rq); + dec_nr_running(p, src_rq); + set_task_cpu(p, this_cpu); + inc_nr_running(p, this_rq); +- enqueue_task(p, this_array); ++ enqueue_task(p, this_rq); + p->timestamp = (p->timestamp - src_rq->most_recent_timestamp) + + this_rq->most_recent_timestamp; +- /* +- * Note that idle threads have a prio of MAX_PRIO, for this test +- * to be always true for them. 
+- */ +- if (TASK_PREEMPTS_CURR(p, this_rq)) +- resched_task(this_rq->curr); ++ try_preempt(p, this_rq); + } + + /* +@@ -2243,7 +2382,16 @@ + return 1; + } + +-#define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio) ++static inline int rq_best_prio(struct rq *rq) ++{ ++ int best_prio, exp_prio; ++ ++ best_prio = sched_find_first_bit(rq->dyn_bitmap); ++ exp_prio = find_next_bit(rq->exp_bitmap, MAX_PRIO, MAX_RT_PRIO); ++ if (unlikely(best_prio > exp_prio)) ++ best_prio = exp_prio; ++ return best_prio; ++} + + /* + * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted +@@ -2259,7 +2407,7 @@ + { + int idx, pulled = 0, pinned = 0, this_best_prio, best_prio, + best_prio_seen, skip_for_load; +- struct prio_array *array, *dst_array; ++ struct prio_array *array; + struct list_head *head, *curr; + struct task_struct *tmp; + long rem_load_move; +@@ -2286,31 +2434,29 @@ + * be cache-cold, thus switching CPUs has the least effect + * on them. + */ +- if (busiest->expired->nr_active) { +- array = busiest->expired; +- dst_array = this_rq->expired; +- } else { +- array = busiest->active; +- dst_array = this_rq->active; +- } +- ++ array = busiest->expired; + new_array: +- /* Start searching at priority 0: */ +- idx = 0; ++ /* Expired arrays don't have RT tasks so they're always MAX_RT_PRIO+ */ ++ if (array == busiest->expired) ++ idx = MAX_RT_PRIO; ++ else ++ idx = 0; + skip_bitmap: + if (!idx) +- idx = sched_find_first_bit(array->bitmap); ++ idx = sched_find_first_bit(array->prio_bitmap); + else +- idx = find_next_bit(array->bitmap, MAX_PRIO, idx); +- if (idx >= MAX_PRIO) { +- if (array == busiest->expired && busiest->active->nr_active) { ++ idx = find_next_bit(array->prio_bitmap, MAX_PRIO, idx); ++ if (idx == MAX_PRIO) { ++ if (array == busiest->idleprio && busiest->nr_idleprio) ++ goto found_idleprio; ++ if (array == busiest->expired) { + array = busiest->active; +- dst_array = this_rq->active; + goto new_array; + } + goto out; + } + 
++found_idleprio: + head = array->queue + idx; + curr = head->prev; + skip_queue: +@@ -2332,11 +2478,22 @@ + best_prio_seen |= idx == best_prio; + if (curr != head) + goto skip_queue; ++ if (idx == MAX_PRIO) { ++ /* ++ * Occurs either when balancing idleprio tasks or ++ * there really are no more tasks to find. ++ */ ++ if (array == busiest->expired) { ++ array = busiest->active; ++ goto new_array; ++ } ++ goto out; ++ } + idx++; + goto skip_bitmap; + } + +- pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); ++ pull_task(busiest, tmp, this_rq, this_cpu); + pulled++; + rem_load_move -= tmp->load_weight; + +@@ -2349,6 +2506,13 @@ + this_best_prio = idx; + if (curr != head) + goto skip_queue; ++ if (idx == MAX_PRIO) { ++ if (array == busiest->expired) { ++ array = busiest->active; ++ goto new_array; ++ } ++ goto out; ++ } + idx++; + goto skip_bitmap; + } +@@ -3297,11 +3461,36 @@ + /* + * This is called on clock ticks and on context switches. + * Bank in p->sched_time the ns elapsed since the last tick or switch. ++ * CPU scheduler quota accounting is also performed here in microseconds. ++ * The value returned from sched_clock() occasionally gives bogus values so ++ * some sanity checking is required. + */ +-static inline void +-update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now) ++static void ++update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now, ++ int tick) + { +- p->sched_time += now - p->last_ran; ++ long time_diff = now - p->last_ran; ++ ++ if (tick) { ++ /* ++ * Called from scheduler_tick() there should be less than two ++ * jiffies worth, and not negative/overflow. ++ */ ++ if (time_diff > JIFFIES_TO_NS(2) || time_diff < 0) ++ time_diff = JIFFIES_TO_NS(1); ++ } else { ++ /* ++ * Called from context_switch there should be less than one ++ * jiffy worth, and not negative/overflow. There should be ++ * some time banked here so use a nominal 1us. 
++ */ ++ if (time_diff > JIFFIES_TO_NS(1) || time_diff < 1) ++ time_diff = 1000; ++ } ++ /* time_slice accounting is done in usecs to avoid overflow on 32bit */ ++ if (p != rq->idle && p->policy != SCHED_FIFO) ++ p->time_slice -= time_diff / 1000; ++ p->sched_time += time_diff; + p->last_ran = rq->most_recent_timestamp = now; + } + +@@ -3322,27 +3511,6 @@ + } + + /* +- * We place interactive tasks back into the active array, if possible. +- * +- * To guarantee that this does not starve expired tasks we ignore the +- * interactivity of a task if the first expired task had to wait more +- * than a 'reasonable' amount of time. This deadline timeout is +- * load-dependent, as the frequency of array switched decreases with +- * increasing number of running tasks. We also ignore the interactivity +- * if a better static_prio task has expired: +- */ +-static inline int expired_starving(struct rq *rq) +-{ +- if (rq->curr->static_prio > rq->best_expired_prio) +- return 1; +- if (!STARVATION_LIMIT || !rq->expired_timestamp) +- return 0; +- if (jiffies - rq->expired_timestamp > STARVATION_LIMIT * rq->nr_running) +- return 1; +- return 0; +-} +- +-/* + * Account user cpu time to a process. + * @p: the process that the cpu time gets accounted to + * @hardirq_offset: the offset to subtract from hardirq_count() +@@ -3357,7 +3525,7 @@ + + /* Add user time to cpustat. */ + tmp = cputime_to_cputime64(cputime); +- if (TASK_NICE(p) > 0) ++ if (TASK_NICE(p) > 0 || idleprio_task(p)) + cpustat->nice = cputime64_add(cpustat->nice, tmp); + else + cpustat->user = cputime64_add(cpustat->user, tmp); +@@ -3415,87 +3583,94 @@ + cpustat->steal = cputime64_add(cpustat->steal, tmp); + } + +-static void task_running_tick(struct rq *rq, struct task_struct *p) ++/* ++ * The task has used up its quota of running in this prio_level so it must be ++ * dropped a priority level, all managed by recalc_task_prio(). 
++ */ ++static void task_expired_entitlement(struct rq *rq, struct task_struct *p) + { +- if (p->array != rq->active) { +- /* Task has expired but was not scheduled yet */ +- set_tsk_need_resched(p); ++ int overrun; ++ ++ reset_first_time_slice(p); ++ if (rt_task(p)) { ++ p->time_slice += p->quota; ++ list_move_tail(&p->run_list, p->array->queue + p->prio); + return; + } +- spin_lock(&rq->lock); ++ overrun = p->time_slice; ++ dequeue_task(p, rq); ++ enqueue_task(p, rq); + /* +- * The task was running during this tick - update the +- * time slice counter. Note: we do not update a thread's +- * priority until it either goes to sleep or uses up its +- * timeslice. This makes it possible for interactive tasks +- * to use up their timeslices at their highest priority levels. ++ * Subtract any extra time this task ran over its time_slice; ie ++ * overrun will either be 0 or negative. + */ +- if (rt_task(p)) { +- /* +- * RR tasks need a special form of timeslice management. +- * FIFO tasks have no timeslices. +- */ +- if ((p->policy == SCHED_RR) && !--p->time_slice) { +- p->time_slice = task_timeslice(p); +- p->first_time_slice = 0; +- set_tsk_need_resched(p); ++ p->time_slice += overrun; ++} + +- /* put it at the end of the queue: */ +- requeue_task(p, rq->active); +- } +- goto out_unlock; ++/* ++ * Test if SCHED_ISO tasks have run longer than their alloted period as RT ++ * tasks and set the refractory flag if necessary. There is 10% hysteresis ++ * for unsetting the flag. 
++ */ ++static unsigned int test_ret_isorefractory(struct rq *rq) ++{ ++ if (likely(!rq->iso_refractory)) { ++ if (rq->iso_ticks / ISO_PERIOD > sched_iso_cpu) ++ rq->iso_refractory = 1; ++ } else { ++ if (rq->iso_ticks / ISO_PERIOD < (sched_iso_cpu * 90 / 100)) ++ rq->iso_refractory = 0; + } +- if (!--p->time_slice) { +- dequeue_task(p, rq->active); +- set_tsk_need_resched(p); +- p->prio = effective_prio(p); +- p->time_slice = task_timeslice(p); +- p->first_time_slice = 0; ++ return rq->iso_refractory; ++} + +- if (!rq->expired_timestamp) +- rq->expired_timestamp = jiffies; +- if (!TASK_INTERACTIVE(p) || expired_starving(rq)) { +- enqueue_task(p, rq->expired); +- if (p->static_prio < rq->best_expired_prio) +- rq->best_expired_prio = p->static_prio; +- } else +- enqueue_task(p, rq->active); +- } else { +- /* +- * Prevent a too long timeslice allowing a task to monopolize +- * the CPU. We do this by splitting up the timeslice into +- * smaller pieces. +- * +- * Note: this does not mean the task's timeslices expire or +- * get lost in any way, they just might be preempted by +- * another task of equal priority. (one with higher +- * priority would have preempted this task already.) We +- * requeue this task to the end of the list on this priority +- * level, which is in essence a round-robin of tasks with +- * equal priority. +- * +- * This only applies to tasks in the interactive +- * delta range with at least TIMESLICE_GRANULARITY to requeue. 
+- */ +- if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - +- p->time_slice) % TIMESLICE_GRANULARITY(p)) && +- (p->time_slice >= TIMESLICE_GRANULARITY(p)) && +- (p->array == rq->active)) { ++/* No SCHED_ISO task was running so decrease rq->iso_ticks */ ++static inline void no_iso_tick(struct rq *rq) ++{ ++ rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - 1) / ISO_PERIOD; ++} + +- requeue_task(p, rq->active); +- set_tsk_need_resched(p); +- } ++/* This manages tasks that have run out of timeslice during a scheduler_tick */ ++static void task_running_tick(struct rq *rq, struct task_struct *p) ++{ ++ /* ++ * If a SCHED_ISO task is running we increment the iso_ticks. In ++ * order to prevent SCHED_ISO tasks from causing starvation in the ++ * presence of true RT tasks we account those as iso_ticks as well. ++ */ ++ if ((rt_task(p) || (iso_task(p) && !rq->iso_refractory))) { ++ if (rq->iso_ticks <= (ISO_PERIOD * 100) - 100) ++ rq->iso_ticks += 100; ++ } else ++ no_iso_tick(rq); ++ ++ if (iso_task(p)) { ++ if (unlikely(test_ret_isorefractory(rq))) { ++ if (isoprio_suitable(p)) { ++ /* ++ * SCHED_ISO task is running as RT and limit ++ * has been hit. Set the PF_ISOREF flag and ++ * force it to reschedule as SCHED_NORMAL ++ * by zeroing its time_slice ++ */ ++ p->flags |= PF_ISOREF; ++ p->time_slice = 0; ++ } ++ } else ++ p->flags &= ~PF_ISOREF; + } +-out_unlock: +- spin_unlock(&rq->lock); ++ /* SCHED_FIFO tasks never run out of timeslice. */ ++ if (p->time_slice > 0 || p->policy == SCHED_FIFO) ++ return; ++ /* p->time_slice <= 0 */ ++ set_tsk_need_resched(p); ++ if (likely(task_queued(p))) ++ task_expired_entitlement(rq, p); + } + + /* + * This function gets called by the timer code, with HZ frequency. + * We call it with interrupts disabled. +- * +- * It also gets called by the fork code, when changing the parent's +- * timeslices. 
+ */ + void scheduler_tick(void) + { +@@ -3505,10 +3680,14 @@ + int idle_at_tick = idle_cpu(cpu); + struct rq *rq = cpu_rq(cpu); + +- update_cpu_clock(p, rq, now); ++ update_cpu_clock(p, rq, now, 1); + ++ spin_lock(&rq->lock); + if (!idle_at_tick) + task_running_tick(rq, p); ++ else ++ no_iso_tick(rq); ++ spin_unlock(&rq->lock); + #ifdef CONFIG_SMP + update_load(rq); + rq->idle_at_tick = idle_at_tick; +@@ -3554,10 +3733,80 @@ + + #endif + +-static inline int interactive_sleep(enum sleep_type sleep_type) ++static void reset_prio_levels(struct rq *rq) + { +- return (sleep_type == SLEEP_INTERACTIVE || +- sleep_type == SLEEP_INTERRUPTED); ++ rq->active->best_static_prio = MAX_PRIO - 1; ++ rq->expired->best_static_prio = MAX_PRIO - 1; ++ memset(rq->prio_level, 0, sizeof(int) * PRIO_RANGE); ++} ++ ++/* ++ * Only tasks running are SCHED_IDLEPRIO. Set the active array to the ++ * idleprio array and if it isn't already active ++ */ ++static struct task_struct *next_idleprio_task(struct rq *rq) ++{ ++ struct prio_array *array = rq->active; ++ struct list_head *queue; ++ ++ if (array != rq->idleprio) { ++ rq->active = rq->idleprio; ++ rq->expired = array; ++ array = rq->active; ++ rq->exp_bitmap = rq->expired->prio_bitmap; ++ rq->dyn_bitmap = rq->active->prio_bitmap; ++ } ++ rq->prio_rotation++; ++ reset_prio_levels(rq); ++ queue = array->queue + MAX_PRIO; ++ return list_entry(queue->next, struct task_struct, run_list); ++} ++ ++/* ++ * next_dynamic_task finds the next suitable dynamic task. ++ */ ++static inline struct task_struct *next_dynamic_task(struct rq *rq, int idx) ++{ ++ struct prio_array *array = rq->active; ++ struct task_struct *next; ++ struct list_head *queue; ++ int nstatic; ++ ++retry: ++ if (unlikely(rq->nr_running == rq->nr_idleprio)) ++ return next_idleprio_task(rq); ++ if (idx >= MAX_PRIO) { ++ /* There are no more tasks in the active array. 
Swap arrays */ ++ array = rq->expired; ++ rq->expired = rq->active; ++ rq->active = array; ++ rq->exp_bitmap = rq->expired->prio_bitmap; ++ rq->dyn_bitmap = rq->active->prio_bitmap; ++ rq->prio_rotation++; ++ idx = find_next_bit(rq->dyn_bitmap, MAX_PRIO, MAX_RT_PRIO); ++ reset_prio_levels(rq); ++ } ++ queue = array->queue + idx; ++ next = list_entry(queue->next, struct task_struct, run_list); ++ if (unlikely(next->time_slice <= 0 && !(iso_task(next) && ++ isoprio_suitable(next)))) { ++ /* ++ * Unlucky enough that this task ran out of time_slice ++ * before it hit a scheduler_tick so it should have its ++ * priority reassessed and choose another task (possibly ++ * the same one) ++ */ ++ task_expired_entitlement(rq, next); ++ idx = find_next_bit(rq->dyn_bitmap, MAX_PRIO, MAX_RT_PRIO); ++ goto retry; ++ } ++ next->rotation = rq->prio_rotation; ++ nstatic = next->static_prio; ++ if (nstatic < array->best_static_prio) ++ array->best_static_prio = nstatic; ++ if (idx > rq->prio_level[USER_PRIO(nstatic)]) ++ rq->prio_level[USER_PRIO(nstatic)] = idx; ++ return next; + } + + /* +@@ -3566,13 +3815,11 @@ + asmlinkage void __sched schedule(void) + { + struct task_struct *prev, *next; +- struct prio_array *array; + struct list_head *queue; + unsigned long long now; +- unsigned long run_time; +- int cpu, idx, new_prio; + long *switch_count; + struct rq *rq; ++ int cpu, idx; + + /* + * Test if we are atomic. Since do_exit() needs to call into +@@ -3608,18 +3855,6 @@ + + schedstat_inc(rq, sched_cnt); + now = sched_clock(); +- if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) { +- run_time = now - prev->timestamp; +- if (unlikely((long long)(now - prev->timestamp) < 0)) +- run_time = 0; +- } else +- run_time = NS_MAX_SLEEP_AVG; +- +- /* +- * Tasks charged proportionately less run_time at high sleep_avg to +- * delay them losing their interactive status +- */ +- run_time /= (CURRENT_BONUS(prev) ? 
: 1); + + spin_lock_irq(&rq->lock); + +@@ -3630,8 +3865,10 @@ + unlikely(signal_pending(prev)))) + prev->state = TASK_RUNNING; + else { +- if (prev->state == TASK_UNINTERRUPTIBLE) ++ if (prev->state == TASK_UNINTERRUPTIBLE) { ++ prev->flags |= PF_NONSLEEP; + rq->nr_uninterruptible++; ++ } + deactivate_task(prev, rq); + } + } +@@ -3641,59 +3878,29 @@ + idle_balance(cpu, rq); + if (!rq->nr_running) { + next = rq->idle; +- rq->expired_timestamp = 0; + goto switch_tasks; + } + } + +- array = rq->active; +- if (unlikely(!array->nr_active)) { +- /* +- * Switch the active and expired arrays. +- */ +- schedstat_inc(rq, sched_switch); +- rq->active = rq->expired; +- rq->expired = array; +- array = rq->active; +- rq->expired_timestamp = 0; +- rq->best_expired_prio = MAX_PRIO; +- } +- +- idx = sched_find_first_bit(array->bitmap); +- queue = array->queue + idx; +- next = list_entry(queue->next, struct task_struct, run_list); +- +- if (!rt_task(next) && interactive_sleep(next->sleep_type)) { +- unsigned long long delta = now - next->timestamp; +- if (unlikely((long long)(now - next->timestamp) < 0)) +- delta = 0; +- +- if (next->sleep_type == SLEEP_INTERACTIVE) +- delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; +- +- array = next->array; +- new_prio = recalc_task_prio(next, next->timestamp + delta); +- +- if (unlikely(next->prio != new_prio)) { +- dequeue_task(next, array); +- next->prio = new_prio; +- enqueue_task(next, array); +- } ++ idx = sched_find_first_bit(rq->dyn_bitmap); ++ if (likely(idx > ISO_PRIO)) ++ next = next_dynamic_task(rq, idx); ++ else { ++ queue = rq->active->queue + idx; ++ next = list_entry(queue->next, struct task_struct, run_list); + } +- next->sleep_type = SLEEP_NORMAL; + switch_tasks: +- if (next == rq->idle) ++ if (next == rq->idle) { ++ reset_prio_levels(rq); ++ rq->prio_rotation++; + schedstat_inc(rq, sched_goidle); ++ } + prefetch(next); + prefetch_stack(next); + clear_tsk_need_resched(prev); + rcu_qsctr_inc(task_cpu(prev)); + +- 
update_cpu_clock(prev, rq, now); +- +- prev->sleep_avg -= run_time; +- if ((long)prev->sleep_avg <= 0) +- prev->sleep_avg = 0; ++ update_cpu_clock(prev, rq, now, 0); + prev->timestamp = prev->last_ran = now; + + sched_info_switch(prev, next); +@@ -4129,29 +4336,22 @@ + */ + void rt_mutex_setprio(struct task_struct *p, int prio) + { +- struct prio_array *array; + unsigned long flags; ++ int queued, oldprio; + struct rq *rq; +- int oldprio; + + BUG_ON(prio < 0 || prio > MAX_PRIO); + + rq = task_rq_lock(p, &flags); + + oldprio = p->prio; +- array = p->array; +- if (array) +- dequeue_task(p, array); ++ queued = task_queued(p); ++ if (queued) ++ dequeue_task(p, rq); + p->prio = prio; + +- if (array) { +- /* +- * If changing to an RT priority then queue it +- * in the active array! +- */ +- if (rt_task(p)) +- array = rq->active; +- enqueue_task(p, array); ++ if (queued) { ++ enqueue_task(p, rq); + /* + * Reschedule if we are currently running on this runqueue and + * our priority decreased, or if we are not currently running on +@@ -4160,8 +4360,8 @@ + if (task_running(rq, p)) { + if (p->prio > oldprio) + resched_task(rq->curr); +- } else if (TASK_PREEMPTS_CURR(p, rq)) +- resched_task(rq->curr); ++ } else ++ try_preempt(p, rq); + } + task_rq_unlock(rq, &flags); + } +@@ -4170,8 +4370,7 @@ + + void set_user_nice(struct task_struct *p, long nice) + { +- struct prio_array *array; +- int old_prio, delta; ++ int queued, old_prio,delta; + unsigned long flags; + struct rq *rq; + +@@ -4192,26 +4391,27 @@ + p->static_prio = NICE_TO_PRIO(nice); + goto out_unlock; + } +- array = p->array; +- if (array) { +- dequeue_task(p, array); ++ queued = task_queued(p); ++ if (queued) { ++ dequeue_task(p, rq); + dec_raw_weighted_load(rq, p); + } + + p->static_prio = NICE_TO_PRIO(nice); +- set_load_weight(p); + old_prio = p->prio; + p->prio = effective_prio(p); ++ set_quota(p); + delta = p->prio - old_prio; + +- if (array) { +- enqueue_task(p, array); ++ if (queued) { ++ enqueue_task(p, rq); + 
inc_raw_weighted_load(rq, p); + /* + * If the task increased its priority or is running and + * lowered its priority, then reschedule its CPU: + */ +- if (delta < 0 || (delta > 0 && task_running(rq, p))) ++ if (delta < 0 || ((delta > 0 || idleprio_task(p)) && ++ task_running(rq, p))) + resched_task(rq->curr); + } + out_unlock: +@@ -4281,11 +4481,23 @@ + * + * This is the priority value as seen by users in /proc. + * RT tasks are offset by -200. Normal tasks are centered +- * around 0, value goes from -16 to +15. ++ * around 1, value goes from 0 to +79. Values higher than ++ * 39 indicate task is on the expired array. This is done ++ * lockless and may rarely return an active instead of ++ * expired value. + */ +-int task_prio(const struct task_struct *p) ++int task_prio(struct task_struct *p) + { +- return p->prio - MAX_RT_PRIO; ++ int prio = p->prio - MAX_RT_PRIO; ++ ++ if (task_queued(p)) { ++ struct rq *rq = task_rq(p); ++ struct prio_array *array = p->array; ++ ++ if (rq && rq->expired == array) ++ prio += PRIO_RANGE; ++ } ++ return prio; + } + + /** +@@ -4328,19 +4540,14 @@ + /* Actually do priority change: must hold rq lock. 
*/ + static void __setscheduler(struct task_struct *p, int policy, int prio) + { +- BUG_ON(p->array); ++ BUG_ON(task_queued(p)); + + p->policy = policy; + p->rt_priority = prio; + p->normal_prio = normal_prio(p); + /* we are holding p->pi_lock already */ + p->prio = rt_mutex_getprio(p); +- /* +- * SCHED_BATCH tasks are treated as perpetual CPU hogs: +- */ +- if (policy == SCHED_BATCH) +- p->sleep_avg = 0; +- set_load_weight(p); ++ set_quota(p); + } + + /** +@@ -4354,19 +4561,36 @@ + int sched_setscheduler(struct task_struct *p, int policy, + struct sched_param *param) + { +- int retval, oldprio, oldpolicy = -1; +- struct prio_array *array; ++ struct sched_param zero_param = { .sched_priority = 0 }; ++ int queued, retval, oldprio, oldpolicy = -1; ++ unsigned long rlim_rtprio = 0; + unsigned long flags; + struct rq *rq; + + /* may grab non-irq protected spin_locks */ + BUG_ON(in_interrupt()); ++ if (is_rt_policy(policy) && !capable(CAP_SYS_NICE)) { ++ unsigned long lflags; ++ ++ if (!lock_task_sighand(p, &lflags)) ++ return -ESRCH; ++ rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur; ++ unlock_task_sighand(p, &lflags); ++ if (rlim_rtprio) ++ goto recheck; ++ /* ++ * If the caller requested an RT policy without having the ++ * necessary rights, we downgrade the policy to SCHED_ISO. ++ * We also set the parameter to zero to pass the checks. 
++ */ ++ policy = SCHED_ISO; ++ param = &zero_param; ++ } + recheck: + /* double check policy once rq lock held */ + if (policy < 0) + policy = oldpolicy = p->policy; +- else if (policy != SCHED_FIFO && policy != SCHED_RR && +- policy != SCHED_NORMAL && policy != SCHED_BATCH) ++ else if (!SCHED_RANGE(policy)) + return -EINVAL; + /* + * Valid priorities for SCHED_FIFO and SCHED_RR are +@@ -4385,14 +4609,6 @@ + */ + if (!capable(CAP_SYS_NICE)) { + if (is_rt_policy(policy)) { +- unsigned long rlim_rtprio; +- unsigned long flags; +- +- if (!lock_task_sighand(p, &flags)) +- return -ESRCH; +- rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur; +- unlock_task_sighand(p, &flags); +- + /* can't set/change the rt policy */ + if (policy != p->policy && !rlim_rtprio) + return -EPERM; +@@ -4401,6 +4617,31 @@ + if (param->sched_priority > p->rt_priority && + param->sched_priority > rlim_rtprio) + return -EPERM; ++ } else { ++ switch (p->policy) { ++ /* ++ * Can only downgrade policies but not back to ++ * SCHED_NORMAL ++ */ ++ case SCHED_ISO: ++ if (policy == SCHED_ISO) ++ goto out; ++ if (policy == SCHED_NORMAL) ++ return -EPERM; ++ break; ++ case SCHED_BATCH: ++ if (policy == SCHED_BATCH) ++ goto out; ++ if (policy != SCHED_IDLEPRIO) ++ return -EPERM; ++ break; ++ case SCHED_IDLEPRIO: ++ if (policy == SCHED_IDLEPRIO) ++ goto out; ++ return -EPERM; ++ default: ++ break; ++ } + } + + /* can't change other user's priorities */ +@@ -4409,6 +4650,11 @@ + return -EPERM; + } + ++ if (!(p->mm) && policy == SCHED_IDLEPRIO) { ++ /* Don't allow kernel threads to be SCHED_IDLEPRIO. 
*/ ++ return -EINVAL; ++ } ++ + retval = security_task_setscheduler(p, policy, param); + if (retval) + return retval; +@@ -4429,12 +4675,12 @@ + spin_unlock_irqrestore(&p->pi_lock, flags); + goto recheck; + } +- array = p->array; +- if (array) ++ queued = task_queued(p); ++ if (queued) + deactivate_task(p, rq); + oldprio = p->prio; + __setscheduler(p, policy, param->sched_priority); +- if (array) { ++ if (queued) { + __activate_task(p, rq); + /* + * Reschedule if we are currently running on this runqueue and +@@ -4444,14 +4690,15 @@ + if (task_running(rq, p)) { + if (p->prio > oldprio) + resched_task(rq->curr); +- } else if (TASK_PREEMPTS_CURR(p, rq)) +- resched_task(rq->curr); ++ } else ++ try_preempt(p, rq); + } + __task_rq_unlock(rq); + spin_unlock_irqrestore(&p->pi_lock, flags); + + rt_mutex_adjust_pi(p); + ++out: + return 0; + } + EXPORT_SYMBOL_GPL(sched_setscheduler); +@@ -4718,41 +4965,34 @@ + * sys_sched_yield - yield the current processor to other threads. + * + * This function yields the current CPU by moving the calling thread +- * to the expired array. If there are no other threads running on this +- * CPU then this function will return. ++ * to the expired array if SCHED_NORMAL or the end of its current priority ++ * queue if a realtime task. If there are no other threads running on this ++ * cpu this function will return. + */ + asmlinkage long sys_sched_yield(void) + { + struct rq *rq = this_rq_lock(); +- struct prio_array *array = current->array, *target = rq->expired; ++ struct task_struct *p = current; + + schedstat_inc(rq, yld_cnt); +- /* +- * We implement yielding by moving the task into the expired +- * queue. +- * +- * (special rule: RT tasks will just roundrobin in the active +- * array.) 
+- */ +- if (rt_task(current)) +- target = rq->active; +- +- if (array->nr_active == 1) { +- schedstat_inc(rq, yld_act_empty); +- if (!rq->expired->nr_active) +- schedstat_inc(rq, yld_both_empty); +- } else if (!rq->expired->nr_active) +- schedstat_inc(rq, yld_exp_empty); +- +- if (array != target) { +- dequeue_task(current, array); +- enqueue_task(current, target); +- } else +- /* +- * requeue_task is cheaper so perform that if possible. +- */ +- requeue_task(current, array); ++ if (rq->nr_running == 1) ++ schedstat_inc(rq, yld_both_empty); ++ else { ++ struct prio_array *old_array = p->array; ++ int old_prio = p->prio; ++ ++ if (idleprio_task(p)) { ++ dequeue_task(p, rq); ++ enqueue_task(p, rq); ++ goto out_release; ++ } ++ /* p->prio will be updated in requeue_task via queue_expired */ ++ if (!rt_task(p)) ++ p->array = rq->expired; ++ requeue_task(p, rq, old_array, old_prio); ++ } + ++out_release: + /* + * Since we are going to call schedule() anyway, there's + * no need to preempt or enable interrupts: +@@ -4902,6 +5142,8 @@ + break; + case SCHED_NORMAL: + case SCHED_BATCH: ++ case SCHED_ISO: ++ case SCHED_IDLEPRIO: + ret = 0; + break; + } +@@ -4926,6 +5168,8 @@ + break; + case SCHED_NORMAL: + case SCHED_BATCH: ++ case SCHED_ISO: ++ case SCHED_IDLEPRIO: + ret = 0; + } + return ret; +@@ -4959,8 +5203,8 @@ + if (retval) + goto out_unlock; + +- jiffies_to_timespec(p->policy == SCHED_FIFO ? +- 0 : task_timeslice(p), &t); ++ t = ns_to_timespec(p->policy == SCHED_FIFO ? 0 : ++ MS_TO_NS(task_timeslice(p))); + read_unlock(&tasklist_lock); + retval = copy_to_user(interval, &t, sizeof(t)) ? 
-EFAULT : 0; + out_nounlock: +@@ -5056,10 +5300,10 @@ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + +- idle->timestamp = sched_clock(); +- idle->sleep_avg = 0; +- idle->array = NULL; +- idle->prio = idle->normal_prio = MAX_PRIO; ++ bitmap_zero(idle->bitmap, PRIO_RANGE); ++ idle->timestamp = idle->last_ran = sched_clock(); ++ idle->array = rq->active; ++ idle->prio = idle->normal_prio = NICE_TO_PRIO(0); + idle->state = TASK_RUNNING; + idle->cpus_allowed = cpumask_of_cpu(cpu); + set_task_cpu(idle, cpu); +@@ -5178,7 +5422,7 @@ + goto out; + + set_task_cpu(p, dest_cpu); +- if (p->array) { ++ if (task_queued(p)) { + /* + * Sync timestamp with rq_dest's before activating. + * The same thing could be achieved by doing this step +@@ -5189,8 +5433,7 @@ + + rq_dest->most_recent_timestamp; + deactivate_task(p, rq_src); + __activate_task(p, rq_dest); +- if (TASK_PREEMPTS_CURR(p, rq_dest)) +- resched_task(rq_dest->curr); ++ try_preempt(p, rq_dest); + } + ret = 1; + out: +@@ -5487,7 +5730,7 @@ + /* Idle task back to normal (off runqueue, low prio) */ + rq = task_rq_lock(rq->idle, &flags); + deactivate_task(rq->idle, rq); +- rq->idle->static_prio = MAX_PRIO; ++ rq->idle->static_prio = NICE_TO_PRIO(0); + __setscheduler(rq->idle, SCHED_NORMAL, 0); + migrate_dead_tasks(cpu); + task_rq_unlock(rq, &flags); +@@ -7013,6 +7256,13 @@ + /* Move init over to a non-isolated CPU */ + if (set_cpus_allowed(current, non_isolated_cpus) < 0) + BUG(); ++ ++ /* ++ * Assume that every added cpu gives us slightly less overall latency ++ * allowing us to increase the base rr_interval, but in a non linear ++ * fashion. 
++ */ ++ rr_interval *= 1 + ilog2(num_online_cpus()); + } + #else + void __init sched_init_smp(void) +@@ -7035,6 +7285,16 @@ + int i, j, k; + int highest_cpu = 0; + ++ /* Generate the priority matrix */ ++ for (i = 0; i < PRIO_RANGE; i++) { ++ bitmap_fill(prio_matrix[i], PRIO_RANGE); ++ j = PRIO_RANGE * PRIO_RANGE / (PRIO_RANGE - i); ++ for (k = 0; k <= PRIO_RANGE * (PRIO_RANGE - 1); k += j) { ++ __clear_bit(PRIO_RANGE - 1 - (k / PRIO_RANGE), ++ prio_matrix[i]); ++ } ++ } ++ + for_each_possible_cpu(i) { + struct prio_array *array; + struct rq *rq; +@@ -7042,12 +7302,20 @@ + rq = cpu_rq(i); + spin_lock_init(&rq->lock); + lockdep_set_class(&rq->lock, &rq->rq_lock_key); ++ rq->iso_ticks = 0; + rq->nr_running = 0; ++ rq->nr_idleprio = 0; ++ rq->prio_rotation = 0; + rq->active = rq->arrays; ++ rq->idleprio = rq->active; + rq->expired = rq->arrays + 1; +- rq->best_expired_prio = MAX_PRIO; ++ reset_prio_levels(rq); ++ rq->dyn_bitmap = rq->active->prio_bitmap; ++ rq->exp_bitmap = rq->expired->prio_bitmap; + + #ifdef CONFIG_SMP ++ rq->active->rq = rq; ++ rq->expired->rq = rq; + rq->sd = NULL; + for (j = 1; j < 3; j++) + rq->cpu_load[j] = 0; +@@ -7060,17 +7328,16 @@ + atomic_set(&rq->nr_iowait, 0); + + for (j = 0; j < 2; j++) { ++ + array = rq->arrays + j; +- for (k = 0; k < MAX_PRIO; k++) { ++ for (k = 0; k <= MAX_PRIO; k++) + INIT_LIST_HEAD(array->queue + k); +- __clear_bit(k, array->bitmap); +- } +- // delimiter for bitsearch +- __set_bit(MAX_PRIO, array->bitmap); ++ bitmap_zero(array->prio_bitmap, MAX_PRIO); ++ /* delimiter for bitsearch */ ++ __set_bit(MAX_PRIO, array->prio_bitmap); + } + highest_cpu = i; + } +- + set_load_weight(&init_task); + + #ifdef CONFIG_SMP +@@ -7125,25 +7392,25 @@ + #ifdef CONFIG_MAGIC_SYSRQ + void normalize_rt_tasks(void) + { +- struct prio_array *array; + struct task_struct *g, *p; + unsigned long flags; + struct rq *rq; ++ int queued; + + read_lock_irq(&tasklist_lock); + + do_each_thread(g, p) { +- if (!rt_task(p)) ++ if (!rt_task(p) && 
!iso_task(p)) + continue; + + spin_lock_irqsave(&p->pi_lock, flags); + rq = __task_rq_lock(p); + +- array = p->array; +- if (array) ++ queued = task_queued(p); ++ if (queued) + deactivate_task(p, task_rq(p)); + __setscheduler(p, SCHED_NORMAL, 0); +- if (array) { ++ if (queued) { + __activate_task(p, task_rq(p)); + resched_task(rq->curr); + } +Index: linux-2.6.22-ck1/kernel/sysctl.c +=================================================================== +--- linux-2.6.22-ck1.orig/kernel/sysctl.c 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/kernel/sysctl.c 2007-07-10 14:55:23.000000000 +1000 +@@ -22,6 +22,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -70,6 +71,7 @@ + extern char core_pattern[]; + extern int pid_max; + extern int min_free_kbytes; ++extern int vm_tail_largefiles; + extern int printk_ratelimit_jiffies; + extern int printk_ratelimit_burst; + extern int pid_max_min, pid_max_max; +@@ -78,6 +80,10 @@ + extern int compat_log; + extern int maps_protect; + extern int sysctl_stat_interval; ++extern int rr_interval; ++extern int sched_interactive; ++extern int sched_iso_cpu; ++extern int sched_iso_period; + + /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ + static int maxolduid = 65535; +@@ -161,6 +167,14 @@ + #endif + + ++/* Constants for minimum and maximum testing. ++ We use these as one-element integer vectors. 
*/ ++static int __read_mostly zero; ++static int __read_mostly one = 1; ++static int __read_mostly one_hundred = 100; ++static int __read_mostly five_thousand = 5000; ++ ++ + /* The default sysctl tables: */ + + static ctl_table root_table[] = { +@@ -501,6 +515,47 @@ + .mode = 0444, + .proc_handler = &proc_dointvec, + }, ++ { ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = "rr_interval", ++ .data = &rr_interval, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .strategy = &sysctl_intvec, ++ .extra1 = &one, ++ .extra2 = &five_thousand, ++ }, ++ { ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = "interactive", ++ .data = &sched_interactive, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, ++ { ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = "iso_cpu", ++ .data = &sched_iso_cpu, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .strategy = &sysctl_intvec, ++ .extra1 = &zero, ++ .extra2 = &one_hundred, ++ }, ++ { ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = "iso_period", ++ .data = &sched_iso_period, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .strategy = &sysctl_intvec, ++ .extra1 = &one, ++ .extra2 = &one_hundred, ++ }, + #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) + { + .ctl_name = KERN_UNKNOWN_NMI_PANIC, +@@ -619,14 +674,16 @@ + { .ctl_name = 0 } + }; + +-/* Constants for minimum and maximum testing in vm_table. +- We use these as one-element integer vectors. 
*/ +-static int zero; +-static int one_hundred = 100; +- +- + static ctl_table vm_table[] = { + { ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = "tail_largefiles", ++ .data = &vm_tail_largefiles, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, ++ { + .ctl_name = VM_OVERCOMMIT_MEMORY, + .procname = "overcommit_memory", + .data = &sysctl_overcommit_memory, +@@ -705,16 +762,24 @@ + .proc_handler = &proc_dointvec, + }, + { +- .ctl_name = VM_SWAPPINESS, +- .procname = "swappiness", +- .data = &vm_swappiness, +- .maxlen = sizeof(vm_swappiness), ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = "mapped", ++ .data = &vm_mapped, ++ .maxlen = sizeof(vm_mapped), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &one_hundred, + }, ++ { ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = "hardmaplimit", ++ .data = &vm_hardmaplimit, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, + #ifdef CONFIG_HUGETLB_PAGE + { + .ctl_name = VM_HUGETLB_PAGES, +@@ -882,6 +947,32 @@ + .extra1 = &zero, + }, + #endif ++#ifdef CONFIG_SWAP_PREFETCH ++ { ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = "swap_prefetch", ++ .data = &swap_prefetch, ++ .maxlen = sizeof(swap_prefetch), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, ++ { ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = "swap_prefetch_delay", ++ .data = &swap_prefetch_delay, ++ .maxlen = sizeof(swap_prefetch_delay), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, ++ { ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = "swap_prefetch_sleep", ++ .data = &swap_prefetch_sleep, ++ .maxlen = sizeof(swap_prefetch_sleep), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, ++#endif + { .ctl_name = 0 } + }; + +Index: linux-2.6.22-ck1/Documentation/sched-design.txt +=================================================================== +--- linux-2.6.22-ck1.orig/Documentation/sched-design.txt 2007-07-10 
14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/Documentation/sched-design.txt 2007-07-10 14:55:02.000000000 +1000 +@@ -1,11 +1,14 @@ +- Goals, Design and Implementation of the +- new ultra-scalable O(1) scheduler ++ Goals, Design and Implementation of the ultra-scalable O(1) scheduler by ++ Ingo Molnar and the Staircase Deadline cpu scheduler policy designed by ++ Con Kolivas. + + +- This is an edited version of an email Ingo Molnar sent to +- lkml on 4 Jan 2002. It describes the goals, design, and +- implementation of Ingo's new ultra-scalable O(1) scheduler. +- Last Updated: 18 April 2002. ++ This was originally an edited version of an email Ingo Molnar sent to ++ lkml on 4 Jan 2002. It describes the goals, design, and implementation ++ of Ingo's ultra-scalable O(1) scheduler. It now contains a description ++ of the Staircase Deadline priority scheduler that was built on this ++ design. ++ Last Updated: Fri, 4 May 2007 + + + Goal +@@ -163,3 +166,222 @@ + code is smaller than the old one. + + Ingo ++ ++ ++Staircase Deadline cpu scheduler policy ++================================================ ++ ++Design summary ++============== ++ ++A novel design which incorporates a foreground-background descending priority ++system (the staircase) via a bandwidth allocation matrix according to nice ++level. ++ ++ ++Features ++======== ++ ++A starvation free, strict fairness O(1) scalable design with interactivity ++as good as the above restrictions can provide. There is no interactivity ++estimator, no sleep/run measurements and only simple fixed accounting. ++The design has strict enough a design and accounting that task behaviour ++can be modelled and maximum scheduling latencies can be predicted by ++the virtual deadline mechanism that manages runqueues. The prime concern ++in this design is to maintain fairness at all costs determined by nice level, ++yet to maintain as good interactivity as can be allowed within the ++constraints of strict fairness.
++ ++ ++Design description ++================== ++ ++SD works off the principle of providing each task a quota of runtime that it is ++allowed to run at a number of priority levels determined by its static priority ++(ie. its nice level). If the task uses up its quota it has its priority ++decremented to the next level determined by a priority matrix. Once every ++runtime quota has been consumed of every priority level, a task is queued on the ++"expired" array. When no other tasks exist with quota, the expired array is ++activated and fresh quotas are handed out. This is all done in O(1). ++ ++Design details ++============== ++ ++Each task keeps a record of its own entitlement of cpu time. Most of the rest of ++these details apply to non-realtime tasks as rt task management is straight ++forward. ++ ++Each runqueue keeps a record of what major epoch it is up to in the ++rq->prio_rotation field which is incremented on each major epoch. It also ++keeps a record of the current prio_level for each static priority task. ++ ++Each task keeps a record of what major runqueue epoch it was last running ++on in p->rotation. It also keeps a record of what priority levels it has ++already been allocated quota from during this epoch in a bitmap p->bitmap. ++ ++The only tunable that determines all other details is the RR_INTERVAL. This ++is set to 8ms, and is scaled gently upwards with more cpus. This value is ++tunable via a /proc interface. ++ ++All tasks are initially given a quota based on RR_INTERVAL. This is equal to ++RR_INTERVAL between nice values of -6 and 0, half that size above nice 0, and ++progressively larger for nice values from -1 to -20. This is assigned to ++p->quota and only changes with changes in nice level. ++ ++As a task is first queued, it checks in recalc_task_prio to see if it has run at ++this runqueue's current priority rotation. 
If it has not, it will have its ++p->prio level set according to the first slot in a "priority matrix" and will be ++given a p->time_slice equal to the p->quota, and has its allocation bitmap bit ++set in p->bitmap for this prio level. It is then queued on the current active ++priority array. ++ ++If a task has already been running during this major epoch, and it has ++p->time_slice left and the rq->prio_quota for the task's p->prio still ++has quota, it will be placed back on the active array, but no more quota ++will be added. ++ ++If a task has been running during this major epoch, but does not have ++p->time_slice left, it will find the next lowest priority in its bitmap that it ++has not been allocated quota from. It then gets a full quota in ++p->time_slice. It is then queued on the current active priority array at the ++newly determined lower priority. ++ ++If a task has been running during this major epoch, and does not have ++any entitlement left in p->bitmap and no time_slice left, it will have its ++bitmap cleared, and be queued at its best prio again, but on the expired ++priority array. ++ ++When a task is queued, it has its relevant bit set in the array->prio_bitmap. ++ ++p->time_slice is stored in nanoseconds and is updated via update_cpu_clock on ++schedule() and scheduler_tick. If p->time_slice is below zero then the ++recalc_task_prio is readjusted and the task rescheduled. ++ ++ ++Priority Matrix ++=============== ++ ++In order to minimise the latencies between tasks of different nice levels ++running concurrently, the dynamic priority slots where different nice levels ++are queued are dithered instead of being sequential. What this means is that ++there are 40 priority slots where a task may run during one major rotation, ++and the allocation of slots is dependent on nice level. In the ++following table, a zero represents a slot where the task may run.
++PRIORITY:0..................20.................39 ++nice -20 0000000000000000000000000000000000000000 ++nice -10 1000100010001000100010001000100010010000 ++nice 0 1010101010101010101010101010101010101010 ++nice 5 1011010110110101101101011011010110110110 ++nice 10 1110111011101110111011101110111011101110 ++nice 15 1111111011111110111111101111111011111110 ++nice 19 1111111111111111111111111111111111111110 ++ ++As can be seen, a nice -20 task runs in every priority slot whereas a nice 19 ++task only runs one slot per major rotation. This dithered table allows for the ++smallest possible maximum latencies between tasks of varying nice levels, thus ++allowing vastly different nice levels to be used. ++ ++SCHED_BATCH tasks are managed slightly differently, receiving only the top ++slots from its priority bitmap giving it equal cpu as SCHED_NORMAL, but ++slightly higher latencies. ++ ++ ++Modelling deadline behaviour ++============================ ++ ++As the accounting in this design is hard and not modified by sleep average ++calculations or interactivity modifiers, it is possible to accurately ++predict the maximum latency that a task may experience under different ++conditions. This is a virtual deadline mechanism enforced by mandatory ++timeslice expiration and not outside bandwidth measurement. ++ ++The maximum duration a task can run during one major epoch is determined by its ++nice value. Nice 0 tasks can run at 19 different priority levels for RR_INTERVAL ++duration during each epoch. Nice 10 tasks can run at 9 priority levels for each ++epoch, and so on. The table in the priority matrix above demonstrates how this ++is enforced. ++ ++Therefore the maximum duration a runqueue epoch can take is determined by ++the number of tasks running, and their nice level. After that, the maximum ++duration a task can wait before it gets scheduled is ++determined by the position of its first slot on the matrix.
++ ++In the following examples, these are _worst case scenarios_ and would rarely ++occur, but can be modelled nonetheless to determine the maximum possible ++latency. ++ ++So for example, if two nice 0 tasks are running, and one has just expired as ++another is activated for the first time receiving a full quota for this ++runqueue rotation, the first task will wait: ++ ++nr_tasks * max_duration + nice_difference * rr_interval ++1 * 19 * RR_INTERVAL + 0 = 152ms ++ ++In the presence of a nice 10 task, a nice 0 task would wait a maximum of ++1 * 10 * RR_INTERVAL + 0 = 80ms ++ ++In the presence of a nice 0 task, a nice 10 task would wait a maximum of ++1 * 19 * RR_INTERVAL + 1 * RR_INTERVAL = 160ms ++ ++More useful than these values, though, are the average latencies which are ++a matter of determining the average distance between priority slots of ++different nice values and multiplying them by the tasks' quota. For example ++in the presence of a nice -10 task, a nice 0 task will wait either one or ++two slots. Given that nice -10 tasks have a quota 2.5 times the RR_INTERVAL, ++this means the latencies will alternate between 2.5 and 5 RR_INTERVALs or ++20 and 40ms respectively (on uniprocessor at 1000HZ). ++ ++ ++Achieving interactivity ++======================= ++ ++A requirement of this scheduler design was to achieve good interactivity ++despite being a completely fair deadline based design. The disadvantage of ++designs that try to achieve interactivity is that they usually do so at ++the expense of maintaining fairness. As cpu speeds increase, the requirement ++for some sort of metered unfairness towards interactive tasks becomes a less ++desirable phenomenon, but low latency and fairness remains mandatory to ++good interactive performance. ++ ++This design relies on the fact that interactive tasks, by their nature, ++sleep often. 
Most fair scheduling designs end up penalising such tasks ++indirectly giving them less than their fair possible share because of the ++sleep, and have to use a mechanism of bonusing their priority to offset ++this based on the duration they sleep. This becomes increasingly inaccurate ++as the number of running tasks rises and more tasks spend time waiting on ++runqueues rather than sleeping, and it is impossible to tell whether the ++task that's waiting on a runqueue only intends to run for a short period and ++then sleep again after that runqueue wait. Furthermore, all such designs rely ++on a period of time to pass to accumulate some form of statistic on the task ++before deciding on how much to give them preference. The shorter this period, ++the more rapidly bursts of cpu ruin the interactive tasks' behaviour. The ++longer this period, the longer it takes for interactive tasks to get low ++scheduling latencies and fair cpu. ++ ++This design does not measure sleep time at all. Interactive tasks that sleep ++often will wake up having consumed very little if any of their quota for ++the current major priority rotation. The longer they have slept, the less ++likely they are to even be on the current major priority rotation. Once ++woken up, though, they get to use up their full quota for that epoch, ++whether part of a quota remains or a full quota. Overall, however, they ++can still only run as much cpu time for that epoch as any other task of the ++same nice level. This means that two tasks behaving completely differently ++from fully cpu bound to waking/sleeping extremely frequently will still ++get the same quota of cpu, but the latter will be using its quota for that ++epoch in bursts rather than continuously. This guarantees that interactive ++tasks get the same amount of cpu as cpu bound ones. ++ ++The other requirement of interactive tasks is also to obtain low latencies ++for when they are scheduled.
Unlike fully cpu bound tasks and the maximum ++latencies possible described in the modelling deadline behaviour section ++above, tasks that sleep will wake up with quota available usually at the ++current runqueue's priority_level or better. This means that the most latency ++they are likely to see is one RR_INTERVAL, and often they will preempt the ++current task if it is not of a sleeping nature. This then guarantees very ++low latency for interactive tasks, and the lowest latencies for the least ++cpu bound tasks. ++ ++ ++Fri, 4 May 2007 ++Con Kolivas +Index: linux-2.6.22-ck1/Documentation/sysctl/kernel.txt +=================================================================== +--- linux-2.6.22-ck1.orig/Documentation/sysctl/kernel.txt 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/Documentation/sysctl/kernel.txt 2007-07-10 14:55:20.000000000 +1000 +@@ -25,6 +25,9 @@ + - domainname + - hostname + - hotplug ++- interactive ++- iso_cpu ++- iso_period + - java-appletviewer [ binfmt_java, obsolete ] + - java-interpreter [ binfmt_java, obsolete ] + - kstack_depth_to_print [ X86 only ] +@@ -43,6 +46,7 @@ + - printk + - real-root-dev ==> Documentation/initrd.txt + - reboot-cmd [ SPARC only ] ++- rr_interval + - rtsig-max + - rtsig-nr + - sem +@@ -164,6 +168,40 @@ + + ============================================================== + ++interactive: ++ ++The staircase-deadline cpu scheduler can be set in either purely ++forward-looking mode for absolutely rigid fairness and cpu distribution ++according to nice level, or it can allow a small per-process history ++to smooth out cpu usage perturbations common in interactive tasks by ++enabling this sysctl. While small fairness issues can arise with this ++enabled, overall fairness is usually still strongly maintained and ++starvation is never possible. Enabling this can significantly smooth ++out 3d graphics and games. ++ ++Default value is 1 (enabled). 
++ ++============================================================== ++ ++iso_cpu: ++ ++This sets the percentage cpu that the unprivileged SCHED_ISO tasks can ++run effectively at realtime priority, averaged over a rolling iso_period ++seconds. ++ ++Set to 80 (percent) by default. ++ ++============================================================== ++ ++iso_period: ++ ++This sets the number of seconds over which SCHED_ISO cpu usage is averaged ++to see if it exceeds its allocated cpu bandwidth. ++ ++Set to 5 (seconds) by default. ++ ++============================================================== ++ + l2cr: (PPC only) + + This flag controls the L2 cache of G3 processor boards. If +@@ -288,6 +326,19 @@ + + ============================================================== + ++rr_interval: ++ ++This is the smallest duration that any cpu process scheduling unit ++will run for. Increasing this value can increase throughput of cpu ++bound tasks substantially but at the expense of increased latencies ++overall. This value is in milliseconds and the default value chosen ++depends on the number of cpus available at scheduler initialisation ++with a minimum of 8. ++ ++Valid values are from 1-5000. 
++ ++============================================================== ++ + rtsig-max & rtsig-nr: + + The file rtsig-max can be used to tune the maximum number +Index: linux-2.6.22-ck1/fs/pipe.c +=================================================================== +--- linux-2.6.22-ck1.orig/fs/pipe.c 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/fs/pipe.c 2007-07-10 14:55:02.000000000 +1000 +@@ -41,12 +41,7 @@ + { + DEFINE_WAIT(wait); + +- /* +- * Pipes are system-local resources, so sleeping on them +- * is considered a noninteractive wait: +- */ +- prepare_to_wait(&pipe->wait, &wait, +- TASK_INTERRUPTIBLE | TASK_NONINTERACTIVE); ++ prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE); + if (pipe->inode) + mutex_unlock(&pipe->inode->i_mutex); + schedule(); +Index: linux-2.6.22-ck1/fs/proc/array.c +=================================================================== +--- linux-2.6.22-ck1.orig/fs/proc/array.c 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/fs/proc/array.c 2007-07-10 14:55:02.000000000 +1000 +@@ -165,7 +165,6 @@ + rcu_read_lock(); + buffer += sprintf(buffer, + "State:\t%s\n" +- "SleepAVG:\t%lu%%\n" + "Tgid:\t%d\n" + "Pid:\t%d\n" + "PPid:\t%d\n" +@@ -173,7 +172,6 @@ + "Uid:\t%d\t%d\t%d\t%d\n" + "Gid:\t%d\t%d\t%d\t%d\n", + get_task_state(p), +- (p->sleep_avg/1024)*100/(1020000000/1024), + p->tgid, p->pid, + pid_alive(p) ? rcu_dereference(p->real_parent)->tgid : 0, + pid_alive(p) && p->ptrace ? 
rcu_dereference(p->parent)->pid : 0, +Index: linux-2.6.22-ck1/include/linux/init_task.h +=================================================================== +--- linux-2.6.22-ck1.orig/include/linux/init_task.h 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/include/linux/init_task.h 2007-07-10 14:55:20.000000000 +1000 +@@ -125,13 +125,15 @@ + .prio = MAX_PRIO-20, \ + .static_prio = MAX_PRIO-20, \ + .normal_prio = MAX_PRIO-20, \ ++ .rotation = 0, \ + .policy = SCHED_NORMAL, \ + .cpus_allowed = CPU_MASK_ALL, \ + .mm = NULL, \ + .active_mm = &init_mm, \ + .run_list = LIST_HEAD_INIT(tsk.run_list), \ + .ioprio = 0, \ +- .time_slice = HZ, \ ++ .time_slice = 1000000000, \ ++ .quota = 1000000000, \ + .tasks = LIST_HEAD_INIT(tsk.tasks), \ + .ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children), \ + .ptrace_list = LIST_HEAD_INIT(tsk.ptrace_list), \ +@@ -158,6 +160,7 @@ + .signal = {{0}}}, \ + .blocked = {{0}}, \ + .alloc_lock = __SPIN_LOCK_UNLOCKED(tsk.alloc_lock), \ ++ .mutexes_held = 0, \ + .journal_info = NULL, \ + .cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \ + .fs_excl = ATOMIC_INIT(0), \ +Index: linux-2.6.22-ck1/kernel/softirq.c +=================================================================== +--- linux-2.6.22-ck1.orig/kernel/softirq.c 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/kernel/softirq.c 2007-07-10 14:55:02.000000000 +1000 +@@ -488,7 +488,7 @@ + + static int ksoftirqd(void * __bind_cpu) + { +- set_user_nice(current, 19); ++ set_user_nice(current, 15); + current->flags |= PF_NOFREEZE; + + set_current_state(TASK_INTERRUPTIBLE); +Index: linux-2.6.22-ck1/kernel/workqueue.c +=================================================================== +--- linux-2.6.22-ck1.orig/kernel/workqueue.c 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/kernel/workqueue.c 2007-07-10 14:55:02.000000000 +1000 +@@ -285,8 +285,6 @@ + if (!cwq->wq->freezeable) + current->flags |= PF_NOFREEZE; + +- set_user_nice(current, -5); +- + for (;;) { + 
prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE); + if (!freezing(current) && +Index: linux-2.6.22-ck1/kernel/kthread.c +=================================================================== +--- linux-2.6.22-ck1.orig/kernel/kthread.c 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/kernel/kthread.c 2007-07-10 14:55:02.000000000 +1000 +@@ -223,7 +223,6 @@ + + ignore_signals(tsk); + +- set_user_nice(tsk, -5); + set_cpus_allowed(tsk, CPU_MASK_ALL); + } + +Index: linux-2.6.22-ck1/kernel/fork.c +=================================================================== +--- linux-2.6.22-ck1.orig/kernel/fork.c 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/kernel/fork.c 2007-07-10 14:55:20.000000000 +1000 +@@ -1063,6 +1063,7 @@ + p->io_context = NULL; + p->io_wait = NULL; + p->audit_context = NULL; ++ p->mutexes_held = 0; + cpuset_fork(p); + #ifdef CONFIG_NUMA + p->mempolicy = mpol_copy(p->mempolicy); +Index: linux-2.6.22-ck1/kernel/mutex.c +=================================================================== +--- linux-2.6.22-ck1.orig/kernel/mutex.c 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/kernel/mutex.c 2007-07-10 14:55:20.000000000 +1000 +@@ -60,6 +60,16 @@ + static void fastcall noinline __sched + __mutex_lock_slowpath(atomic_t *lock_count); + ++static inline void inc_mutex_count(void) ++{ ++ current->mutexes_held++; ++} ++ ++static inline void dec_mutex_count(void) ++{ ++ current->mutexes_held--; ++} ++ + /*** + * mutex_lock - acquire the mutex + * @lock: the mutex to be acquired +@@ -89,6 +99,7 @@ + * 'unlocked' into 'locked' state. 
+ */ + __mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath); ++ inc_mutex_count(); + } + + EXPORT_SYMBOL(mutex_lock); +@@ -114,6 +125,7 @@ + * into 'unlocked' state: + */ + __mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath); ++ dec_mutex_count(); + } + + EXPORT_SYMBOL(mutex_unlock); +@@ -283,9 +295,14 @@ + */ + int fastcall __sched mutex_lock_interruptible(struct mutex *lock) + { ++ int ret; ++ + might_sleep(); +- return __mutex_fastpath_lock_retval ++ ret = __mutex_fastpath_lock_retval + (&lock->count, __mutex_lock_interruptible_slowpath); ++ if (likely(!ret)) ++ inc_mutex_count(); ++ return ret; + } + + EXPORT_SYMBOL(mutex_lock_interruptible); +@@ -340,8 +357,12 @@ + */ + int fastcall __sched mutex_trylock(struct mutex *lock) + { +- return __mutex_fastpath_trylock(&lock->count, ++ int ret = __mutex_fastpath_trylock(&lock->count, + __mutex_trylock_slowpath); ++ ++ if (likely(ret)) ++ inc_mutex_count(); ++ return ret; + } + + EXPORT_SYMBOL(mutex_trylock); +Index: linux-2.6.22-ck1/block/cfq-iosched.c +=================================================================== +--- linux-2.6.22-ck1.orig/block/cfq-iosched.c 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/block/cfq-iosched.c 2007-07-10 14:55:21.000000000 +1000 +@@ -1276,10 +1276,12 @@ + printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class); + case IOPRIO_CLASS_NONE: + /* +- * no prio set, place us in the middle of the BE classes ++ * Select class and ioprio according to policy and nice + */ ++ cfqq->ioprio_class = task_policy_ioprio_class(tsk); + cfqq->ioprio = task_nice_ioprio(tsk); +- cfqq->ioprio_class = IOPRIO_CLASS_BE; ++ if (cfqq->ioprio_class == IOPRIO_CLASS_IDLE) ++ cfq_clear_cfqq_idle_window(cfqq); + break; + case IOPRIO_CLASS_RT: + cfqq->ioprio = task_ioprio(tsk); +Index: linux-2.6.22-ck1/include/linux/ioprio.h +=================================================================== +--- linux-2.6.22-ck1.orig/include/linux/ioprio.h 2007-07-10 14:55:00.000000000 +1000 ++++ 
linux-2.6.22-ck1/include/linux/ioprio.h 2007-07-10 14:55:21.000000000 +1000 +@@ -22,7 +22,7 @@ + * class, the default for any process. IDLE is the idle scheduling class, it + * is only served when no one else is using the disk. + */ +-enum { ++enum ioprio_class { + IOPRIO_CLASS_NONE, + IOPRIO_CLASS_RT, + IOPRIO_CLASS_BE, +@@ -51,8 +51,25 @@ + return IOPRIO_PRIO_DATA(task->ioprio); + } + ++static inline enum ioprio_class ++ task_policy_ioprio_class(struct task_struct *task) ++{ ++ if (rt_task(task)) ++ return IOPRIO_CLASS_RT; ++ if (idleprio_task(task)) ++ return IOPRIO_CLASS_IDLE; ++ return IOPRIO_CLASS_BE; ++} ++ + static inline int task_nice_ioprio(struct task_struct *task) + { ++ if (rt_task(task)) ++ return (MAX_RT_PRIO - task->rt_priority) * IOPRIO_BE_NR / ++ (MAX_RT_PRIO + 1); ++ if (iso_task(task)) ++ return 0; ++ if (idleprio_task(task)) ++ return IOPRIO_BE_NR - 1; + return (task_nice(task) + 20) / 5; + } + +Index: linux-2.6.22-ck1/Documentation/sysctl/vm.txt +=================================================================== +--- linux-2.6.22-ck1.orig/Documentation/sysctl/vm.txt 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/Documentation/sysctl/vm.txt 2007-07-10 14:55:23.000000000 +1000 +@@ -22,6 +22,8 @@ + - dirty_background_ratio + - dirty_expire_centisecs + - dirty_writeback_centisecs ++- hardmaplimit ++- mapped + - max_map_count + - min_free_kbytes + - laptop_mode +@@ -31,12 +33,15 @@ + - min_unmapped_ratio + - min_slab_ratio + - panic_on_oom ++- swap_prefetch ++- swap_prefetch_delay ++- swap_prefetch_sleep + + ============================================================== + + dirty_ratio, dirty_background_ratio, dirty_expire_centisecs, + dirty_writeback_centisecs, vfs_cache_pressure, laptop_mode, +-block_dump, swap_token_timeout, drop-caches: ++block_dump, swap_token_timeout, drop-caches, tail_largefiles: + + See Documentation/filesystems/proc.txt + +@@ -86,6 +91,27 @@ + + ============================================================== + 
++hardmaplimit: ++ ++This flag makes the vm adhere to the mapped value as closely as possible ++except in the most extreme vm stress where doing so would provoke an out ++of memory condition (see mapped below). ++ ++Enabled by default. ++ ++============================================================== ++ ++mapped: ++ ++This is the percentage of ram that is filled with mapped pages (applications) ++before the vm will start reclaiming mapped pages by moving them to swap. ++It is altered by the relative stress of the vm at the time so is not ++strictly adhered to to prevent provoking out of memory kills. ++ ++Set to 66 by default. ++ ++============================================================== ++ + max_map_count: + + This file contains the maximum number of memory map areas a process +@@ -216,3 +242,37 @@ + The default value is 0. + 1 and 2 are for failover of clustering. Please select either + according to your policy of failover. ++ ++============================================================== ++ ++swap_prefetch ++ ++This enables or disables the swap prefetching feature. When the virtual ++memory subsystem has been extremely idle for at least swap_prefetch_sleep ++seconds it will start copying back pages from swap into the swapcache and keep ++a copy in swap. Valid values are 0 - 3. A value of 0 disables swap ++prefetching, 1 enables it unless laptop_mode is enabled, 2 enables it in the ++presence of laptop_mode, and 3 enables it unconditionally, ignoring whether ++the system is idle or not. If set to 0, swap prefetch will not even try to keep ++record of ram swapped out to have the most minimal impact on performance. ++ ++The default value is 1. ++ ++============================================================== ++ ++swap_prefetch_delay ++ ++This is the time in seconds that swap prefetching is delayed upon finding ++the system is not idle (ie the vm is busy or non-niced cpu load is present). ++ ++The default value is 1. 
++ ++============================================================== ++ ++swap_prefetch_sleep ++ ++This is the time in seconds that the swap prefetch kernel thread is put to ++sleep for when the ram is found to be full and it is unable to prefetch ++further. ++ ++The default value is 5. +Index: linux-2.6.22-ck1/include/linux/swap.h +=================================================================== +--- linux-2.6.22-ck1.orig/include/linux/swap.h 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/include/linux/swap.h 2007-07-10 14:55:22.000000000 +1000 +@@ -180,6 +180,7 @@ + /* linux/mm/swap.c */ + extern void FASTCALL(lru_cache_add(struct page *)); + extern void FASTCALL(lru_cache_add_active(struct page *)); ++extern void FASTCALL(lru_cache_add_tail(struct page *)); + extern void FASTCALL(activate_page(struct page *)); + extern void FASTCALL(mark_page_accessed(struct page *)); + extern void lru_add_drain(void); +@@ -188,9 +189,11 @@ + extern void swap_setup(void); + + /* linux/mm/vmscan.c */ +-extern unsigned long try_to_free_pages(struct zone **, gfp_t); ++extern unsigned long try_to_free_pages(struct zone **, gfp_t, ++ struct task_struct *p); + extern unsigned long shrink_all_memory(unsigned long nr_pages); +-extern int vm_swappiness; ++extern int vm_mapped; ++extern int vm_hardmaplimit; + extern int remove_mapping(struct address_space *mapping, struct page *page); + extern long vm_total_pages; + +@@ -237,6 +240,7 @@ + extern struct page * lookup_swap_cache(swp_entry_t); + extern struct page * read_swap_cache_async(swp_entry_t, struct vm_area_struct *vma, + unsigned long addr); ++extern int add_to_swap_cache(struct page *page, swp_entry_t entry); + /* linux/mm/swapfile.c */ + extern long total_swap_pages; + extern unsigned int nr_swapfiles; +Index: linux-2.6.22-ck1/init/Kconfig +=================================================================== +--- linux-2.6.22-ck1.orig/init/Kconfig 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/init/Kconfig 
2007-07-10 14:55:22.000000000 +1000 +@@ -105,6 +105,28 @@ + used to provide more virtual memory than the actual RAM present + in your computer. If unsure say Y. + ++config SWAP_PREFETCH ++ bool "Support for prefetching swapped memory" ++ depends on SWAP ++ default y ++ ---help--- ++ This option will allow the kernel to prefetch swapped memory pages ++ when idle. The pages will be kept on both swap and in swap_cache ++ thus avoiding the need for further I/O if either ram or swap space ++ is required. ++ ++ What this will do on workstations is slowly bring back applications ++ that have swapped out after memory intensive workloads back into ++ physical ram if you have free ram at a later stage and the machine ++ is relatively idle. This means that when you come back to your ++ computer after leaving it idle for a while, applications will come ++ to life faster. Note that your swap usage will appear to increase ++ but these are cached pages, can be dropped freely by the vm, and it ++ should stabilise around 50% swap usage maximum. ++ ++ Workstations and multiuser workstation servers will most likely want ++ to say Y. 
++ + config SYSVIPC + bool "System V IPC" + ---help--- +Index: linux-2.6.22-ck1/mm/Makefile +=================================================================== +--- linux-2.6.22-ck1.orig/mm/Makefile 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/mm/Makefile 2007-07-10 14:55:22.000000000 +1000 +@@ -17,6 +17,7 @@ + obj-y += bounce.o + endif + obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o ++obj-$(CONFIG_SWAP_PREFETCH) += swap_prefetch.o + obj-$(CONFIG_HUGETLBFS) += hugetlb.o + obj-$(CONFIG_NUMA) += mempolicy.o + obj-$(CONFIG_SPARSEMEM) += sparse.o +Index: linux-2.6.22-ck1/mm/swap.c +=================================================================== +--- linux-2.6.22-ck1.orig/mm/swap.c 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/mm/swap.c 2007-07-10 14:55:23.000000000 +1000 +@@ -17,6 +17,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -176,6 +177,7 @@ + */ + static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, }; + static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, }; ++static DEFINE_PER_CPU(struct pagevec, lru_add_tail_pvecs) = { 0, }; + + void fastcall lru_cache_add(struct page *page) + { +@@ -197,6 +199,31 @@ + put_cpu_var(lru_add_active_pvecs); + } + ++static void __pagevec_lru_add_tail(struct pagevec *pvec) ++{ ++ int i; ++ struct zone *zone = NULL; ++ ++ for (i = 0; i < pagevec_count(pvec); i++) { ++ struct page *page = pvec->pages[i]; ++ struct zone *pagezone = page_zone(page); ++ ++ if (pagezone != zone) { ++ if (zone) ++ spin_unlock_irq(&zone->lru_lock); ++ zone = pagezone; ++ spin_lock_irq(&zone->lru_lock); ++ } ++ BUG_ON(PageLRU(page)); ++ SetPageLRU(page); ++ add_page_to_inactive_list_tail(zone, page); ++ } ++ if (zone) ++ spin_unlock_irq(&zone->lru_lock); ++ release_pages(pvec->pages, pvec->nr, pvec->cold); ++ pagevec_reinit(pvec); ++} ++ + static void __lru_add_drain(int cpu) + { + struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu); +@@ -207,6 
+234,9 @@ + pvec = &per_cpu(lru_add_active_pvecs, cpu); + if (pagevec_count(pvec)) + __pagevec_lru_add_active(pvec); ++ pvec = &per_cpu(lru_add_tail_pvecs, cpu); ++ if (pagevec_count(pvec)) ++ __pagevec_lru_add_tail(pvec); + } + + void lru_add_drain(void) +@@ -403,6 +433,20 @@ + } + + /* ++ * Function used uniquely to put pages back to the lru at the end of the ++ * inactive list to preserve the lru order. ++ */ ++void fastcall lru_cache_add_tail(struct page *page) ++{ ++ struct pagevec *pvec = &get_cpu_var(lru_add_tail_pvecs); ++ ++ page_cache_get(page); ++ if (!pagevec_add(pvec, page)) ++ __pagevec_lru_add_tail(pvec); ++ put_cpu_var(lru_add_pvecs); ++} ++ ++/* + * Try to drop buffers from the pages in a pagevec + */ + void pagevec_strip(struct pagevec *pvec) +@@ -514,6 +558,9 @@ + * Right now other parts of the system means that we + * _really_ don't want to cluster much more + */ ++ ++ prepare_swap_prefetch(); ++ + #ifdef CONFIG_HOTPLUG_CPU + hotcpu_notifier(cpu_swap_callback, 0); + #endif +Index: linux-2.6.22-ck1/mm/swap_prefetch.c +=================================================================== +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ linux-2.6.22-ck1/mm/swap_prefetch.c 2007-07-10 14:55:22.000000000 +1000 +@@ -0,0 +1,542 @@ ++/* ++ * linux/mm/swap_prefetch.c ++ * ++ * Copyright (C) 2005-2007 Con Kolivas ++ * ++ * Written by Con Kolivas ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * sysctls: ++ * swap_prefetch: 0. Disable swap prefetching ++ * 1. Prefetch only when idle and not with laptop_mode ++ * 2. Prefetch when idle and with laptop_mode ++ * 3. Prefetch at all times. 
++ * swap_prefetch_delay: Number of seconds to delay prefetching when system ++ * is not idle. ++ * swap_prefetch_sleep: Number of seconds to put kprefetchd to sleep when ++ * unable to prefetch. ++ */ ++int swap_prefetch __read_mostly = 1; ++int swap_prefetch_delay __read_mostly = 1; ++int swap_prefetch_sleep __read_mostly = 5; ++ ++#define PREFETCH_DELAY (HZ * swap_prefetch_delay) ++#define PREFETCH_SLEEP ((HZ * swap_prefetch_sleep) ? : 1) ++ ++struct swapped_root { ++ unsigned long busy; /* vm busy */ ++ spinlock_t lock; /* protects all data */ ++ struct list_head list; /* MRU list of swapped pages */ ++ struct radix_tree_root swap_tree; /* Lookup tree of pages */ ++ unsigned int count; /* Number of entries */ ++ unsigned int maxcount; /* Maximum entries allowed */ ++ struct kmem_cache *cache; /* Of struct swapped_entry */ ++}; ++ ++static struct swapped_root swapped = { ++ .lock = SPIN_LOCK_UNLOCKED, ++ .list = LIST_HEAD_INIT(swapped.list), ++ .swap_tree = RADIX_TREE_INIT(GFP_ATOMIC), ++}; ++ ++static struct task_struct *kprefetchd_task; ++ ++/* ++ * We check to see no part of the vm is busy. If it is this will interrupt ++ * trickle_swap and wait another PREFETCH_DELAY. Purposefully racy. ++ */ ++inline void delay_swap_prefetch(void) ++{ ++ if (!test_bit(0, &swapped.busy)) ++ __set_bit(0, &swapped.busy); ++} ++ ++/* ++ * If laptop_mode is enabled don't prefetch to avoid hard drives ++ * doing unnecessary spin-ups unless swap_prefetch is explicitly ++ * set to a higher value. ++ */ ++static inline int prefetch_enabled(void) ++{ ++ if (swap_prefetch <= laptop_mode) ++ return 0; ++ return 1; ++} ++ ++static int kprefetchd_awake; ++ ++/* ++ * Drop behind accounting which keeps a list of the most recently used swap ++ * entries. Entries are removed lazily by kprefetchd. 
++ */ ++void add_to_swapped_list(struct page *page) ++{ ++ struct swapped_entry *entry; ++ unsigned long index, flags; ++ ++ if (!prefetch_enabled()) ++ goto out; ++ ++ spin_lock_irqsave(&swapped.lock, flags); ++ if (swapped.count >= swapped.maxcount) { ++ /* ++ * Once the number of entries exceeds maxcount we start ++ * removing the least recently used entries. ++ */ ++ entry = list_entry(swapped.list.next, ++ struct swapped_entry, swapped_list); ++ radix_tree_delete(&swapped.swap_tree, entry->swp_entry.val); ++ list_del(&entry->swapped_list); ++ swapped.count--; ++ } else { ++ entry = kmem_cache_alloc(swapped.cache, GFP_ATOMIC); ++ if (unlikely(!entry)) ++ /* bad, can't allocate more mem */ ++ goto out_locked; ++ } ++ ++ index = page_private(page); ++ entry->swp_entry.val = index; ++ /* ++ * On numa we need to store the node id to ensure that we prefetch to ++ * the same node it came from. ++ */ ++ store_swap_entry_node(entry, page); ++ ++ if (likely(!radix_tree_insert(&swapped.swap_tree, index, entry))) { ++ list_add(&entry->swapped_list, &swapped.list); ++ swapped.count++; ++ } else ++ kmem_cache_free(swapped.cache, entry); ++ ++out_locked: ++ spin_unlock_irqrestore(&swapped.lock, flags); ++out: ++ if (!kprefetchd_awake) ++ wake_up_process(kprefetchd_task); ++ return; ++} ++ ++/* ++ * Removes entries from the swapped_list. The radix tree allows us to quickly ++ * look up the entry from the index without having to iterate over the whole ++ * list. 
++ */ ++static void remove_from_swapped_list(const unsigned long index) ++{ ++ struct swapped_entry *entry; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&swapped.lock, flags); ++ entry = radix_tree_delete(&swapped.swap_tree, index); ++ if (likely(entry)) { ++ list_del(&entry->swapped_list); ++ swapped.count--; ++ kmem_cache_free(swapped.cache, entry); ++ } ++ spin_unlock_irqrestore(&swapped.lock, flags); ++} ++ ++enum trickle_return { ++ TRICKLE_SUCCESS, ++ TRICKLE_FAILED, ++ TRICKLE_DELAY, ++}; ++ ++struct node_stats { ++ /* Free ram after a cycle of prefetching */ ++ unsigned long last_free; ++ /* Free ram on this cycle of checking prefetch_suitable */ ++ unsigned long current_free; ++ /* The amount of free ram before we start prefetching */ ++ unsigned long highfree[MAX_NR_ZONES]; ++ /* The amount of free ram where we will stop prefetching */ ++ unsigned long lowfree[MAX_NR_ZONES]; ++ /* highfree or lowfree depending on whether we've hit a watermark */ ++ unsigned long *pointfree[MAX_NR_ZONES]; ++}; ++ ++/* ++ * prefetch_stats stores the free ram data of each node and this is used to ++ * determine if a node is suitable for prefetching into. ++ */ ++struct prefetch_stats { ++ /* Which nodes are currently suited to prefetching */ ++ nodemask_t prefetch_nodes; ++ /* Total pages we've prefetched on this wakeup of kprefetchd */ ++ unsigned long prefetched_pages; ++ struct node_stats node[MAX_NUMNODES]; ++}; ++ ++static struct prefetch_stats sp_stat; ++ ++/* ++ * This tries to read a swp_entry_t into swap cache for swap prefetching. ++ * If it returns TRICKLE_DELAY we should delay further prefetching. 
++ */ ++static enum trickle_return trickle_swap_cache_async(const swp_entry_t entry, ++ const int node) ++{ ++ enum trickle_return ret = TRICKLE_FAILED; ++ unsigned long flags; ++ struct page *page; ++ ++ read_lock_irqsave(&swapper_space.tree_lock, flags); ++ /* Entry may already exist */ ++ page = radix_tree_lookup(&swapper_space.page_tree, entry.val); ++ read_unlock_irqrestore(&swapper_space.tree_lock, flags); ++ if (page) ++ goto out; ++ ++ /* ++ * Get a new page to read from swap. We have already checked the ++ * watermarks so __alloc_pages will not call on reclaim. ++ */ ++ page = alloc_pages_node(node, GFP_HIGHUSER & ~__GFP_WAIT, 0); ++ if (unlikely(!page)) { ++ ret = TRICKLE_DELAY; ++ goto out; ++ } ++ ++ if (add_to_swap_cache(page, entry)) { ++ /* Failed to add to swap cache */ ++ goto out_release; ++ } ++ ++ /* Add them to the tail of the inactive list to preserve LRU order */ ++ lru_cache_add_tail(page); ++ if (unlikely(swap_readpage(NULL, page))) ++ goto out_release; ++ ++ sp_stat.prefetched_pages++; ++ sp_stat.node[node].last_free--; ++ ++ ret = TRICKLE_SUCCESS; ++out_release: ++ page_cache_release(page); ++out: ++ /* ++ * All entries are removed here lazily. This avoids the cost of ++ * remove_from_swapped_list during normal swapin. Thus there are ++ * usually many stale entries. ++ */ ++ remove_from_swapped_list(entry.val); ++ return ret; ++} ++ ++static void clear_last_prefetch_free(void) ++{ ++ int node; ++ ++ /* ++ * Reset the nodes suitable for prefetching to all nodes. We could ++ * update the data to take into account memory hotplug if desired.. 
++ */ ++ sp_stat.prefetch_nodes = node_online_map; ++ for_each_node_mask(node, sp_stat.prefetch_nodes) { ++ struct node_stats *ns = &sp_stat.node[node]; ++ ++ ns->last_free = 0; ++ } ++} ++ ++static void clear_current_prefetch_free(void) ++{ ++ int node; ++ ++ sp_stat.prefetch_nodes = node_online_map; ++ for_each_node_mask(node, sp_stat.prefetch_nodes) { ++ struct node_stats *ns = &sp_stat.node[node]; ++ ++ ns->current_free = 0; ++ } ++} ++ ++/* ++ * This updates the high and low watermarks of amount of free ram in each ++ * node used to start and stop prefetching. We prefetch from pages_high * 4 ++ * down to pages_high * 3. ++ */ ++static void examine_free_limits(void) ++{ ++ struct zone *z; ++ ++ for_each_zone(z) { ++ struct node_stats *ns; ++ int idx; ++ ++ if (!populated_zone(z)) ++ continue; ++ ++ ns = &sp_stat.node[zone_to_nid(z)]; ++ idx = zone_idx(z); ++ ns->lowfree[idx] = z->pages_high * 3; ++ ns->highfree[idx] = ns->lowfree[idx] + z->pages_high; ++ ++ if (zone_page_state(z, NR_FREE_PAGES) > ns->highfree[idx]) { ++ /* ++ * We've gotten above the high watermark of free pages ++ * so we can start prefetching till we get to the low ++ * watermark. ++ */ ++ ns->pointfree[idx] = &ns->lowfree[idx]; ++ } ++ } ++} ++ ++/* ++ * We want to be absolutely certain it's ok to start prefetching. ++ */ ++static enum trickle_return prefetch_suitable(void) ++{ ++ enum trickle_return ret = TRICKLE_DELAY; ++ struct zone *z; ++ int node; ++ ++ /* ++ * If swap_prefetch is set to a high value we can ignore load ++ * and prefetch whenever we can. Otherwise we test for vm and ++ * cpu activity. ++ */ ++ if (swap_prefetch < 3) { ++ /* Purposefully racy, may return false positive */ ++ if (test_bit(0, &swapped.busy)) { ++ __clear_bit(0, &swapped.busy); ++ goto out; ++ } ++ ++ /* ++ * above_background_load is expensive so we only perform it ++ * every SWAP_CLUSTER_MAX prefetched_pages. 
++ * We test to see if we're above_background_load as disk ++ * activity even at low priority can cause interrupt induced ++ * scheduling latencies. ++ */ ++ if (!(sp_stat.prefetched_pages % SWAP_CLUSTER_MAX) && ++ above_background_load()) ++ goto out; ++ } ++ clear_current_prefetch_free(); ++ ++ /* ++ * Have some hysteresis between where page reclaiming and prefetching ++ * will occur to prevent ping-ponging between them. ++ */ ++ for_each_zone(z) { ++ struct node_stats *ns; ++ unsigned long free; ++ int idx; ++ ++ if (!populated_zone(z)) ++ continue; ++ ++ node = zone_to_nid(z); ++ ns = &sp_stat.node[node]; ++ idx = zone_idx(z); ++ ++ free = zone_page_state(z, NR_FREE_PAGES); ++ if (free < *ns->pointfree[idx]) { ++ /* ++ * Free pages have dropped below the low watermark so ++ * we won't start prefetching again till we hit the ++ * high watermark of free pages. ++ */ ++ ns->pointfree[idx] = &ns->highfree[idx]; ++ node_clear(node, sp_stat.prefetch_nodes); ++ continue; ++ } ++ ns->current_free += free; ++ } ++ ++ /* ++ * We iterate over each node testing to see if it is suitable for ++ * prefetching and clear the nodemask if it is not. ++ */ ++ for_each_node_mask(node, sp_stat.prefetch_nodes) { ++ struct node_stats *ns = &sp_stat.node[node]; ++ ++ /* ++ * We check to see that pages are not being allocated ++ * elsewhere at any significant rate implying any ++ * degree of memory pressure (eg during file reads) ++ */ ++ if (ns->last_free) { ++ if (ns->current_free + SWAP_CLUSTER_MAX < ++ ns->last_free) { ++ ns->last_free = ns->current_free; ++ node_clear(node, ++ sp_stat.prefetch_nodes); ++ continue; ++ } ++ } else ++ ns->last_free = ns->current_free; ++ ++ /* We shouldn't prefetch when we are doing writeback */ ++ if (node_page_state(node, NR_WRITEBACK)) ++ node_clear(node, sp_stat.prefetch_nodes); ++ } ++ ++ /* Nothing suitable, put kprefetchd back to sleep */ ++ if (nodes_empty(sp_stat.prefetch_nodes)) ++ return TRICKLE_FAILED; ++ ++ /* Survived all that? 
Hooray we can prefetch! */ ++ ret = TRICKLE_SUCCESS; ++out: ++ return ret; ++} ++ ++/* ++ * trickle_swap is the main function that initiates the swap prefetching. It ++ * first checks to see if the busy flag is set, and does not prefetch if it ++ * is, as the flag implied we are low on memory or swapping in currently. ++ * Otherwise it runs until prefetch_suitable fails which occurs when the ++ * vm is busy, we prefetch to the watermark, the list is empty or we have ++ * iterated over all entries once. ++ */ ++static enum trickle_return trickle_swap(void) ++{ ++ enum trickle_return suitable, ret = TRICKLE_DELAY; ++ struct swapped_entry *pos, *n; ++ unsigned long flags; ++ ++ if (!prefetch_enabled()) ++ return ret; ++ ++ examine_free_limits(); ++ suitable = prefetch_suitable(); ++ if (suitable != TRICKLE_SUCCESS) ++ return suitable; ++ if (list_empty(&swapped.list)) { ++ kprefetchd_awake = 0; ++ return TRICKLE_FAILED; ++ } ++ ++ spin_lock_irqsave(&swapped.lock, flags); ++ list_for_each_entry_safe_reverse(pos, n, &swapped.list, swapped_list) { ++ swp_entry_t swp_entry; ++ int node; ++ ++ spin_unlock_irqrestore(&swapped.lock, flags); ++ cond_resched(); ++ suitable = prefetch_suitable(); ++ if (suitable != TRICKLE_SUCCESS) { ++ ret = suitable; ++ goto out_unlocked; ++ } ++ ++ spin_lock_irqsave(&swapped.lock, flags); ++ if (unlikely(!pos)) ++ continue; ++ node = get_swap_entry_node(pos); ++ if (!node_isset(node, sp_stat.prefetch_nodes)) { ++ /* ++ * We found an entry that belongs to a node that is ++ * not suitable for prefetching so skip it. 
++ */ ++ continue; ++ } ++ swp_entry = pos->swp_entry; ++ spin_unlock_irqrestore(&swapped.lock, flags); ++ ++ if (trickle_swap_cache_async(swp_entry, node) == TRICKLE_DELAY) ++ goto out_unlocked; ++ spin_lock_irqsave(&swapped.lock, flags); ++ } ++ spin_unlock_irqrestore(&swapped.lock, flags); ++ ++out_unlocked: ++ if (sp_stat.prefetched_pages) { ++ lru_add_drain(); ++ sp_stat.prefetched_pages = 0; ++ } ++ return ret; ++} ++ ++static int kprefetchd(void *__unused) ++{ ++ struct sched_param param = { .sched_priority = 0 }; ++ ++ sched_setscheduler(current, SCHED_BATCH, &param); ++ set_user_nice(current, 19); ++ /* Set ioprio to lowest if supported by i/o scheduler */ ++ sys_ioprio_set(IOPRIO_WHO_PROCESS, IOPRIO_BE_NR - 1, IOPRIO_CLASS_BE); ++ ++ while (!kthread_should_stop()) { ++ try_to_freeze(); ++ ++ if (!kprefetchd_awake) { ++ set_current_state(TASK_INTERRUPTIBLE); ++ schedule(); ++ kprefetchd_awake = 1; ++ } ++ ++ if (trickle_swap() == TRICKLE_FAILED) ++ schedule_timeout_interruptible(PREFETCH_SLEEP); ++ else ++ schedule_timeout_interruptible(PREFETCH_DELAY); ++ clear_last_prefetch_free(); ++ } ++ return 0; ++} ++ ++/* ++ * Create kmem cache for swapped entries ++ */ ++void __init prepare_swap_prefetch(void) ++{ ++ struct zone *zone; ++ ++ swapped.cache = kmem_cache_create("swapped_entry", ++ sizeof(struct swapped_entry), 0, SLAB_PANIC, NULL, NULL); ++ ++ /* ++ * We set the limit to more entries than the physical ram. ++ * We remove entries lazily so we need some headroom. 
++ */ ++ swapped.maxcount = nr_free_pagecache_pages() * 2; ++ ++ for_each_zone(zone) { ++ struct node_stats *ns; ++ int idx; ++ ++ if (!populated_zone(zone)) ++ continue; ++ ++ ns = &sp_stat.node[zone_to_nid(zone)]; ++ idx = zone_idx(zone); ++ ns->pointfree[idx] = &ns->highfree[idx]; ++ } ++} ++ ++static int __init kprefetchd_init(void) ++{ ++ kprefetchd_task = kthread_run(kprefetchd, NULL, "kprefetchd"); ++ ++ return 0; ++} ++ ++static void __exit kprefetchd_exit(void) ++{ ++ kthread_stop(kprefetchd_task); ++} ++ ++module_init(kprefetchd_init); ++module_exit(kprefetchd_exit); +Index: linux-2.6.22-ck1/mm/swap_state.c +=================================================================== +--- linux-2.6.22-ck1.orig/mm/swap_state.c 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/mm/swap_state.c 2007-07-10 14:55:22.000000000 +1000 +@@ -10,6 +10,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -95,7 +96,7 @@ + return error; + } + +-static int add_to_swap_cache(struct page *page, swp_entry_t entry) ++int add_to_swap_cache(struct page *page, swp_entry_t entry) + { + int error; + +@@ -148,6 +149,9 @@ + swp_entry_t entry; + int err; + ++ /* Swap prefetching is delayed if we're swapping pages */ ++ delay_swap_prefetch(); ++ + BUG_ON(!PageLocked(page)); + + for (;;) { +@@ -320,6 +324,9 @@ + struct page *found_page, *new_page = NULL; + int err; + ++ /* Swap prefetching is delayed if we're already reading from swap */ ++ delay_swap_prefetch(); ++ + do { + /* + * First check the swap cache. 
Since this is normally +Index: linux-2.6.22-ck1/mm/vmscan.c +=================================================================== +--- linux-2.6.22-ck1.orig/mm/vmscan.c 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/mm/vmscan.c 2007-07-10 14:55:23.000000000 +1000 +@@ -16,6 +16,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -36,6 +37,7 @@ + #include + #include + #include ++#include + #include + + #include +@@ -63,7 +65,7 @@ + * whole list at once. */ + int swap_cluster_max; + +- int swappiness; ++ int mapped; + + int all_unreclaimable; + }; +@@ -110,9 +112,10 @@ + #endif + + /* +- * From 0 .. 100. Higher means more swappy. ++ * From 0 .. 100. Lower means more swappy. + */ +-int vm_swappiness = 60; ++int vm_mapped __read_mostly = 66; ++int vm_hardmaplimit __read_mostly = 1; + long vm_total_pages; /* The total number of pages which the VM controls */ + + static LIST_HEAD(shrinker_list); +@@ -803,10 +806,14 @@ + * The distress ratio is important - we don't want to start + * going oom. + * +- * A 100% value of vm_swappiness overrides this algorithm +- * altogether. ++ * This distress value is ignored if we apply a hardmaplimit except ++ * in extreme distress. ++ * ++ * A 0% value of vm_mapped overrides this algorithm altogether. + */ +- swap_tendency = mapped_ratio / 2 + distress + sc->swappiness; ++ swap_tendency = mapped_ratio * 100 / (sc->mapped + 1); ++ if (!vm_hardmaplimit || distress == 100) ++ swap_tendency += distress; + + /* + * Now use this metric to decide whether to start moving mapped +@@ -955,6 +962,41 @@ + } + + /* ++ * Helper functions to adjust nice level of kswapd, based on the priority of ++ * the task (p) that called it. If it is already higher priority we do not ++ * demote its nice level since it is still working on behalf of a higher ++ * priority task. With kernel threads we leave it at nice 0. 
++ * ++ * We don't ever run kswapd real time, so if a real time task calls kswapd we ++ * set it to highest SCHED_NORMAL priority. ++ */ ++static int effective_sc_prio(struct task_struct *p) ++{ ++ if (likely(p->mm)) { ++ if (rt_task(p)) ++ return -20; ++ if (idleprio_task(p)) ++ return 19; ++ return task_nice(p); ++ } ++ return 0; ++} ++ ++static void set_kswapd_nice(struct task_struct *kswapd, struct task_struct *p, ++ int active) ++{ ++ long nice = effective_sc_prio(p); ++ ++ if (task_nice(kswapd) > nice || !active) ++ set_user_nice(kswapd, nice); ++} ++ ++static int sc_priority(struct task_struct *p) ++{ ++ return (DEF_PRIORITY + (DEF_PRIORITY * effective_sc_prio(p) / 40)); ++} ++ ++/* + * This is the direct reclaim path, for page-allocating processes. We only + * try to reclaim pages from zones which will satisfy the caller's allocation + * request. +@@ -1011,7 +1053,8 @@ + * holds filesystem locks which prevent writeout this might not work, and the + * allocation attempt will fail. 
+ */ +-unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask) ++unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask, ++ struct task_struct *p) + { + int priority; + int ret = 0; +@@ -1019,15 +1062,20 @@ + unsigned long nr_reclaimed = 0; + struct reclaim_state *reclaim_state = current->reclaim_state; + unsigned long lru_pages = 0; +- int i; ++ int i, scan_priority = DEF_PRIORITY; + struct scan_control sc = { + .gfp_mask = gfp_mask, + .may_writepage = !laptop_mode, + .swap_cluster_max = SWAP_CLUSTER_MAX, + .may_swap = 1, +- .swappiness = vm_swappiness, ++ .mapped = vm_mapped, + }; + ++ if (p) ++ scan_priority = sc_priority(p); ++ ++ delay_swap_prefetch(); ++ + count_vm_event(ALLOCSTALL); + + for (i = 0; zones[i] != NULL; i++) { +@@ -1040,7 +1088,7 @@ + + zone_page_state(zone, NR_INACTIVE); + } + +- for (priority = DEF_PRIORITY; priority >= 0; priority--) { ++ for (priority = scan_priority; priority >= 0; priority--) { + sc.nr_scanned = 0; + if (!priority) + disable_swap_token(); +@@ -1070,7 +1118,7 @@ + } + + /* Take a nap, wait for some writeback to complete */ +- if (sc.nr_scanned && priority < DEF_PRIORITY - 2) ++ if (sc.nr_scanned && priority < scan_priority - 2) + congestion_wait(WRITE, HZ/10); + } + /* top priority shrink_caches still had more to do? 
don't OOM, then */ +@@ -1120,9 +1168,9 @@ + */ + static unsigned long balance_pgdat(pg_data_t *pgdat, int order) + { +- int all_zones_ok; ++ int all_zones_ok = 0; + int priority; +- int i; ++ int i, scan_priority; + unsigned long total_scanned; + unsigned long nr_reclaimed; + struct reclaim_state *reclaim_state = current->reclaim_state; +@@ -1130,7 +1178,7 @@ + .gfp_mask = GFP_KERNEL, + .may_swap = 1, + .swap_cluster_max = SWAP_CLUSTER_MAX, +- .swappiness = vm_swappiness, ++ .mapped = vm_mapped, + }; + /* + * temp_priority is used to remember the scanning priority at which +@@ -1138,6 +1186,8 @@ + */ + int temp_priority[MAX_NR_ZONES]; + ++ scan_priority = sc_priority(pgdat->kswapd); ++ + loop_again: + total_scanned = 0; + nr_reclaimed = 0; +@@ -1145,9 +1195,9 @@ + count_vm_event(PAGEOUTRUN); + + for (i = 0; i < pgdat->nr_zones; i++) +- temp_priority[i] = DEF_PRIORITY; ++ temp_priority[i] = scan_priority; + +- for (priority = DEF_PRIORITY; priority >= 0; priority--) { ++ for (priority = scan_priority; priority >= 0; priority--) { + int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ + unsigned long lru_pages = 0; + +@@ -1163,15 +1213,22 @@ + */ + for (i = pgdat->nr_zones - 1; i >= 0; i--) { + struct zone *zone = pgdat->node_zones + i; ++ unsigned long watermark; + + if (!populated_zone(zone)) + continue; + +- if (zone->all_unreclaimable && priority != DEF_PRIORITY) ++ if (zone->all_unreclaimable && priority != scan_priority) + continue; + +- if (!zone_watermark_ok(zone, order, zone->pages_high, +- 0, 0)) { ++ /* ++ * The watermark is relaxed depending on the ++ * level of "priority" till it drops to ++ * pages_high. 
++ */ ++ watermark = zone->pages_high + (zone->pages_high * ++ priority / scan_priority); ++ if (!zone_watermark_ok(zone, order, watermark, 0, 0)) { + end_zone = i; + break; + } +@@ -1198,14 +1255,18 @@ + for (i = 0; i <= end_zone; i++) { + struct zone *zone = pgdat->node_zones + i; + int nr_slab; ++ unsigned long watermark; + + if (!populated_zone(zone)) + continue; + +- if (zone->all_unreclaimable && priority != DEF_PRIORITY) ++ if (zone->all_unreclaimable && priority != scan_priority) + continue; + +- if (!zone_watermark_ok(zone, order, zone->pages_high, ++ watermark = zone->pages_high + (zone->pages_high * ++ priority / scan_priority); ++ ++ if (!zone_watermark_ok(zone, order, watermark, + end_zone, 0)) + all_zones_ok = 0; + temp_priority[i] = priority; +@@ -1238,7 +1299,7 @@ + * OK, kswapd is getting into trouble. Take a nap, then take + * another pass across the zones. + */ +- if (total_scanned && priority < DEF_PRIORITY - 2) ++ if (total_scanned && priority < scan_priority - 2) + congestion_wait(WRITE, HZ/10); + + /* +@@ -1272,6 +1333,8 @@ + return nr_reclaimed; + } + ++#define WT_EXPIRY (HZ * 5) /* Time to wakeup watermark_timer */ ++ + /* + * The background pageout daemon, started as a kernel thread + * from the init process. +@@ -1319,6 +1382,8 @@ + for ( ; ; ) { + unsigned long new_order; + ++ /* kswapd has been busy so delay watermark_timer */ ++ mod_timer(&pgdat->watermark_timer, jiffies + WT_EXPIRY); + prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); + new_order = pgdat->kswapd_max_order; + pgdat->kswapd_max_order = 0; +@@ -1332,6 +1397,7 @@ + if (!freezing(current)) + schedule(); + ++ set_user_nice(tsk, 0); + order = pgdat->kswapd_max_order; + } + finish_wait(&pgdat->kswapd_wait, &wait); +@@ -1349,9 +1415,10 @@ + /* + * A zone is low on free memory, so wake its kswapd task to service it. 
+ */ +-void wakeup_kswapd(struct zone *zone, int order) ++void wakeup_kswapd(struct zone *zone, int order, struct task_struct *p) + { + pg_data_t *pgdat; ++ int active; + + if (!populated_zone(zone)) + return; +@@ -1363,7 +1430,9 @@ + pgdat->kswapd_max_order = order; + if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) + return; +- if (!waitqueue_active(&pgdat->kswapd_wait)) ++ active = waitqueue_active(&pgdat->kswapd_wait); ++ set_kswapd_nice(pgdat->kswapd, p, active); ++ if (!active) + return; + wake_up_interruptible(&pgdat->kswapd_wait); + } +@@ -1382,6 +1451,8 @@ + struct zone *zone; + unsigned long nr_to_scan, ret = 0; + ++ delay_swap_prefetch(); ++ + for_each_zone(zone) { + + if (!populated_zone(zone)) +@@ -1441,7 +1512,7 @@ + .may_swap = 0, + .swap_cluster_max = nr_pages, + .may_writepage = 1, +- .swappiness = vm_swappiness, ++ .mapped = vm_mapped, + }; + + current->reclaim_state = &reclaim_state; +@@ -1476,7 +1547,7 @@ + /* Force reclaiming mapped pages in the passes #3 and #4 */ + if (pass > 2) { + sc.may_swap = 1; +- sc.swappiness = 100; ++ sc.mapped = 0; + } + + for (prio = DEF_PRIORITY; prio >= 0; prio--) { +@@ -1540,20 +1611,57 @@ + } + + /* ++ * We wake up kswapd every WT_EXPIRY till free ram is above pages_lots ++ */ ++static void watermark_wakeup(unsigned long data) ++{ ++ pg_data_t *pgdat = (pg_data_t *)data; ++ struct timer_list *wt = &pgdat->watermark_timer; ++ int i; ++ ++ if (!waitqueue_active(&pgdat->kswapd_wait) || above_background_load()) ++ goto out; ++ for (i = pgdat->nr_zones - 1; i >= 0; i--) { ++ struct zone *z = pgdat->node_zones + i; ++ ++ if (!populated_zone(z) || is_highmem(z)) { ++ /* We are better off leaving highmem full */ ++ continue; ++ } ++ if (!zone_watermark_ok(z, 0, z->pages_lots, 0, 0)) { ++ wake_up_interruptible(&pgdat->kswapd_wait); ++ goto out; ++ } ++ } ++out: ++ mod_timer(wt, jiffies + WT_EXPIRY); ++ return; ++} ++ ++/* + * This kswapd start function will be called by init and node-hot-add. 
+ * On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added. + */ + int kswapd_run(int nid) + { + pg_data_t *pgdat = NODE_DATA(nid); ++ struct timer_list *wt; + int ret = 0; + + if (pgdat->kswapd) + return 0; + ++ wt = &pgdat->watermark_timer; ++ init_timer(wt); ++ wt->data = (unsigned long)pgdat; ++ wt->function = watermark_wakeup; ++ wt->expires = jiffies + WT_EXPIRY; ++ add_timer(wt); ++ + pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid); + if (IS_ERR(pgdat->kswapd)) { + /* failure at boot is fatal */ ++ del_timer(wt); + BUG_ON(system_state == SYSTEM_BOOTING); + printk("Failed to start kswapd on node %d\n",nid); + ret = -1; +@@ -1624,7 +1732,7 @@ + .swap_cluster_max = max_t(unsigned long, nr_pages, + SWAP_CLUSTER_MAX), + .gfp_mask = gfp_mask, +- .swappiness = vm_swappiness, ++ .mapped = vm_mapped, + }; + unsigned long slab_reclaimable; + +Index: linux-2.6.22-ck1/include/linux/mm_inline.h +=================================================================== +--- linux-2.6.22-ck1.orig/include/linux/mm_inline.h 2007-07-10 14:54:59.000000000 +1000 ++++ linux-2.6.22-ck1/include/linux/mm_inline.h 2007-07-10 14:55:22.000000000 +1000 +@@ -13,6 +13,13 @@ + } + + static inline void ++add_page_to_inactive_list_tail(struct zone *zone, struct page *page) ++{ ++ list_add_tail(&page->lru, &zone->inactive_list); ++ __inc_zone_state(zone, NR_INACTIVE); ++} ++ ++static inline void + del_page_from_active_list(struct zone *zone, struct page *page) + { + list_del(&page->lru); +Index: linux-2.6.22-ck1/include/linux/swap-prefetch.h +=================================================================== +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ linux-2.6.22-ck1/include/linux/swap-prefetch.h 2007-07-10 14:55:22.000000000 +1000 +@@ -0,0 +1,53 @@ ++#ifndef SWAP_PREFETCH_H_INCLUDED ++#define SWAP_PREFETCH_H_INCLUDED ++ ++#ifdef CONFIG_SWAP_PREFETCH ++/* mm/swap_prefetch.c */ ++extern int swap_prefetch; ++extern int swap_prefetch_delay; ++extern int 
swap_prefetch_sleep; ++ ++struct swapped_entry { ++ swp_entry_t swp_entry; /* The actual swap entry */ ++ struct list_head swapped_list; /* Linked list of entries */ ++#if MAX_NUMNODES > 1 ++ int node; /* Node id */ ++#endif ++} __attribute__((packed)); ++ ++static inline void store_swap_entry_node(struct swapped_entry *entry, ++ struct page *page) ++{ ++#if MAX_NUMNODES > 1 ++ entry->node = page_to_nid(page); ++#endif ++} ++ ++static inline int get_swap_entry_node(struct swapped_entry *entry) ++{ ++#if MAX_NUMNODES > 1 ++ return entry->node; ++#else ++ return 0; ++#endif ++} ++ ++extern void add_to_swapped_list(struct page *page); ++extern void delay_swap_prefetch(void); ++extern void prepare_swap_prefetch(void); ++ ++#else /* CONFIG_SWAP_PREFETCH */ ++static inline void add_to_swapped_list(struct page *__unused) ++{ ++} ++ ++static inline void prepare_swap_prefetch(void) ++{ ++} ++ ++static inline void delay_swap_prefetch(void) ++{ ++} ++#endif /* CONFIG_SWAP_PREFETCH */ ++ ++#endif /* SWAP_PREFETCH_H_INCLUDED */ +Index: linux-2.6.22-ck1/mm/page_io.c +=================================================================== +--- linux-2.6.22-ck1.orig/mm/page_io.c 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/mm/page_io.c 2007-07-10 14:55:22.000000000 +1000 +@@ -17,6 +17,7 @@ + #include + #include + #include ++#include + #include + + static struct bio *get_swap_bio(gfp_t gfp_flags, pgoff_t index, +@@ -118,6 +119,7 @@ + ret = -ENOMEM; + goto out; + } ++ add_to_swapped_list(page); + if (wbc->sync_mode == WB_SYNC_ALL) + rw |= (1 << BIO_RW_SYNC); + count_vm_event(PSWPOUT); +Index: linux-2.6.22-ck1/include/linux/sysctl.h +=================================================================== +--- linux-2.6.22-ck1.orig/include/linux/sysctl.h 2007-07-10 14:54:59.000000000 +1000 ++++ linux-2.6.22-ck1/include/linux/sysctl.h 2007-07-10 14:55:22.000000000 +1000 +@@ -190,7 +190,7 @@ + VM_OVERCOMMIT_RATIO=16, /* percent of RAM to allow overcommit in */ + VM_PAGEBUF=17, /* 
struct: Control pagebuf parameters */ + VM_HUGETLB_PAGES=18, /* int: Number of available Huge Pages */ +- VM_SWAPPINESS=19, /* Tendency to steal mapped memory */ ++ VM_UNUSED19=19, /* was: Tendency to steal mapped memory */ + VM_LOWMEM_RESERVE_RATIO=20,/* reservation ratio for lower memory zones */ + VM_MIN_FREE_KBYTES=21, /* Minimum free kilobytes to maintain */ + VM_MAX_MAP_COUNT=22, /* int: Maximum number of mmaps/address-space */ +Index: linux-2.6.22-ck1/include/linux/mmzone.h +=================================================================== +--- linux-2.6.22-ck1.orig/include/linux/mmzone.h 2007-07-10 14:54:59.000000000 +1000 ++++ linux-2.6.22-ck1/include/linux/mmzone.h 2007-07-10 14:55:23.000000000 +1000 +@@ -13,6 +13,7 @@ + #include + #include + #include ++#include + #include + #include + +@@ -181,7 +182,7 @@ + + struct zone { + /* Fields commonly accessed by the page allocator */ +- unsigned long pages_min, pages_low, pages_high; ++ unsigned long pages_min, pages_low, pages_high, pages_lots; + /* + * We don't know if the memory that we're going to allocate will be freeable + * or/and it will be released eventually, so to avoid totally wasting several +@@ -452,6 +453,7 @@ + wait_queue_head_t kswapd_wait; + struct task_struct *kswapd; + int kswapd_max_order; ++ struct timer_list watermark_timer; + } pg_data_t; + + #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) +@@ -468,7 +470,7 @@ + void get_zone_counts(unsigned long *active, unsigned long *inactive, + unsigned long *free); + void build_all_zonelists(void); +-void wakeup_kswapd(struct zone *zone, int order); ++void wakeup_kswapd(struct zone *zone, int order, struct task_struct *p); + int zone_watermark_ok(struct zone *z, int order, unsigned long mark, + int classzone_idx, int alloc_flags); + enum memmap_context { +Index: linux-2.6.22-ck1/mm/page_alloc.c +=================================================================== +--- linux-2.6.22-ck1.orig/mm/page_alloc.c 2007-07-10 
14:54:59.000000000 +1000 ++++ linux-2.6.22-ck1/mm/page_alloc.c 2007-07-10 14:55:22.000000000 +1000 +@@ -1250,7 +1250,7 @@ + goto nopage; + + for (z = zonelist->zones; *z; z++) +- wakeup_kswapd(*z, order); ++ wakeup_kswapd(*z, order, p); + + /* + * OK, we're below the kswapd watermark and have kicked background +@@ -1314,7 +1314,7 @@ + reclaim_state.reclaimed_slab = 0; + p->reclaim_state = &reclaim_state; + +- did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask); ++ did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask, p); + + p->reclaim_state = NULL; + p->flags &= ~PF_MEMALLOC; +@@ -1570,6 +1570,7 @@ + " min:%lukB" + " low:%lukB" + " high:%lukB" ++ " lots:%lukB" + " active:%lukB" + " inactive:%lukB" + " present:%lukB" +@@ -1581,6 +1582,7 @@ + K(zone->pages_min), + K(zone->pages_low), + K(zone->pages_high), ++ K(zone->pages_lots), + K(zone_page_state(zone, NR_ACTIVE)), + K(zone_page_state(zone, NR_INACTIVE)), + K(zone->present_pages), +@@ -3142,6 +3144,7 @@ + + zone->pages_low = zone->pages_min + (tmp >> 2); + zone->pages_high = zone->pages_min + (tmp >> 1); ++ zone->pages_lots = zone->pages_min + tmp; + spin_unlock_irqrestore(&zone->lru_lock, flags); + } + +Index: linux-2.6.22-ck1/fs/buffer.c +=================================================================== +--- linux-2.6.22-ck1.orig/fs/buffer.c 2007-07-10 14:54:59.000000000 +1000 ++++ linux-2.6.22-ck1/fs/buffer.c 2007-07-10 14:55:22.000000000 +1000 +@@ -356,7 +356,7 @@ + for_each_online_pgdat(pgdat) { + zones = pgdat->node_zonelists[gfp_zone(GFP_NOFS)].zones; + if (*zones) +- try_to_free_pages(zones, GFP_NOFS); ++ try_to_free_pages(zones, GFP_NOFS, NULL); + } + } + +Index: linux-2.6.22-ck1/mm/filemap.c +=================================================================== +--- linux-2.6.22-ck1.orig/mm/filemap.c 2007-07-10 14:54:59.000000000 +1000 ++++ linux-2.6.22-ck1/mm/filemap.c 2007-07-10 14:55:23.000000000 +1000 +@@ -466,6 +466,16 @@ + return ret; + } + ++int 
add_to_page_cache_lru_tail(struct page *page, ++ struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask) ++{ ++ int ret = add_to_page_cache(page, mapping, offset, gfp_mask); ++ ++ if (ret == 0) ++ lru_cache_add_tail(page); ++ return ret; ++} ++ + #ifdef CONFIG_NUMA + struct page *__page_cache_alloc(gfp_t gfp) + { +@@ -839,6 +849,34 @@ + ra->ra_pages /= 4; + } + ++/* ++ * Sysctl which determines whether we should read from large files to the ++ * tail of the inactive lru list. ++ */ ++int vm_tail_largefiles __read_mostly = 1; ++ ++static inline int nr_mapped(void) ++{ ++ return global_page_state(NR_FILE_MAPPED) + ++ global_page_state(NR_ANON_PAGES); ++} ++ ++/* ++ * This examines how large in pages a file size is and returns 1 if it is ++ * more than half the unmapped ram. Avoid doing read_page_state which is ++ * expensive unless we already know it is likely to be large enough. ++ */ ++static int large_isize(unsigned long nr_pages) ++{ ++ if (nr_pages * 6 > vm_total_pages) { ++ unsigned long unmapped_ram = vm_total_pages - nr_mapped(); ++ ++ if (nr_pages * 2 > unmapped_ram) ++ return 1; ++ } ++ return 0; ++} ++ + /** + * do_generic_mapping_read - generic file read routine + * @mapping: address_space to be read +@@ -1051,8 +1089,19 @@ + goto out; + } + } +- error = add_to_page_cache_lru(cached_page, mapping, +- index, GFP_KERNEL); ++ ++ /* ++ * If we know the file is large we add the pages read to the ++ * end of the lru as we're unlikely to be able to cache the ++ * whole file in ram so make those pages the first to be ++ * dropped if not referenced soon. 
++ */ ++ if (vm_tail_largefiles && large_isize(end_index)) ++ error = add_to_page_cache_lru_tail(cached_page, ++ mapping, index, GFP_KERNEL); ++ else ++ error = add_to_page_cache_lru(cached_page, mapping, ++ index, GFP_KERNEL); + if (error) { + if (error == -EEXIST) + goto find_page; +Index: linux-2.6.22-ck1/Documentation/filesystems/proc.txt +=================================================================== +--- linux-2.6.22-ck1.orig/Documentation/filesystems/proc.txt 2007-07-10 14:54:59.000000000 +1000 ++++ linux-2.6.22-ck1/Documentation/filesystems/proc.txt 2007-07-10 14:55:23.000000000 +1000 +@@ -1333,6 +1333,14 @@ + As this is a non-destructive operation and dirty objects are not freeable, the + user should run `sync' first. + ++tail_largefiles ++--------------- ++ ++When enabled reads from large files to the tail end of the inactive lru list. ++This means that any cache from reading large files is dropped very quickly, ++preventing loss of mapped ram and useful pagecache when large files are read. ++This does, however, make caching less effective when working with large files. ++ + + 2.5 /proc/sys/dev - Device specific parameters + ---------------------------------------------- +Index: linux-2.6.22-ck1/arch/i386/Kconfig +=================================================================== +--- linux-2.6.22-ck1.orig/arch/i386/Kconfig 2007-07-10 14:54:59.000000000 +1000 ++++ linux-2.6.22-ck1/arch/i386/Kconfig 2007-07-10 14:55:23.000000000 +1000 +@@ -550,7 +550,7 @@ + + choice + depends on EXPERIMENTAL +- prompt "Memory split" if EMBEDDED ++ prompt "Memory split" + default VMSPLIT_3G + help + Select the desired split between kernel and user memory. +@@ -569,17 +569,17 @@ + option alone! 
+ + config VMSPLIT_3G +- bool "3G/1G user/kernel split" ++ bool "Default 896MB lowmem (3G/1G user/kernel split)" + config VMSPLIT_3G_OPT + depends on !HIGHMEM +- bool "3G/1G user/kernel split (for full 1G low memory)" ++ bool "1GB lowmem (3G/1G user/kernel split)" + config VMSPLIT_2G +- bool "2G/2G user/kernel split" ++ bool "2GB lowmem (2G/2G user/kernel split)" + config VMSPLIT_2G_OPT + depends on !HIGHMEM +- bool "2G/2G user/kernel split (for full 2G low memory)" ++ bool "2GB lowmem (2G/2G user/kernel split)" + config VMSPLIT_1G +- bool "1G/3G user/kernel split" ++ bool "3GB lowmem (1G/3G user/kernel split)" + endchoice + + config PAGE_OFFSET +Index: linux-2.6.22-ck1/kernel/Kconfig.hz +=================================================================== +--- linux-2.6.22-ck1.orig/kernel/Kconfig.hz 2007-07-10 14:54:59.000000000 +1000 ++++ linux-2.6.22-ck1/kernel/Kconfig.hz 2007-07-10 14:55:24.000000000 +1000 +@@ -4,7 +4,7 @@ + + choice + prompt "Timer frequency" +- default HZ_250 ++ default HZ_1000 + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -13,8 +13,7 @@ + contention and cacheline bounces as a result of timer interrupts. + Note that the timer interrupt occurs on each processor in an SMP + environment leading to NR_CPUS * HZ number of timer interrupts +- per second. +- ++ per second.Laptops may also show improved battery life. + + config HZ_100 + bool "100 HZ" +@@ -23,13 +22,14 @@ + with lots of processors that may show reduced performance if + too many timer interrupts are occurring. + +- config HZ_250 ++ config HZ_250_NODEFAULT + bool "250 HZ" + help +- 250 Hz is a good compromise choice allowing server performance +- while also showing good interactive responsiveness even +- on SMP and NUMA systems. If you are going to be using NTSC video +- or multimedia, selected 300Hz instead. 
++ 250 HZ is a lousy compromise choice allowing server interactivity ++ while also showing desktop throughput and no extra power saving on ++ laptops. Good for when you can't make up your mind. ++ ++ Recommend 100 or 1000 instead. + + config HZ_300 + bool "300 HZ" +@@ -45,12 +45,76 @@ + 1000 Hz is the preferred choice for desktop systems and other + systems requiring fast interactive responses to events. + ++ config HZ_1500 ++ bool "1500 HZ" ++ help ++ 1500 Hz is an insane value to use to run broken software that is Hz ++ limited. ++ ++ Being over 1000, driver breakage is likely. ++ ++ config HZ_2000 ++ bool "2000 HZ" ++ help ++ 2000 Hz is an insane value to use to run broken software that is Hz ++ limited. ++ ++ Being over 1000, driver breakage is likely. ++ ++ config HZ_3000 ++ bool "3000 HZ" ++ help ++ 3000 Hz is an insane value to use to run broken software that is Hz ++ limited. ++ ++ Being over 1000, driver breakage is likely. ++ ++ config HZ_4000 ++ bool "4000 HZ" ++ help ++ 4000 Hz is an insane value to use to run broken software that is Hz ++ limited. ++ ++ Being over 1000, driver breakage is likely. ++ ++ config HZ_5000 ++ bool "5000 HZ" ++ help ++ 5000 Hz is an obscene value to use to run broken software that is Hz ++ limited. ++ ++ Being over 1000, driver breakage is likely. ++ ++ config HZ_7500 ++ bool "7500 HZ" ++ help ++ 7500 Hz is an obscene value to use to run broken software that is Hz ++ limited. ++ ++ Being over 1000, driver breakage is likely. ++ ++ config HZ_10000 ++ bool "10000 HZ" ++ help ++ 10000 Hz is an obscene value to use to run broken software that is Hz ++ limited. ++ ++ Being over 1000, driver breakage is likely. 
++ ++ + endchoice + + config HZ + int + default 100 if HZ_100 +- default 250 if HZ_250 ++ default 250 if HZ_250_NODEFAULT + default 300 if HZ_300 + default 1000 if HZ_1000 ++ default 1500 if HZ_1500 ++ default 2000 if HZ_2000 ++ default 3000 if HZ_3000 ++ default 4000 if HZ_4000 ++ default 5000 if HZ_5000 ++ default 7500 if HZ_7500 ++ default 10000 if HZ_10000 + +Index: linux-2.6.22-ck1/arch/i386/defconfig +=================================================================== +--- linux-2.6.22-ck1.orig/arch/i386/defconfig 2007-07-10 14:54:59.000000000 +1000 ++++ linux-2.6.22-ck1/arch/i386/defconfig 2007-07-10 14:55:23.000000000 +1000 +@@ -226,10 +226,10 @@ + # CONFIG_IRQBALANCE is not set + CONFIG_SECCOMP=y + # CONFIG_HZ_100 is not set +-CONFIG_HZ_250=y ++# CONFIG_HZ_250 is not set + # CONFIG_HZ_300 is not set +-# CONFIG_HZ_1000 is not set +-CONFIG_HZ=250 ++CONFIG_HZ_1000=y ++CONFIG_HZ=1000 + # CONFIG_KEXEC is not set + # CONFIG_CRASH_DUMP is not set + CONFIG_PHYSICAL_START=0x100000 +Index: linux-2.6.22-ck1/arch/x86_64/defconfig +=================================================================== +--- linux-2.6.22-ck1.orig/arch/x86_64/defconfig 2007-07-10 14:54:59.000000000 +1000 ++++ linux-2.6.22-ck1/arch/x86_64/defconfig 2007-07-10 14:55:23.000000000 +1000 +@@ -185,10 +185,10 @@ + CONFIG_SECCOMP=y + # CONFIG_CC_STACKPROTECTOR is not set + # CONFIG_HZ_100 is not set +-CONFIG_HZ_250=y ++# CONFIG_HZ_250 is not set + # CONFIG_HZ_300 is not set +-# CONFIG_HZ_1000 is not set +-CONFIG_HZ=250 ++CONFIG_HZ_1000=y ++CONFIG_HZ=1000 + CONFIG_K8_NB=y + CONFIG_GENERIC_HARDIRQS=y + CONFIG_GENERIC_IRQ_PROBE=y +Index: linux-2.6.22-ck1/include/linux/jiffies.h +=================================================================== +--- linux-2.6.22-ck1.orig/include/linux/jiffies.h 2007-07-10 14:54:59.000000000 +1000 ++++ linux-2.6.22-ck1/include/linux/jiffies.h 2007-07-10 14:55:24.000000000 +1000 +@@ -29,6 +29,12 @@ + # define SHIFT_HZ 9 + #elif HZ >= 768 && HZ < 1536 + # define SHIFT_HZ 
10 ++#elif HZ >= 1536 && HZ < 3072 ++# define SHIFT_HZ 11 ++#elif HZ >= 3072 && HZ < 6144 ++# define SHIFT_HZ 12 ++#elif HZ >= 6144 && HZ < 12288 ++# define SHIFT_HZ 13 + #else + # error You lose. + #endif +Index: linux-2.6.22-ck1/include/net/inet_timewait_sock.h +=================================================================== +--- linux-2.6.22-ck1.orig/include/net/inet_timewait_sock.h 2007-07-10 14:54:59.000000000 +1000 ++++ linux-2.6.22-ck1/include/net/inet_timewait_sock.h 2007-07-10 14:55:24.000000000 +1000 +@@ -38,8 +38,8 @@ + * If time > 4sec, it is "slow" path, no recycling is required, + * so that we select tick to get range about 4 seconds. + */ +-#if HZ <= 16 || HZ > 4096 +-# error Unsupported: HZ <= 16 or HZ > 4096 ++#if HZ <= 16 || HZ > 16384 ++# error Unsupported: HZ <= 16 or HZ > 16384 + #elif HZ <= 32 + # define INET_TWDR_RECYCLE_TICK (5 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) + #elif HZ <= 64 +@@ -54,8 +54,12 @@ + # define INET_TWDR_RECYCLE_TICK (10 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) + #elif HZ <= 2048 + # define INET_TWDR_RECYCLE_TICK (11 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) +-#else ++#elif HZ <= 4096 + # define INET_TWDR_RECYCLE_TICK (12 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) ++#elif HZ <= 8192 ++# define INET_TWDR_RECYCLE_TICK (13 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) ++#else ++# define INET_TWDR_RECYCLE_TICK (14 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) + #endif + + /* TIME_WAIT reaping mechanism. */ +Index: linux-2.6.22-ck1/init/calibrate.c +=================================================================== +--- linux-2.6.22-ck1.orig/init/calibrate.c 2007-07-10 14:54:59.000000000 +1000 ++++ linux-2.6.22-ck1/init/calibrate.c 2007-07-10 14:55:24.000000000 +1000 +@@ -122,12 +122,12 @@ + printk("Calibrating delay loop (skipped)... 
" + "%lu.%02lu BogoMIPS preset\n", + loops_per_jiffy/(500000/HZ), +- (loops_per_jiffy/(5000/HZ)) % 100); ++ (loops_per_jiffy * 10/(50000/HZ)) % 100); + } else if ((loops_per_jiffy = calibrate_delay_direct()) != 0) { + printk("Calibrating delay using timer specific routine.. "); + printk("%lu.%02lu BogoMIPS (lpj=%lu)\n", + loops_per_jiffy/(500000/HZ), +- (loops_per_jiffy/(5000/HZ)) % 100, ++ (loops_per_jiffy * 10/(50000/HZ)) % 100, + loops_per_jiffy); + } else { + loops_per_jiffy = (1<<12); +@@ -166,7 +166,7 @@ + /* Round the value and print it */ + printk("%lu.%02lu BogoMIPS (lpj=%lu)\n", + loops_per_jiffy/(500000/HZ), +- (loops_per_jiffy/(5000/HZ)) % 100, ++ (loops_per_jiffy * 10/(50000/HZ)) % 100, + loops_per_jiffy); + } + +Index: linux-2.6.22-ck1/arch/i386/kernel/cpu/proc.c +=================================================================== +--- linux-2.6.22-ck1.orig/arch/i386/kernel/cpu/proc.c 2007-07-10 14:54:59.000000000 +1000 ++++ linux-2.6.22-ck1/arch/i386/kernel/cpu/proc.c 2007-07-10 14:55:24.000000000 +1000 +@@ -157,7 +157,7 @@ + + seq_printf(m, "\nbogomips\t: %lu.%02lu\n", + c->loops_per_jiffy/(500000/HZ), +- (c->loops_per_jiffy/(5000/HZ)) % 100); ++ (c->loops_per_jiffy * 10/(50000/HZ)) % 100); + seq_printf(m, "clflush size\t: %u\n\n", c->x86_clflush_size); + + return 0; +Index: linux-2.6.22-ck1/arch/i386/kernel/smpboot.c +=================================================================== +--- linux-2.6.22-ck1.orig/arch/i386/kernel/smpboot.c 2007-07-10 14:54:59.000000000 +1000 ++++ linux-2.6.22-ck1/arch/i386/kernel/smpboot.c 2007-07-10 14:55:24.000000000 +1000 +@@ -1094,7 +1094,7 @@ + "Total of %d processors activated (%lu.%02lu BogoMIPS).\n", + cpucount+1, + bogosum/(500000/HZ), +- (bogosum/(5000/HZ))%100); ++ (bogosum * 10/(50000/HZ))%100); + + Dprintk("Before bogocount - setting activated=1.\n"); + +Index: linux-2.6.22-ck1/include/linux/nfsd/stats.h +=================================================================== +--- 
linux-2.6.22-ck1.orig/include/linux/nfsd/stats.h 2007-07-10 14:54:59.000000000 +1000 ++++ linux-2.6.22-ck1/include/linux/nfsd/stats.h 2007-07-10 14:55:24.000000000 +1000 +@@ -35,8 +35,8 @@ + + }; + +-/* thread usage wraps very million seconds (approx one fortnight) */ +-#define NFSD_USAGE_WRAP (HZ*1000000) ++/* thread usage wraps every one hundred thousand seconds (approx one day) */ ++#define NFSD_USAGE_WRAP (HZ*100000) + + #ifdef __KERNEL__ + +Index: linux-2.6.22-ck1/arch/x86_64/kernel/setup.c +=================================================================== +--- linux-2.6.22-ck1.orig/arch/x86_64/kernel/setup.c 2007-07-10 14:54:59.000000000 +1000 ++++ linux-2.6.22-ck1/arch/x86_64/kernel/setup.c 2007-07-10 14:55:24.000000000 +1000 +@@ -1047,7 +1047,7 @@ + + seq_printf(m, "\nbogomips\t: %lu.%02lu\n", + c->loops_per_jiffy/(500000/HZ), +- (c->loops_per_jiffy/(5000/HZ)) % 100); ++ (c->loops_per_jiffy * 10/(50000/HZ)) % 100); + + if (c->x86_tlbsize > 0) + seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize); +Index: linux-2.6.22-ck1/Makefile +=================================================================== +--- linux-2.6.22-ck1.orig/Makefile 2007-07-10 14:54:59.000000000 +1000 ++++ linux-2.6.22-ck1/Makefile 2007-07-10 14:55:24.000000000 +1000 +@@ -1,8 +1,9 @@ + VERSION = 2 + PATCHLEVEL = 6 + SUBLEVEL = 22 +-EXTRAVERSION = +-NAME = Holy Dancing Manatees, Batman! ++EXTRAVERSION = -ck1 ++NAME = So long, and thanks for all the fish ++JANAME = さようなら、いままで魚をありがとう + + # *DOCUMENTATION* + # To see a list of typical targets execute "make help"