From e20c559e5f8d6d1d5bc6a15204fb1966950946cf Mon Sep 17 00:00:00 2001 From: Wouter den Breejen Date: Sat, 17 Nov 2007 16:29:15 +0000 Subject: [PATCH] I have added a special kernel 2.6.21 with the Con Kolivas patches that are designed to improve system responsiveness with specific emphasis on the desktop, but suitable to any workload. (and also the other patches like ext3cow and paravirt-nvidia) svn path=/nixpkgs/trunk/; revision=9748 --- .../linux/kernel/linux-2.6.21_ck.nix | 84 + .../os-specific/linux/kernel/patch-2.6.21-ck1 | 5040 ++++++++++++++++ .../os-specific/linux/kernel/patch-2.6.22-ck1 | 5167 +++++++++++++++++ 3 files changed, 10291 insertions(+) create mode 100644 pkgs/os-specific/linux/kernel/linux-2.6.21_ck.nix create mode 100644 pkgs/os-specific/linux/kernel/patch-2.6.21-ck1 create mode 100644 pkgs/os-specific/linux/kernel/patch-2.6.22-ck1 diff --git a/pkgs/os-specific/linux/kernel/linux-2.6.21_ck.nix b/pkgs/os-specific/linux/kernel/linux-2.6.21_ck.nix new file mode 100644 index 00000000000..55748d3e9f5 --- /dev/null +++ b/pkgs/os-specific/linux/kernel/linux-2.6.21_ck.nix @@ -0,0 +1,84 @@ +{ stdenv, fetchurl, perl, mktemp, module_init_tools + + # A list of patches to apply to the kernel. Each element of this list + # should be an attribute set {name, patch} where `name' is a + # symbolic name and `patch' is the actual patch. The patch may + # optionally be compressed with gzip or bzip2. +, kernelPatches ? [] + +, # Whether to build a User-Mode Linux kernel. + userModeLinux ? false + +, # Allows you to set your own kernel version suffix (e.g., + # "-my-kernel"). + localVersion ? "" + +, # Your own kernel configuration file, if you don't want to use the + # default. + kernelConfig ? null + +, # A list of additional statements to be appended to the + # configuration file. + extraConfig ? [] +}: + +assert stdenv.system == "i686-linux" || stdenv.system == "x86_64-linux"; + +let + + lib = import ../../../lib; + + version = "2.6.21"; + +in + +stdenv.mkDerivation { + name = if userModeLinux then "user-mode-linux-${version}" else "linux-${version}"; + builder = ./builder.sh; + + src = fetchurl { + url = "http://www.kernel.org/pub/linux/kernel/v2.6/linux-2.6.21.tar.bz2"; + sha256 = "f187b12d70e0a48ce81f0472dfe9504fb5f0f966be339ac9d57dd2b991a74942"; + }; + + patches = map (p: p.patch) kernelPatches; + extraConfig = + let addNewlines = map (s: "\n" + s + "\n"); + configFromPatches = + map (p: if p ? extraConfig then p.extraConfig else "") kernelPatches; + in lib.concatStrings (addNewlines (configFromPatches ++ extraConfig)); + + config = + if kernelConfig != null then kernelConfig else + if userModeLinux then ./config-2.6.21-uml else + if stdenv.system == "i686-linux" then ./config-2.6.21-i686-smp else + if stdenv.system == "x86_64-linux" then ./config-2.6.21-x86_64-smp else + abort "No kernel configuration for your platform!"; + + buildInputs = [perl mktemp]; + + arch = + if userModeLinux then "um" else + if stdenv.system == "i686-linux" then "i386" else + if stdenv.system == "x86_64-linux" then "x86_64" else + abort "Platform ${stdenv.system} is not supported."; + + makeFlags = if userModeLinux then "ARCH=um SHELL=bash" else ""; + + inherit module_init_tools; + + allowLocalVersion = false; # don't allow patches to set a suffix + inherit localVersion; # but do allow the user to set one. 
+ + meta = { + description = + (if userModeLinux then + "User-Mode Linux" + else + "The Linux kernel") + + (if kernelPatches == [] then "" else + " (with patches: " + + lib.concatStrings (lib.intersperse ", " (map (x: x.name) kernelPatches)) + + ")"); + }; +} diff --git a/pkgs/os-specific/linux/kernel/patch-2.6.21-ck1 b/pkgs/os-specific/linux/kernel/patch-2.6.21-ck1 new file mode 100644 index 00000000000..0bf63f5aca3 --- /dev/null +++ b/pkgs/os-specific/linux/kernel/patch-2.6.21-ck1 @@ -0,0 +1,5040 @@ +Index: linux-2.6.21-ck1/Makefile +=================================================================== +--- linux-2.6.21-ck1.orig/Makefile 2007-05-04 12:10:52.000000000 +1000 ++++ linux-2.6.21-ck1/Makefile 2007-05-04 12:21:37.000000000 +1000 +@@ -1,7 +1,7 @@ + VERSION = 2 + PATCHLEVEL = 6 + SUBLEVEL = 21 +-EXTRAVERSION = ++EXTRAVERSION = -ck1 + NAME = Nocturnal Monster Puppy + + # *DOCUMENTATION* +Index: linux-2.6.21-ck1/kernel/workqueue.c +=================================================================== +--- linux-2.6.21-ck1.orig/kernel/workqueue.c 2007-05-04 12:10:52.000000000 +1000 ++++ linux-2.6.21-ck1/kernel/workqueue.c 2007-05-04 12:10:54.000000000 +1000 +@@ -355,8 +355,6 @@ static int worker_thread(void *__cwq) + if (!cwq->freezeable) + current->flags |= PF_NOFREEZE; + +- set_user_nice(current, -5); +- + /* Block and flush all signals */ + sigfillset(&blocked); + sigprocmask(SIG_BLOCK, &blocked, NULL); +Index: linux-2.6.21-ck1/fs/proc/array.c +=================================================================== +--- linux-2.6.21-ck1.orig/fs/proc/array.c 2007-05-04 12:10:52.000000000 +1000 ++++ linux-2.6.21-ck1/fs/proc/array.c 2007-05-04 12:10:54.000000000 +1000 +@@ -165,7 +165,6 @@ static inline char * task_state(struct t + rcu_read_lock(); + buffer += sprintf(buffer, + "State:\t%s\n" +- "SleepAVG:\t%lu%%\n" + "Tgid:\t%d\n" + "Pid:\t%d\n" + "PPid:\t%d\n" +@@ -173,7 +172,6 @@ static inline char * task_state(struct t + "Uid:\t%d\t%d\t%d\t%d\n" + "Gid:\t%d\t%d\t%d\t%d\n", + get_task_state(p), +- (p->sleep_avg/1024)*100/(1020000000/1024), + p->tgid, p->pid, + pid_alive(p) ? rcu_dereference(p->real_parent)->tgid : 0, + pid_alive(p) && p->ptrace ? 
rcu_dereference(p->parent)->pid : 0, +Index: linux-2.6.21-ck1/include/linux/init_task.h +=================================================================== +--- linux-2.6.21-ck1.orig/include/linux/init_task.h 2007-05-04 12:10:52.000000000 +1000 ++++ linux-2.6.21-ck1/include/linux/init_task.h 2007-05-04 12:24:19.000000000 +1000 +@@ -102,13 +102,15 @@ extern struct group_info init_groups; + .prio = MAX_PRIO-20, \ + .static_prio = MAX_PRIO-20, \ + .normal_prio = MAX_PRIO-20, \ ++ .rotation = 0, \ + .policy = SCHED_NORMAL, \ + .cpus_allowed = CPU_MASK_ALL, \ + .mm = NULL, \ + .active_mm = &init_mm, \ + .run_list = LIST_HEAD_INIT(tsk.run_list), \ + .ioprio = 0, \ +- .time_slice = HZ, \ ++ .time_slice = 1000000000, \ ++ .quota = 1000000000, \ + .tasks = LIST_HEAD_INIT(tsk.tasks), \ + .ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children), \ + .ptrace_list = LIST_HEAD_INIT(tsk.ptrace_list), \ +@@ -135,6 +137,7 @@ extern struct group_info init_groups; + .signal = {{0}}}, \ + .blocked = {{0}}, \ + .alloc_lock = __SPIN_LOCK_UNLOCKED(tsk.alloc_lock), \ ++ .mutexes_held = 0, \ + .journal_info = NULL, \ + .cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \ + .fs_excl = ATOMIC_INIT(0), \ +Index: linux-2.6.21-ck1/include/linux/sched.h +=================================================================== +--- linux-2.6.21-ck1.orig/include/linux/sched.h 2007-05-04 12:10:52.000000000 +1000 ++++ linux-2.6.21-ck1/include/linux/sched.h 2007-05-04 12:24:19.000000000 +1000 +@@ -34,9 +34,14 @@ + #define SCHED_FIFO 1 + #define SCHED_RR 2 + #define SCHED_BATCH 3 ++#define SCHED_ISO 4 ++#define SCHED_IDLEPRIO 5 + + #ifdef __KERNEL__ + ++#define SCHED_MAX SCHED_IDLEPRIO ++#define SCHED_RANGE(policy) ((policy) <= SCHED_MAX) ++ + struct sched_param { + int sched_priority; + }; +@@ -149,8 +154,7 @@ extern unsigned long weighted_cpuload(co + #define EXIT_ZOMBIE 16 + #define EXIT_DEAD 32 + /* in tsk->state again */ +-#define TASK_NONINTERACTIVE 64 +-#define TASK_DEAD 128 ++#define TASK_DEAD 64 + + #define __set_task_state(tsk, state_value) \ + do { (tsk)->state = (state_value); } while (0) +@@ -522,14 +526,19 @@ struct signal_struct { + + #define MAX_USER_RT_PRIO 100 + #define MAX_RT_PRIO MAX_USER_RT_PRIO ++#define PRIO_RANGE (40) ++#define ISO_PRIO (MAX_RT_PRIO - 1) + +-#define MAX_PRIO (MAX_RT_PRIO + 40) ++#define MAX_PRIO (MAX_RT_PRIO + PRIO_RANGE) + +-#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO) ++#define rt_prio(prio) unlikely((prio) < ISO_PRIO) + #define rt_task(p) rt_prio((p)->prio) + #define batch_task(p) (unlikely((p)->policy == SCHED_BATCH)) +-#define is_rt_policy(p) ((p) != SCHED_NORMAL && (p) != SCHED_BATCH) ++#define is_rt_policy(policy) ((policy) == SCHED_FIFO || \ ++ (policy) == SCHED_RR) + #define has_rt_policy(p) unlikely(is_rt_policy((p)->policy)) ++#define iso_task(p) unlikely((p)->policy == SCHED_ISO) ++#define idleprio_task(p) unlikely((p)->policy == SCHED_IDLEPRIO) + + /* + * Some day this will be a full-fledged user tracking system.. +@@ -740,6 +749,22 @@ extern unsigned int max_cache_size; + + #endif /* CONFIG_SMP */ + ++/* ++ * A runqueue laden with a single nice 0 task scores a weighted_cpuload of ++ * SCHED_LOAD_SCALE. 
This function returns 1 if any cpu is laden with a ++ * task of nice 0 or enough lower priority tasks to bring up the ++ * weighted_cpuload ++ */ ++static inline int above_background_load(void) ++{ ++ unsigned long cpu; ++ ++ for_each_online_cpu(cpu) { ++ if (weighted_cpuload(cpu) >= SCHED_LOAD_SCALE) ++ return 1; ++ } ++ return 0; ++} + + struct io_context; /* See blkdev.h */ + struct cpuset; +@@ -788,13 +813,6 @@ struct mempolicy; + struct pipe_inode_info; + struct uts_namespace; + +-enum sleep_type { +- SLEEP_NORMAL, +- SLEEP_NONINTERACTIVE, +- SLEEP_INTERACTIVE, +- SLEEP_INTERRUPTED, +-}; +- + struct prio_array; + + struct task_struct { +@@ -814,20 +832,33 @@ struct task_struct { + int load_weight; /* for niceness load balancing purposes */ + int prio, static_prio, normal_prio; + struct list_head run_list; ++ /* ++ * This bitmap shows what priorities this task has received quota ++ * from for this major priority rotation on its current runqueue. ++ */ ++ DECLARE_BITMAP(bitmap, PRIO_RANGE + 1); + struct prio_array *array; ++ /* Which major runqueue rotation did this task run */ ++ unsigned long rotation; + + unsigned short ioprio; + #ifdef CONFIG_BLK_DEV_IO_TRACE + unsigned int btrace_seq; + #endif +- unsigned long sleep_avg; + unsigned long long timestamp, last_ran; + unsigned long long sched_time; /* sched_clock time spent running */ +- enum sleep_type sleep_type; + + unsigned long policy; + cpumask_t cpus_allowed; +- unsigned int time_slice, first_time_slice; ++ /* ++ * How much this task is entitled to run at the current priority ++ * before being requeued at a lower priority. ++ */ ++ int time_slice; ++ /* Is this the very first time_slice this task has ever run. */ ++ unsigned int first_time_slice; ++ /* How much this task receives at each priority level */ ++ int quota; + + #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) + struct sched_info sched_info; +@@ -992,6 +1023,7 @@ struct task_struct { + struct held_lock held_locks[MAX_LOCK_DEPTH]; + unsigned int lockdep_recursion; + #endif ++ unsigned long mutexes_held; + + /* journalling filesystem info */ + void *journal_info; +@@ -1156,8 +1188,10 @@ static inline void put_task_struct(struc + #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ + #define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */ + #define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */ ++#define PF_ISOREF 0x04000000 /* SCHED_ISO task has used up quota */ + #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ + #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ ++#define PF_NONSLEEP 0x40000000 /* Waiting on in-kernel activity */ + + /* + * Only the _current_ task can read/write to tsk->flags, but other +Index: linux-2.6.21-ck1/kernel/sched.c +=================================================================== +--- linux-2.6.21-ck1.orig/kernel/sched.c 2007-05-04 12:10:52.000000000 +1000 ++++ linux-2.6.21-ck1/kernel/sched.c 2007-05-04 12:24:22.000000000 +1000 +@@ -16,6 +16,7 @@ + * by Davide Libenzi, preemptible kernel bits by Robert Love. + * 2003-09-03 Interactivity tuning by Con Kolivas. 
+ * 2004-04-02 Scheduler domains code by Nick Piggin ++ * 2007-03-02 Staircase deadline scheduling policy by Con Kolivas + */ + + #include +@@ -52,6 +53,7 @@ + #include + #include + #include ++#include + #include + + #include +@@ -83,126 +85,85 @@ unsigned long long __attribute__((weak)) + #define USER_PRIO(p) ((p)-MAX_RT_PRIO) + #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) + #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) ++#define SCHED_PRIO(p) ((p)+MAX_RT_PRIO) + +-/* +- * Some helpers for converting nanosecond timing to jiffy resolution +- */ +-#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) ++/* Some helpers for converting to/from various scales.*/ + #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) +- +-/* +- * These are the 'tuning knobs' of the scheduler: +- * +- * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger), +- * default timeslice is 100 msecs, maximum timeslice is 800 msecs. +- * Timeslices get refilled after they expire. +- */ +-#define MIN_TIMESLICE max(5 * HZ / 1000, 1) +-#define DEF_TIMESLICE (100 * HZ / 1000) +-#define ON_RUNQUEUE_WEIGHT 30 +-#define CHILD_PENALTY 95 +-#define PARENT_PENALTY 100 +-#define EXIT_WEIGHT 3 +-#define PRIO_BONUS_RATIO 25 +-#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100) +-#define INTERACTIVE_DELTA 2 +-#define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS) +-#define STARVATION_LIMIT (MAX_SLEEP_AVG) +-#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) +- +-/* +- * If a task is 'interactive' then we reinsert it in the active +- * array after it has expired its current timeslice. (it will not +- * continue to run immediately, it will still roundrobin with +- * other interactive tasks.) +- * +- * This part scales the interactivity limit depending on niceness. +- * +- * We scale it linearly, offset by the INTERACTIVE_DELTA delta. +- * Here are a few examples of different nice levels: +- * +- * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0] +- * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0] +- * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0] +- * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0] +- * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0] +- * +- * (the X axis represents the possible -5 ... 0 ... +5 dynamic +- * priority range a task can explore, a value of '1' means the +- * task is rated interactive.) +- * +- * Ie. nice +19 tasks can never get 'interactive' enough to be +- * reinserted into the active array. And only heavily CPU-hog nice -20 +- * tasks will be expired. Default nice 0 tasks are somewhere between, +- * it takes some effort for them to get interactive, but it's not +- * too hard. +- */ +- +-#define CURRENT_BONUS(p) \ +- (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \ +- MAX_SLEEP_AVG) +- +-#define GRANULARITY (10 * HZ / 1000 ? : 1) +- +-#ifdef CONFIG_SMP +-#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ +- (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \ +- num_online_cpus()) +-#else +-#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ +- (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? 
: 1) - 1))) +-#endif +- +-#define SCALE(v1,v1_max,v2_max) \ +- (v1) * (v2_max) / (v1_max) +- +-#define DELTA(p) \ +- (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \ +- INTERACTIVE_DELTA) +- +-#define TASK_INTERACTIVE(p) \ +- ((p)->prio <= (p)->static_prio - DELTA(p)) +- +-#define INTERACTIVE_SLEEP(p) \ +- (JIFFIES_TO_NS(MAX_SLEEP_AVG * \ +- (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) +- +-#define TASK_PREEMPTS_CURR(p, rq) \ +- ((p)->prio < (rq)->curr->prio) +- +-#define SCALE_PRIO(x, prio) \ +- max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) +- +-static unsigned int static_prio_timeslice(int static_prio) +-{ +- if (static_prio < NICE_TO_PRIO(0)) +- return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio); +- else +- return SCALE_PRIO(DEF_TIMESLICE, static_prio); +-} +- +-/* +- * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] +- * to time slice values: [800ms ... 100ms ... 5ms] +- * +- * The higher a thread's priority, the bigger timeslices +- * it gets during one round of execution. But even the lowest +- * priority thread gets MIN_TIMESLICE worth of execution time. ++#define MS_TO_NS(TIME) ((TIME) * 1000000) ++#define MS_TO_US(TIME) ((TIME) * 1000) ++#define US_TO_MS(TIME) ((TIME) / 1000) ++ ++#define TASK_PREEMPTS_CURR(p, curr) ((p)->prio < (curr)->prio) ++ ++/* ++ * This is the time all tasks within the same priority round robin. ++ * Value is in ms and set to a minimum of 8ms. Scales with number of cpus. ++ * Tunable via /proc interface. ++ */ ++int rr_interval __read_mostly = 6; ++int sched_interactive __read_mostly = 1; ++ ++/* ++ * sched_iso_cpu - sysctl which determines the cpu percentage SCHED_ISO tasks ++ * are allowed to run (over ISO_PERIOD seconds) as real time tasks. ++ * sched_iso_period - sysctl which determines the number of seconds over ++ * which cpu usage of SCHED_ISO tasks is averaged to determine if they are ++ * exceeding their allowable bandwidth. ++*/ ++int sched_iso_cpu __read_mostly = 80; ++int sched_iso_period __read_mostly = 5; ++ ++#define ISO_PERIOD ((sched_iso_period * HZ) + 1) ++ ++/* ++ * This contains a bitmap for each dynamic priority level with empty slots ++ * for the valid priorities each different nice level can have. It allows ++ * us to stagger the slots where differing priorities run in a way that ++ * keeps latency differences between different nice levels at a minimum. ++ * The purpose of a pre-generated matrix is for rapid lookup of next slot in ++ * O(1) time without having to recalculate every time priority gets demoted. ++ * All nice levels use priority slot 39 as this allows less niced tasks to ++ * get all priority slots better than that before expiration is forced. 
++ * ie, where 0 means a slot for that priority, priority running from left to ++ * right is from prio 0 to prio 39: ++ * nice -20 0000000000000000000000000000000000000000 ++ * nice -10 1000100010001000100010001000100010010000 ++ * nice 0 1010101010101010101010101010101010101010 ++ * nice 5 1011010110110101101101011011010110110110 ++ * nice 10 1110111011101110111011101110111011101110 ++ * nice 15 1111111011111110111111101111111011111110 ++ * nice 19 1111111111111111111111111111111111111110 + */ ++static unsigned long prio_matrix[PRIO_RANGE][BITS_TO_LONGS(PRIO_RANGE)] ++ __read_mostly; + +-static inline unsigned int task_timeslice(struct task_struct *p) +-{ +- return static_prio_timeslice(p->static_prio); +-} ++struct rq; + + /* + * These are the runqueue data structures: + */ +- + struct prio_array { +- unsigned int nr_active; +- DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */ +- struct list_head queue[MAX_PRIO]; ++ /* Tasks queued at each priority */ ++ struct list_head queue[MAX_PRIO + 1]; ++ ++ /* ++ * The bitmap of priorities queued for this array. While the expired ++ * array will never have realtime tasks on it, it is simpler to have ++ * equal sized bitmaps for a cheap array swap. Include 1 bit for ++ * delimiter. ++ */ ++ DECLARE_BITMAP(prio_bitmap, MAX_PRIO + 1); ++ ++ /* ++ * The best static priority (of the dynamic priority tasks) queued ++ * this array. ++ */ ++ int best_static_prio; ++ ++#ifdef CONFIG_SMP ++ /* For convenience looks back at rq */ ++ struct rq *rq; ++#endif + }; + + /* +@@ -234,14 +195,28 @@ struct rq { + */ + unsigned long nr_uninterruptible; + +- unsigned long expired_timestamp; + /* Cached timestamp set by update_cpu_clock() */ + unsigned long long most_recent_timestamp; + struct task_struct *curr, *idle; + unsigned long next_balance; + struct mm_struct *prev_mm; +- struct prio_array *active, *expired, arrays[2]; +- int best_expired_prio; ++ ++ struct prio_array *active, *expired, *idleprio, arrays[2]; ++ unsigned long *dyn_bitmap, *exp_bitmap; ++ ++ /* ++ * The current dynamic priority level this runqueue is at per static ++ * priority level. ++ */ ++ int prio_level[PRIO_RANGE]; ++ ++ /* How many times we have rotated the priority queue */ ++ unsigned long prio_rotation; ++ unsigned long iso_ticks; ++ unsigned short iso_refractory; ++ ++ /* Number of idleprio tasks running */ ++ unsigned long nr_idleprio; + atomic_t nr_iowait; + + #ifdef CONFIG_SMP +@@ -579,12 +554,9 @@ static inline struct rq *this_rq_lock(vo + #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) + /* + * Called when a process is dequeued from the active array and given +- * the cpu. We should note that with the exception of interactive +- * tasks, the expired queue will become the active queue after the active +- * queue is empty, without explicitly dequeuing and requeuing tasks in the +- * expired queue. (Interactive tasks may be requeued directly to the +- * active queue, thus delaying tasks in the expired queue from running; +- * see scheduler_tick()). ++ * the cpu. We should note that the expired queue will become the active ++ * queue after the active queue is empty, without explicitly dequeuing and ++ * requeuing tasks in the expired queue. + * + * This function is only called from sched_info_arrive(), rather than + * dequeue_task(). 
Even though a task may be queued and dequeued multiple +@@ -682,71 +654,304 @@ sched_info_switch(struct task_struct *pr + #define sched_info_switch(t, next) do { } while (0) + #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ + ++static int idleprio_suitable(struct task_struct *p) ++{ ++ return (!p->mutexes_held && !freezing(p) && !signal_pending(p) && ++ !(p->flags & (PF_NONSLEEP | PF_EXITING))); ++} ++ ++static int idleprio(const struct task_struct *p) ++{ ++ return (p->prio == MAX_PRIO); ++} ++ ++static inline int task_queued(struct task_struct *task) ++{ ++ return !list_empty(&task->run_list); ++} ++ ++static inline void set_dynamic_bit(struct task_struct *p, struct rq *rq) ++{ ++ __set_bit(p->prio, p->array->prio_bitmap); ++} ++ + /* +- * Adding/removing a task to/from a priority array: ++ * Removing from a runqueue. + */ +-static void dequeue_task(struct task_struct *p, struct prio_array *array) ++static void dequeue_task(struct task_struct *p, struct rq *rq) + { +- array->nr_active--; +- list_del(&p->run_list); +- if (list_empty(array->queue + p->prio)) +- __clear_bit(p->prio, array->bitmap); ++ list_del_init(&p->run_list); ++ if (idleprio_task(p) && idleprio(p)) ++ rq->nr_idleprio--; ++ else if (list_empty(p->array->queue + p->prio)) ++ __clear_bit(p->prio, p->array->prio_bitmap); + } + +-static void enqueue_task(struct task_struct *p, struct prio_array *array) ++static void reset_first_time_slice(struct task_struct *p) + { +- sched_info_queued(p); +- list_add_tail(&p->run_list, array->queue + p->prio); +- __set_bit(p->prio, array->bitmap); +- array->nr_active++; ++ if (unlikely(p->first_time_slice)) ++ p->first_time_slice = 0; ++} ++ ++/* ++ * The task is being queued on a fresh array so it has its entitlement ++ * bitmap cleared. ++ */ ++static void task_new_array(struct task_struct *p, struct rq *rq, ++ struct prio_array *array) ++{ ++ bitmap_zero(p->bitmap, PRIO_RANGE); ++ p->rotation = rq->prio_rotation; ++ p->time_slice = p->quota; + p->array = array; ++ reset_first_time_slice(p); ++} ++ ++/* Find the first slot from the relevant prio_matrix entry */ ++static int first_prio_slot(struct task_struct *p) ++{ ++ if (unlikely(p->policy == SCHED_BATCH)) ++ return p->static_prio; ++ return SCHED_PRIO(find_first_zero_bit( ++ prio_matrix[USER_PRIO(p->static_prio)], PRIO_RANGE)); + } + + /* +- * Put task to the end of the run list without the overhead of dequeue +- * followed by enqueue. ++ * In sched_interactive mode priority allocation occurs per process per rq ++ * array swap. In !sched_interactive mode all waking tasks must obey the ++ * current prio level of all other tasks running per array swap. + */ +-static void requeue_task(struct task_struct *p, struct prio_array *array) ++static int minprio(struct rq *rq, int uprio) + { +- list_move_tail(&p->run_list, array->queue + p->prio); ++ if (sched_interactive) ++ return MAX_RT_PRIO; ++ return rq->prio_level[uprio]; + } + +-static inline void +-enqueue_task_head(struct task_struct *p, struct prio_array *array) ++/* ++ * Find the first unused slot by this task that is also in its prio_matrix ++ * level. SCHED_BATCH tasks do not use the priority matrix. They only take ++ * priority slots from their static_prio and above. 
++ */ ++static int next_entitled_slot(struct task_struct *p, struct rq *rq) + { +- list_add(&p->run_list, array->queue + p->prio); +- __set_bit(p->prio, array->bitmap); +- array->nr_active++; +- p->array = array; ++ int search_prio = MAX_RT_PRIO, uprio = USER_PRIO(p->static_prio); ++ struct prio_array *array = rq->active; ++ DECLARE_BITMAP(tmp, PRIO_RANGE); ++ ++ /* ++ * Go straight to expiration if there are higher priority tasks ++ * already expired. ++ */ ++ if (p->static_prio > rq->expired->best_static_prio) ++ return MAX_PRIO; ++ if (!rq->prio_level[uprio]) ++ rq->prio_level[uprio] = MAX_RT_PRIO; ++ /* ++ * Only priorities equal to the prio_level and above for their ++ * static_prio are acceptable, and only if it's not better than ++ * a queued better static_prio's prio_level. ++ */ ++ if (p->static_prio < array->best_static_prio) { ++ if (likely(p->policy != SCHED_BATCH)) ++ array->best_static_prio = p->static_prio; ++ } else if (p->static_prio == array->best_static_prio) { ++ search_prio = minprio(rq, uprio); ++ } else { ++ int i; ++ ++ search_prio = minprio(rq, uprio); ++ /* A bound O(n) function, worst case n is 40 */ ++ for (i = array->best_static_prio; i <= p->static_prio ; i++) { ++ if (!rq->prio_level[USER_PRIO(i)]) ++ rq->prio_level[USER_PRIO(i)] = MAX_RT_PRIO; ++ search_prio = max(search_prio, ++ rq->prio_level[USER_PRIO(i)]); ++ } ++ } ++ if (unlikely(p->policy == SCHED_BATCH)) { ++ search_prio = max(search_prio, p->static_prio); ++ return SCHED_PRIO(find_next_zero_bit(p->bitmap, PRIO_RANGE, ++ USER_PRIO(search_prio))); ++ } ++ bitmap_or(tmp, p->bitmap, prio_matrix[uprio], PRIO_RANGE); ++ return SCHED_PRIO(find_next_zero_bit(tmp, PRIO_RANGE, ++ USER_PRIO(search_prio))); ++} ++ ++static void queue_expired(struct task_struct *p, struct rq *rq) ++{ ++ task_new_array(p, rq, rq->expired); ++ p->prio = p->normal_prio = first_prio_slot(p); ++ if (p->static_prio < rq->expired->best_static_prio) ++ rq->expired->best_static_prio = p->static_prio; ++ reset_first_time_slice(p); + } + ++#ifdef CONFIG_SMP + /* +- * __normal_prio - return the priority that is based on the static +- * priority but is modified by bonuses/penalties. +- * +- * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] +- * into the -5 ... 0 ... +5 bonus/penalty range. +- * +- * We use 25% of the full 0...39 priority range so that: +- * +- * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. +- * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. +- * +- * Both properties are important to certain workloads. ++ * If we're waking up a task that was previously on a different runqueue, ++ * update its data appropriately. Note we may be reading data from src_rq-> ++ * outside of lock, but the occasional inaccurate result should be harmless. + */ ++ static void update_if_moved(struct task_struct *p, struct rq *rq) ++{ ++ struct rq *src_rq = p->array->rq; + +-static inline int __normal_prio(struct task_struct *p) ++ if (src_rq == rq) ++ return; ++ /* ++ * Only need to set p->array when p->rotation == rq->prio_rotation as ++ * they will be set in recalc_task_prio when != rq->prio_rotation. 
++ */ ++ if (p->rotation == src_rq->prio_rotation) { ++ p->rotation = rq->prio_rotation; ++ if (p->array == src_rq->expired) ++ p->array = rq->expired; ++ else ++ p->array = rq->active; ++ } else ++ p->rotation = 0; ++} ++#else ++static inline void update_if_moved(struct task_struct *p, struct rq *rq) ++{ ++} ++#endif ++ ++static inline int isoprio_suitable(struct task_struct *p) + { +- int bonus, prio; ++ return !(p->flags & PF_ISOREF); ++} + +- bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; ++static int task_timeslice(struct task_struct *p); + +- prio = p->static_prio - bonus; +- if (prio < MAX_RT_PRIO) +- prio = MAX_RT_PRIO; +- if (prio > MAX_PRIO-1) +- prio = MAX_PRIO-1; +- return prio; ++/* ++ * recalc_task_prio determines what priority a non rt_task will be ++ * queued at. If the task has already been running during this runqueue's ++ * major rotation (rq->prio_rotation) then it continues at the same ++ * priority if it has tick entitlement left. If it does not have entitlement ++ * left, it finds the next priority slot according to its nice value that it ++ * has not extracted quota from. If it has not run during this major ++ * rotation, it starts at the next_entitled_slot and has its bitmap quota ++ * cleared. If it does not have any slots left it has all its slots reset and ++ * is queued on the expired at its first_prio_slot. ++ */ ++static void recalc_task_prio(struct task_struct *p, struct rq *rq) ++{ ++ struct prio_array *array = rq->active; ++ int queue_prio; ++ ++ if (iso_task(p)) { ++ if (isoprio_suitable(p)) { ++ /* ++ * If SCHED_ISO tasks have not used up their real time ++ * quota they have run just better than highest ++ * SCHED_NORMAL priority. Otherwise they run as ++ * SCHED_NORMAL. ++ */ ++ p->prio = p->normal_prio = ISO_PRIO; ++ p->array = rq->active; ++ if (p->time_slice <= 0) ++ p->time_slice = p->quota; ++ return; ++ } else if (p->prio == ISO_PRIO) { ++ /* Just about to be demoted to SCHED_NORMAL */ ++ p->time_slice = 0; ++ } ++ } else if (idleprio_task(p)) { ++ if (idleprio_suitable(p)) { ++ /* ++ * If suitable idleprio_tasks are queued at MAX_PRIO ++ * only on the idleprio array. Their time_slice is ++ * their full task_timeslice as they cooperatively ++ * multitask. ++ */ ++ p->prio = p->normal_prio = MAX_PRIO; ++ p->array = rq->idleprio; ++ if (p->time_slice <= 0) ++ p->time_slice = task_timeslice(p); ++ return; ++ } ++ /* ++ * If unsuitable idleprio_tasks are queued equivalent to ++ * nice 19 tasks on the expired array. ++ */ ++ p->flags &= ~PF_NONSLEEP; ++ p->prio = p->normal_prio = MAX_PRIO - 1; ++ p->array = rq->expired; ++ if (p->time_slice <= 0 || p->time_slice > p->quota) ++ p->time_slice = p->quota; ++ return; ++ } ++ ++ update_if_moved(p, rq); ++ if (p->rotation == rq->prio_rotation) { ++ if (p->array == array) { ++ if (p->time_slice > 0) ++ return; ++ p->time_slice = p->quota; ++ } else if (p->array == rq->expired) { ++ queue_expired(p, rq); ++ return; ++ } else ++ task_new_array(p, rq, array); ++ } else ++ task_new_array(p, rq, array); ++ ++ queue_prio = next_entitled_slot(p, rq); ++ if (queue_prio >= MAX_PRIO) { ++ queue_expired(p, rq); ++ return; ++ } ++ p->prio = p->normal_prio = queue_prio; ++ __set_bit(USER_PRIO(p->prio), p->bitmap); ++} ++ ++/* ++ * Adding to a runqueue. The dynamic priority queue that it is added to is ++ * determined by recalc_task_prio() above. 
++ */ ++static inline void __enqueue_task(struct task_struct *p, struct rq *rq) ++{ ++ if (rt_task(p)) ++ p->array = rq->active; ++ else ++ recalc_task_prio(p, rq); ++ ++ if (idleprio_task(p) && idleprio(p)) ++ rq->nr_idleprio++; ++ sched_info_queued(p); ++ set_dynamic_bit(p, rq); ++} ++ ++static void enqueue_task(struct task_struct *p, struct rq *rq) ++{ ++ __enqueue_task(p, rq); ++ list_add_tail(&p->run_list, p->array->queue + p->prio); ++} ++ ++static inline void enqueue_task_head(struct task_struct *p, struct rq *rq) ++{ ++ __enqueue_task(p, rq); ++ list_add(&p->run_list, p->array->queue + p->prio); ++} ++ ++/* ++ * requeue_task is only called when p->static_prio does not change. p->prio ++ * can change with dynamic tasks. ++ */ ++static void requeue_task(struct task_struct *p, struct rq *rq, ++ struct prio_array *old_array, int old_prio) ++{ ++ if (p->array == rq->expired) ++ queue_expired(p, rq); ++ list_move_tail(&p->run_list, p->array->queue + p->prio); ++ if (!rt_task(p)) { ++ if (list_empty(old_array->queue + old_prio)) ++ __clear_bit(old_prio, old_array->prio_bitmap); ++ set_dynamic_bit(p, rq); ++ } + } + + /* +@@ -759,20 +964,29 @@ static inline int __normal_prio(struct t + */ + + /* +- * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE +- * If static_prio_timeslice() is ever changed to break this assumption then +- * this code will need modification +- */ +-#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE +-#define LOAD_WEIGHT(lp) \ +- (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO) +-#define PRIO_TO_LOAD_WEIGHT(prio) \ +- LOAD_WEIGHT(static_prio_timeslice(prio)) +-#define RTPRIO_TO_LOAD_WEIGHT(rp) \ +- (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp)) ++ * task_timeslice - the total duration a task can run during one major ++ * rotation. Returns value in milliseconds as the smallest value can be 1. ++ */ ++static int task_timeslice(struct task_struct *p) ++{ ++ int slice = p->quota; /* quota is in us */ ++ ++ if (!rt_task(p)) ++ slice += (PRIO_RANGE - 1 - TASK_USER_PRIO(p)) * slice; ++ return US_TO_MS(slice); ++} ++ ++/* ++ * The load weight is basically the task_timeslice in ms. Realtime tasks are ++ * special cased to be proportionately larger than nice -20 by their ++ * rt_priority. The weight for rt tasks can only be arbitrary at best. ++ */ ++#define RTPRIO_TO_LOAD_WEIGHT(rp) (rr_interval * 20 * (40 + rp)) + + static void set_load_weight(struct task_struct *p) + { ++ int load_weight; ++ + if (has_rt_policy(p)) { + #ifdef CONFIG_SMP + if (p == task_rq(p)->migration_thread) +@@ -781,12 +995,19 @@ static void set_load_weight(struct task_ + * Giving its load any weight will skew balancing + * adversely. + */ +- p->load_weight = 0; ++ load_weight = 0; + else + #endif +- p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority); ++ load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority); + } else +- p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio); ++ load_weight = task_timeslice(p); ++ /* ++ * idleprio tasks have much lower weight than SCHED_NORMAL tasks but ++ * still need to be weighted to allow balancing to occur. ++ */ ++ if (likely(!idleprio_task(p))) ++ load_weight *= PRIO_RANGE; ++ p->load_weight = load_weight; + } + + static inline void +@@ -814,28 +1035,38 @@ static inline void dec_nr_running(struct + } + + /* +- * Calculate the expected normal priority: i.e. priority +- * without taking RT-inheritance into account. Might be +- * boosted by interactivity modifiers. 
Changes upon fork, +- * setprio syscalls, and whenever the interactivity +- * estimator recalculates. ++ * __activate_task - move a task to the runqueue. + */ +-static inline int normal_prio(struct task_struct *p) ++static inline void __activate_task(struct task_struct *p, struct rq *rq) + { +- int prio; ++ enqueue_task(p, rq); ++ inc_nr_running(p, rq); ++} + ++/* ++ * __activate_idle_task - move idle task to the _front_ of runqueue. ++ */ ++static inline void __activate_idle_task(struct task_struct *p, struct rq *rq) ++{ ++ enqueue_task_head(p, rq); ++ inc_nr_running(p, rq); ++} ++ ++static inline int normal_prio(struct task_struct *p) ++{ + if (has_rt_policy(p)) +- prio = MAX_RT_PRIO-1 - p->rt_priority; ++ return MAX_RT_PRIO-1 - p->rt_priority; ++ /* Other tasks all have normal_prio set in recalc_task_prio */ ++ if (likely(p->prio >= MAX_RT_PRIO && p->prio < MAX_PRIO)) ++ return p->prio; + else +- prio = __normal_prio(p); +- return prio; ++ return p->static_prio; + } + + /* + * Calculate the current priority, i.e. the priority + * taken into account by the scheduler. This value might +- * be boosted by RT tasks, or might be boosted by +- * interactivity modifiers. Will be RT if the task got ++ * be boosted by RT tasks as it will be RT if the task got + * RT-boosted. If not then it returns p->normal_prio. + */ + static int effective_prio(struct task_struct *p) +@@ -852,111 +1083,41 @@ static int effective_prio(struct task_st + } + + /* +- * __activate_task - move a task to the runqueue. ++ * All tasks have quotas based on rr_interval. RT tasks all get rr_interval. ++ * From nice 1 to 19 they are smaller than it only if they are at least one ++ * tick still. Below nice 0 they get progressively larger. ++ * ie nice -6..0 = rr_interval. nice -10 = 2.5 * rr_interval ++ * nice -20 = 10 * rr_interval. nice 1-19 = rr_interval / 2. ++ * Value returned is in microseconds. + */ +-static void __activate_task(struct task_struct *p, struct rq *rq) ++static inline unsigned int rr_quota(struct task_struct *p) + { +- struct prio_array *target = rq->active; ++ int nice = TASK_NICE(p), rr = rr_interval; + +- if (batch_task(p)) +- target = rq->expired; +- enqueue_task(p, target); +- inc_nr_running(p, rq); +-} +- +-/* +- * __activate_idle_task - move idle task to the _front_ of runqueue. +- */ +-static inline void __activate_idle_task(struct task_struct *p, struct rq *rq) +-{ +- enqueue_task_head(p, rq->active); +- inc_nr_running(p, rq); ++ if (!rt_task(p)) { ++ if (nice < -6) { ++ rr *= nice * nice; ++ rr /= 40; ++ } else if (nice > 0) ++ rr = rr / 2 ? : 1; ++ } ++ return MS_TO_US(rr); + } + +-/* +- * Recalculate p->normal_prio and p->prio after having slept, +- * updating the sleep-average too: +- */ +-static int recalc_task_prio(struct task_struct *p, unsigned long long now) ++/* Every time we set the quota we need to set the load weight */ ++static void set_quota(struct task_struct *p) + { +- /* Caller must always ensure 'now >= p->timestamp' */ +- unsigned long sleep_time = now - p->timestamp; +- +- if (batch_task(p)) +- sleep_time = 0; +- +- if (likely(sleep_time > 0)) { +- /* +- * This ceiling is set to the lowest priority that would allow +- * a task to be reinserted into the active array on timeslice +- * completion. +- */ +- unsigned long ceiling = INTERACTIVE_SLEEP(p); +- +- if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) { +- /* +- * Prevents user tasks from achieving best priority +- * with one single large enough sleep. 
+- */ +- p->sleep_avg = ceiling; +- /* +- * Using INTERACTIVE_SLEEP() as a ceiling places a +- * nice(0) task 1ms sleep away from promotion, and +- * gives it 700ms to round-robin with no chance of +- * being demoted. This is more than generous, so +- * mark this sleep as non-interactive to prevent the +- * on-runqueue bonus logic from intervening should +- * this task not receive cpu immediately. +- */ +- p->sleep_type = SLEEP_NONINTERACTIVE; +- } else { +- /* +- * Tasks waking from uninterruptible sleep are +- * limited in their sleep_avg rise as they +- * are likely to be waiting on I/O +- */ +- if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) { +- if (p->sleep_avg >= ceiling) +- sleep_time = 0; +- else if (p->sleep_avg + sleep_time >= +- ceiling) { +- p->sleep_avg = ceiling; +- sleep_time = 0; +- } +- } +- +- /* +- * This code gives a bonus to interactive tasks. +- * +- * The boost works by updating the 'average sleep time' +- * value here, based on ->timestamp. The more time a +- * task spends sleeping, the higher the average gets - +- * and the higher the priority boost gets as well. +- */ +- p->sleep_avg += sleep_time; +- +- } +- if (p->sleep_avg > NS_MAX_SLEEP_AVG) +- p->sleep_avg = NS_MAX_SLEEP_AVG; +- } +- +- return effective_prio(p); ++ p->quota = rr_quota(p); ++ set_load_weight(p); + } + + /* + * activate_task - move a task to the runqueue and do priority recalculation +- * +- * Update all the scheduling statistics stuff. (sleep average +- * calculation, priority modifiers, etc.) + */ + static void activate_task(struct task_struct *p, struct rq *rq, int local) + { +- unsigned long long now; +- +- if (rt_task(p)) +- goto out; ++ unsigned long long now = sched_clock(); + +- now = sched_clock(); + #ifdef CONFIG_SMP + if (!local) { + /* Compensate for drifting sched_clock */ +@@ -977,32 +1138,9 @@ static void activate_task(struct task_st + (now - p->timestamp) >> 20); + } + +- p->prio = recalc_task_prio(p, now); +- +- /* +- * This checks to make sure it's not an uninterruptible task +- * that is now waking up. +- */ +- if (p->sleep_type == SLEEP_NORMAL) { +- /* +- * Tasks which were woken up by interrupts (ie. hw events) +- * are most likely of interactive nature. So we give them +- * the credit of extending their sleep time to the period +- * of time they spend on the runqueue, waiting for execution +- * on a CPU, first time around: +- */ +- if (in_interrupt()) +- p->sleep_type = SLEEP_INTERRUPTED; +- else { +- /* +- * Normal first-time wakeups get a credit too for +- * on-runqueue time, but it will be weighted down: +- */ +- p->sleep_type = SLEEP_INTERACTIVE; +- } +- } ++ set_quota(p); ++ p->prio = effective_prio(p); + p->timestamp = now; +-out: + __activate_task(p, rq); + } + +@@ -1012,8 +1150,7 @@ out: + static void deactivate_task(struct task_struct *p, struct rq *rq) + { + dec_nr_running(p, rq); +- dequeue_task(p, p->array); +- p->array = NULL; ++ dequeue_task(p, rq); + } + + /* +@@ -1095,7 +1232,7 @@ migrate_task(struct task_struct *p, int + * If the task is not on a runqueue (and not running), then + * it is sufficient to simply update the task's cpu field. + */ +- if (!p->array && !task_running(rq, p)) { ++ if (!task_queued(p) && !task_running(rq, p)) { + set_task_cpu(p, dest_cpu); + return 0; + } +@@ -1126,7 +1263,7 @@ void wait_task_inactive(struct task_stru + repeat: + rq = task_rq_lock(p, &flags); + /* Must be off runqueue entirely, not preempted. 
*/ +- if (unlikely(p->array || task_running(rq, p))) { ++ if (unlikely(task_queued(p) || task_running(rq, p))) { + /* If it's preempted, we yield. It could be a while. */ + preempted = !task_running(rq, p); + task_rq_unlock(rq, &flags); +@@ -1391,6 +1528,31 @@ static inline int wake_idle(int cpu, str + } + #endif + ++/* ++ * We need to have a special definition for an idle runqueue when testing ++ * for preemption on CONFIG_HOTPLUG_CPU as the idle task may be scheduled as ++ * a realtime task in sched_idle_next. ++ */ ++#ifdef CONFIG_HOTPLUG_CPU ++#define rq_idle(rq) ((rq)->curr == (rq)->idle && !rt_task((rq)->curr)) ++#else ++#define rq_idle(rq) ((rq)->curr == (rq)->idle) ++#endif ++ ++static inline int task_preempts_curr(struct task_struct *p, struct rq *rq) ++{ ++ struct task_struct *curr = rq->curr; ++ ++ return ((p->array == task_rq(p)->active && ++ TASK_PREEMPTS_CURR(p, curr)) || rq_idle(rq)); ++} ++ ++static inline void try_preempt(struct task_struct *p, struct rq *rq) ++{ ++ if (task_preempts_curr(p, rq)) ++ resched_task(rq->curr); ++} ++ + /*** + * try_to_wake_up - wake up a thread + * @p: the to-be-woken-up thread +@@ -1422,7 +1584,7 @@ static int try_to_wake_up(struct task_st + if (!(old_state & state)) + goto out; + +- if (p->array) ++ if (task_queued(p)) + goto out_running; + + cpu = task_cpu(p); +@@ -1515,7 +1677,7 @@ out_set_cpu: + old_state = p->state; + if (!(old_state & state)) + goto out; +- if (p->array) ++ if (task_queued(p)) + goto out_running; + + this_cpu = smp_processor_id(); +@@ -1524,25 +1686,9 @@ out_set_cpu: + + out_activate: + #endif /* CONFIG_SMP */ +- if (old_state == TASK_UNINTERRUPTIBLE) { ++ if (old_state == TASK_UNINTERRUPTIBLE) + rq->nr_uninterruptible--; +- /* +- * Tasks on involuntary sleep don't earn +- * sleep_avg beyond just interactive state. +- */ +- p->sleep_type = SLEEP_NONINTERACTIVE; +- } else +- +- /* +- * Tasks that have marked their sleep as noninteractive get +- * woken up with their sleep average not weighted in an +- * interactive way. +- */ +- if (old_state & TASK_NONINTERACTIVE) +- p->sleep_type = SLEEP_NONINTERACTIVE; + +- +- activate_task(p, rq, cpu == this_cpu); + /* + * Sync wakeups (i.e. those types of wakeups where the waker + * has indicated that it will leave the CPU in short order) +@@ -1551,15 +1697,22 @@ out_activate: + * the waker guarantees that the freshly woken up task is going + * to be considered on this CPU.) + */ +- if (!sync || cpu != this_cpu) { +- if (TASK_PREEMPTS_CURR(p, rq)) +- resched_task(rq->curr); +- } ++ activate_task(p, rq, cpu == this_cpu); ++ if (!sync || cpu != this_cpu) ++ try_preempt(p, rq); + success = 1; + + out_running: + p->state = TASK_RUNNING; + out: ++ /* ++ * Special case when freezing we need to reschedule idleprio tasks ++ * as SCHED_NORMAL or else they'll never freeze ++ */ ++ if (idleprio_task(p) && freezing(p) && idleprio(p)) { ++ dequeue_task(p, rq); ++ enqueue_task(p, rq); ++ } + task_rq_unlock(rq, &flags); + + return success; +@@ -1577,7 +1730,6 @@ int fastcall wake_up_state(struct task_s + return try_to_wake_up(p, state, 0); + } + +-static void task_running_tick(struct rq *rq, struct task_struct *p); + /* + * Perform scheduler related setup for a newly forked process p. + * p is forked by current. 
+@@ -1605,7 +1757,6 @@ void fastcall sched_fork(struct task_str + p->prio = current->normal_prio; + + INIT_LIST_HEAD(&p->run_list); +- p->array = NULL; + #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) + if (unlikely(sched_info_on())) + memset(&p->sched_info, 0, sizeof(p->sched_info)); +@@ -1617,30 +1768,31 @@ void fastcall sched_fork(struct task_str + /* Want to start with kernel preemption disabled. */ + task_thread_info(p)->preempt_count = 1; + #endif ++ if (unlikely(p->policy == SCHED_FIFO)) ++ goto out; + /* + * Share the timeslice between parent and child, thus the + * total amount of pending timeslices in the system doesn't change, + * resulting in more scheduling fairness. + */ + local_irq_disable(); +- p->time_slice = (current->time_slice + 1) >> 1; +- /* +- * The remainder of the first timeslice might be recovered by +- * the parent if the child exits early enough. +- */ +- p->first_time_slice = 1; +- current->time_slice >>= 1; +- p->timestamp = sched_clock(); +- if (unlikely(!current->time_slice)) { ++ if (current->time_slice > 0) { ++ current->time_slice /= 2; ++ if (current->time_slice) ++ p->time_slice = current->time_slice; ++ else ++ p->time_slice = 1; + /* +- * This case is rare, it happens when the parent has only +- * a single jiffy left from its timeslice. Taking the +- * runqueue lock is not a problem. ++ * The remainder of the first timeslice might be recovered by ++ * the parent if the child exits early enough. + */ +- current->time_slice = 1; +- task_running_tick(cpu_rq(cpu), current); +- } ++ p->first_time_slice = 1; ++ } else ++ p->time_slice = 0; ++ ++ p->timestamp = sched_clock(); + local_irq_enable(); ++out: + put_cpu(); + } + +@@ -1662,38 +1814,16 @@ void fastcall wake_up_new_task(struct ta + this_cpu = smp_processor_id(); + cpu = task_cpu(p); + +- /* +- * We decrease the sleep average of forking parents +- * and children as well, to keep max-interactive tasks +- * from forking tasks that are max-interactive. The parent +- * (current) is done further down, under its lock. +- */ +- p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * +- CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); +- +- p->prio = effective_prio(p); +- + if (likely(cpu == this_cpu)) { ++ activate_task(p, rq, 1); + if (!(clone_flags & CLONE_VM)) { + /* + * The VM isn't cloned, so we're in a good position to + * do child-runs-first in anticipation of an exec. This + * usually avoids a lot of COW overhead. 
+ */ +- if (unlikely(!current->array)) +- __activate_task(p, rq); +- else { +- p->prio = current->prio; +- p->normal_prio = current->normal_prio; +- list_add_tail(&p->run_list, ¤t->run_list); +- p->array = current->array; +- p->array->nr_active++; +- inc_nr_running(p, rq); +- } + set_need_resched(); +- } else +- /* Run child last */ +- __activate_task(p, rq); ++ } + /* + * We skip the following code due to cpu == this_cpu + * +@@ -1710,19 +1840,16 @@ void fastcall wake_up_new_task(struct ta + */ + p->timestamp = (p->timestamp - this_rq->most_recent_timestamp) + + rq->most_recent_timestamp; +- __activate_task(p, rq); +- if (TASK_PREEMPTS_CURR(p, rq)) +- resched_task(rq->curr); ++ activate_task(p, rq, 0); ++ try_preempt(p, rq); + + /* + * Parent and child are on different CPUs, now get the +- * parent runqueue to update the parent's ->sleep_avg: ++ * parent runqueue to update the parent's ->flags: + */ + task_rq_unlock(rq, &flags); + this_rq = task_rq_lock(current, &flags); + } +- current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * +- PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); + task_rq_unlock(this_rq, &flags); + } + +@@ -1737,23 +1864,17 @@ void fastcall wake_up_new_task(struct ta + */ + void fastcall sched_exit(struct task_struct *p) + { ++ struct task_struct *parent; + unsigned long flags; + struct rq *rq; + +- /* +- * If the child was a (relative-) CPU hog then decrease +- * the sleep_avg of the parent as well. +- */ +- rq = task_rq_lock(p->parent, &flags); +- if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) { +- p->parent->time_slice += p->time_slice; +- if (unlikely(p->parent->time_slice > task_timeslice(p))) +- p->parent->time_slice = task_timeslice(p); +- } +- if (p->sleep_avg < p->parent->sleep_avg) +- p->parent->sleep_avg = p->parent->sleep_avg / +- (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg / +- (EXIT_WEIGHT + 1); ++ parent = p->parent; ++ rq = task_rq_lock(parent, &flags); ++ if (p->first_time_slice > 0 && task_cpu(p) == task_cpu(parent)) { ++ parent->time_slice += p->time_slice; ++ if (unlikely(parent->time_slice > parent->quota)) ++ parent->time_slice = parent->quota; ++ } + task_rq_unlock(rq, &flags); + } + +@@ -2085,23 +2206,17 @@ void sched_exec(void) + * pull_task - move a task from a remote runqueue to the local runqueue. + * Both runqueues must be locked. + */ +-static void pull_task(struct rq *src_rq, struct prio_array *src_array, +- struct task_struct *p, struct rq *this_rq, +- struct prio_array *this_array, int this_cpu) ++static void pull_task(struct rq *src_rq, struct task_struct *p, ++ struct rq *this_rq, int this_cpu) + { +- dequeue_task(p, src_array); ++ dequeue_task(p, src_rq); + dec_nr_running(p, src_rq); + set_task_cpu(p, this_cpu); + inc_nr_running(p, this_rq); +- enqueue_task(p, this_array); ++ enqueue_task(p, this_rq); + p->timestamp = (p->timestamp - src_rq->most_recent_timestamp) + + this_rq->most_recent_timestamp; +- /* +- * Note that idle threads have a prio of MAX_PRIO, for this test +- * to be always true for them. 
+- */ +- if (TASK_PREEMPTS_CURR(p, this_rq)) +- resched_task(this_rq->curr); ++ try_preempt(p, this_rq); + } + + /* +@@ -2144,7 +2259,16 @@ int can_migrate_task(struct task_struct + return 1; + } + +-#define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio) ++static inline int rq_best_prio(struct rq *rq) ++{ ++ int best_prio, exp_prio; ++ ++ best_prio = sched_find_first_bit(rq->dyn_bitmap); ++ exp_prio = find_next_bit(rq->exp_bitmap, MAX_PRIO, MAX_RT_PRIO); ++ if (unlikely(best_prio > exp_prio)) ++ best_prio = exp_prio; ++ return best_prio; ++} + + /* + * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted +@@ -2160,7 +2284,7 @@ static int move_tasks(struct rq *this_rq + { + int idx, pulled = 0, pinned = 0, this_best_prio, best_prio, + best_prio_seen, skip_for_load; +- struct prio_array *array, *dst_array; ++ struct prio_array *array; + struct list_head *head, *curr; + struct task_struct *tmp; + long rem_load_move; +@@ -2187,31 +2311,29 @@ static int move_tasks(struct rq *this_rq + * be cache-cold, thus switching CPUs has the least effect + * on them. + */ +- if (busiest->expired->nr_active) { +- array = busiest->expired; +- dst_array = this_rq->expired; +- } else { +- array = busiest->active; +- dst_array = this_rq->active; +- } +- ++ array = busiest->expired; + new_array: +- /* Start searching at priority 0: */ +- idx = 0; ++ /* Expired arrays don't have RT tasks so they're always MAX_RT_PRIO+ */ ++ if (array == busiest->expired) ++ idx = MAX_RT_PRIO; ++ else ++ idx = 0; + skip_bitmap: + if (!idx) +- idx = sched_find_first_bit(array->bitmap); ++ idx = sched_find_first_bit(array->prio_bitmap); + else +- idx = find_next_bit(array->bitmap, MAX_PRIO, idx); +- if (idx >= MAX_PRIO) { +- if (array == busiest->expired && busiest->active->nr_active) { ++ idx = find_next_bit(array->prio_bitmap, MAX_PRIO, idx); ++ if (idx == MAX_PRIO) { ++ if (array == busiest->idleprio && busiest->nr_idleprio) ++ goto found_idleprio; ++ if (array == busiest->expired) { + array = busiest->active; +- dst_array = this_rq->active; + goto new_array; + } + goto out; + } + ++found_idleprio: + head = array->queue + idx; + curr = head->prev; + skip_queue: +@@ -2233,11 +2355,22 @@ skip_queue: + best_prio_seen |= idx == best_prio; + if (curr != head) + goto skip_queue; ++ if (idx == MAX_PRIO) { ++ /* ++ * Occurs either when balancing idleprio tasks or ++ * there really are no more tasks to find. ++ */ ++ if (array == busiest->expired) { ++ array = busiest->active; ++ goto new_array; ++ } ++ goto out; ++ } + idx++; + goto skip_bitmap; + } + +- pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); ++ pull_task(busiest, tmp, this_rq, this_cpu); + pulled++; + rem_load_move -= tmp->load_weight; + +@@ -2250,6 +2383,13 @@ skip_queue: + this_best_prio = idx; + if (curr != head) + goto skip_queue; ++ if (idx == MAX_PRIO) { ++ if (array == busiest->expired) { ++ array = busiest->active; ++ goto new_array; ++ } ++ goto out; ++ } + idx++; + goto skip_bitmap; + } +@@ -3013,11 +3153,36 @@ EXPORT_PER_CPU_SYMBOL(kstat); + /* + * This is called on clock ticks and on context switches. + * Bank in p->sched_time the ns elapsed since the last tick or switch. ++ * CPU scheduler quota accounting is also performed here in microseconds. ++ * The value returned from sched_clock() occasionally gives bogus values so ++ * some sanity checking is required. 
+ */ +-static inline void +-update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now) ++static void ++update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now, ++ int tick) + { +- p->sched_time += now - p->last_ran; ++ long time_diff = now - p->last_ran; ++ ++ if (tick) { ++ /* ++ * Called from scheduler_tick() there should be less than two ++ * jiffies worth, and not negative/overflow. ++ */ ++ if (time_diff > JIFFIES_TO_NS(2) || time_diff < 0) ++ time_diff = JIFFIES_TO_NS(1); ++ } else { ++ /* ++ * Called from context_switch there should be less than one ++ * jiffy worth, and not negative/overflow. There should be ++ * some time banked here so use a nominal 1us. ++ */ ++ if (time_diff > JIFFIES_TO_NS(1) || time_diff < 1) ++ time_diff = 1000; ++ } ++ /* time_slice accounting is done in usecs to avoid overflow on 32bit */ ++ if (p != rq->idle && p->policy != SCHED_FIFO) ++ p->time_slice -= time_diff / 1000; ++ p->sched_time += time_diff; + p->last_ran = rq->most_recent_timestamp = now; + } + +@@ -3038,27 +3203,6 @@ unsigned long long current_sched_time(co + } + + /* +- * We place interactive tasks back into the active array, if possible. +- * +- * To guarantee that this does not starve expired tasks we ignore the +- * interactivity of a task if the first expired task had to wait more +- * than a 'reasonable' amount of time. This deadline timeout is +- * load-dependent, as the frequency of array switched decreases with +- * increasing number of running tasks. We also ignore the interactivity +- * if a better static_prio task has expired: +- */ +-static inline int expired_starving(struct rq *rq) +-{ +- if (rq->curr->static_prio > rq->best_expired_prio) +- return 1; +- if (!STARVATION_LIMIT || !rq->expired_timestamp) +- return 0; +- if (jiffies - rq->expired_timestamp > STARVATION_LIMIT * rq->nr_running) +- return 1; +- return 0; +-} +- +-/* + * Account user cpu time to a process. + * @p: the process that the cpu time gets accounted to + * @hardirq_offset: the offset to subtract from hardirq_count() +@@ -3073,7 +3217,7 @@ void account_user_time(struct task_struc + + /* Add user time to cpustat. */ + tmp = cputime_to_cputime64(cputime); +- if (TASK_NICE(p) > 0) ++ if (TASK_NICE(p) > 0 || idleprio_task(p)) + cpustat->nice = cputime64_add(cpustat->nice, tmp); + else + cpustat->user = cputime64_add(cpustat->user, tmp); +@@ -3131,87 +3275,94 @@ void account_steal_time(struct task_stru + cpustat->steal = cputime64_add(cpustat->steal, tmp); + } + +-static void task_running_tick(struct rq *rq, struct task_struct *p) ++/* ++ * The task has used up its quota of running in this prio_level so it must be ++ * dropped a priority level, all managed by recalc_task_prio(). ++ */ ++static void task_expired_entitlement(struct rq *rq, struct task_struct *p) + { +- if (p->array != rq->active) { +- /* Task has expired but was not scheduled yet */ +- set_tsk_need_resched(p); ++ int overrun; ++ ++ reset_first_time_slice(p); ++ if (rt_task(p)) { ++ p->time_slice += p->quota; ++ list_move_tail(&p->run_list, p->array->queue + p->prio); + return; + } +- spin_lock(&rq->lock); ++ overrun = p->time_slice; ++ dequeue_task(p, rq); ++ enqueue_task(p, rq); + /* +- * The task was running during this tick - update the +- * time slice counter. Note: we do not update a thread's +- * priority until it either goes to sleep or uses up its +- * timeslice. This makes it possible for interactive tasks +- * to use up their timeslices at their highest priority levels. 
++ * Subtract any extra time this task ran over its time_slice; ie ++ * overrun will either be 0 or negative. + */ +- if (rt_task(p)) { +- /* +- * RR tasks need a special form of timeslice management. +- * FIFO tasks have no timeslices. +- */ +- if ((p->policy == SCHED_RR) && !--p->time_slice) { +- p->time_slice = task_timeslice(p); +- p->first_time_slice = 0; +- set_tsk_need_resched(p); ++ p->time_slice += overrun; ++} + +- /* put it at the end of the queue: */ +- requeue_task(p, rq->active); +- } +- goto out_unlock; ++/* ++ * Test if SCHED_ISO tasks have run longer than their alloted period as RT ++ * tasks and set the refractory flag if necessary. There is 10% hysteresis ++ * for unsetting the flag. ++ */ ++static unsigned int test_ret_isorefractory(struct rq *rq) ++{ ++ if (likely(!rq->iso_refractory)) { ++ if (rq->iso_ticks / ISO_PERIOD > sched_iso_cpu) ++ rq->iso_refractory = 1; ++ } else { ++ if (rq->iso_ticks / ISO_PERIOD < (sched_iso_cpu * 90 / 100)) ++ rq->iso_refractory = 0; + } +- if (!--p->time_slice) { +- dequeue_task(p, rq->active); +- set_tsk_need_resched(p); +- p->prio = effective_prio(p); +- p->time_slice = task_timeslice(p); +- p->first_time_slice = 0; ++ return rq->iso_refractory; ++} + +- if (!rq->expired_timestamp) +- rq->expired_timestamp = jiffies; +- if (!TASK_INTERACTIVE(p) || expired_starving(rq)) { +- enqueue_task(p, rq->expired); +- if (p->static_prio < rq->best_expired_prio) +- rq->best_expired_prio = p->static_prio; +- } else +- enqueue_task(p, rq->active); +- } else { +- /* +- * Prevent a too long timeslice allowing a task to monopolize +- * the CPU. We do this by splitting up the timeslice into +- * smaller pieces. +- * +- * Note: this does not mean the task's timeslices expire or +- * get lost in any way, they just might be preempted by +- * another task of equal priority. (one with higher +- * priority would have preempted this task already.) We +- * requeue this task to the end of the list on this priority +- * level, which is in essence a round-robin of tasks with +- * equal priority. +- * +- * This only applies to tasks in the interactive +- * delta range with at least TIMESLICE_GRANULARITY to requeue. +- */ +- if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - +- p->time_slice) % TIMESLICE_GRANULARITY(p)) && +- (p->time_slice >= TIMESLICE_GRANULARITY(p)) && +- (p->array == rq->active)) { ++/* No SCHED_ISO task was running so decrease rq->iso_ticks */ ++static inline void no_iso_tick(struct rq *rq) ++{ ++ rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - 1) / ISO_PERIOD; ++} + +- requeue_task(p, rq->active); +- set_tsk_need_resched(p); +- } ++/* This manages tasks that have run out of timeslice during a scheduler_tick */ ++static void task_running_tick(struct rq *rq, struct task_struct *p) ++{ ++ /* ++ * If a SCHED_ISO task is running we increment the iso_ticks. In ++ * order to prevent SCHED_ISO tasks from causing starvation in the ++ * presence of true RT tasks we account those as iso_ticks as well. ++ */ ++ if ((rt_task(p) || (iso_task(p) && !rq->iso_refractory))) { ++ if (rq->iso_ticks <= (ISO_PERIOD * 100) - 100) ++ rq->iso_ticks += 100; ++ } else ++ no_iso_tick(rq); ++ ++ if (iso_task(p)) { ++ if (unlikely(test_ret_isorefractory(rq))) { ++ if (isoprio_suitable(p)) { ++ /* ++ * SCHED_ISO task is running as RT and limit ++ * has been hit. 
Set the PF_ISOREF flag and ++ * force it to reschedule as SCHED_NORMAL ++ * by zeroing its time_slice ++ */ ++ p->flags |= PF_ISOREF; ++ p->time_slice = 0; ++ } ++ } else ++ p->flags &= ~PF_ISOREF; + } +-out_unlock: +- spin_unlock(&rq->lock); ++ /* SCHED_FIFO tasks never run out of timeslice. */ ++ if (p->time_slice > 0 || p->policy == SCHED_FIFO) ++ return; ++ /* p->time_slice <= 0 */ ++ set_tsk_need_resched(p); ++ if (likely(task_queued(p))) ++ task_expired_entitlement(rq, p); + } + + /* + * This function gets called by the timer code, with HZ frequency. + * We call it with interrupts disabled. +- * +- * It also gets called by the fork code, when changing the parent's +- * timeslices. + */ + void scheduler_tick(void) + { +@@ -3220,10 +3371,14 @@ void scheduler_tick(void) + int cpu = smp_processor_id(); + struct rq *rq = cpu_rq(cpu); + +- update_cpu_clock(p, rq, now); ++ update_cpu_clock(p, rq, now, 1); + ++ spin_lock(&rq->lock); + if (p != rq->idle) + task_running_tick(rq, p); ++ else ++ no_iso_tick(rq); ++ spin_unlock(&rq->lock); + #ifdef CONFIG_SMP + update_load(rq); + if (time_after_eq(jiffies, rq->next_balance)) +@@ -3269,10 +3424,80 @@ EXPORT_SYMBOL(sub_preempt_count); + + #endif + +-static inline int interactive_sleep(enum sleep_type sleep_type) ++static void reset_prio_levels(struct rq *rq) ++{ ++ rq->active->best_static_prio = MAX_PRIO - 1; ++ rq->expired->best_static_prio = MAX_PRIO - 1; ++ memset(rq->prio_level, 0, sizeof(int) * PRIO_RANGE); ++} ++ ++/* ++ * Only tasks running are SCHED_IDLEPRIO. Set the active array to the ++ * idleprio array and if it isn't already active ++ */ ++static struct task_struct *next_idleprio_task(struct rq *rq) + { +- return (sleep_type == SLEEP_INTERACTIVE || +- sleep_type == SLEEP_INTERRUPTED); ++ struct prio_array *array = rq->active; ++ struct list_head *queue; ++ ++ if (array != rq->idleprio) { ++ rq->active = rq->idleprio; ++ rq->expired = array; ++ array = rq->active; ++ rq->exp_bitmap = rq->expired->prio_bitmap; ++ rq->dyn_bitmap = rq->active->prio_bitmap; ++ } ++ rq->prio_rotation++; ++ reset_prio_levels(rq); ++ queue = array->queue + MAX_PRIO; ++ return list_entry(queue->next, struct task_struct, run_list); ++} ++ ++/* ++ * next_dynamic_task finds the next suitable dynamic task. ++ */ ++static inline struct task_struct *next_dynamic_task(struct rq *rq, int idx) ++{ ++ struct prio_array *array = rq->active; ++ struct task_struct *next; ++ struct list_head *queue; ++ int nstatic; ++ ++retry: ++ if (unlikely(rq->nr_running == rq->nr_idleprio)) ++ return next_idleprio_task(rq); ++ if (idx >= MAX_PRIO) { ++ /* There are no more tasks in the active array. 
Swap arrays */ ++ array = rq->expired; ++ rq->expired = rq->active; ++ rq->active = array; ++ rq->exp_bitmap = rq->expired->prio_bitmap; ++ rq->dyn_bitmap = rq->active->prio_bitmap; ++ rq->prio_rotation++; ++ idx = find_next_bit(rq->dyn_bitmap, MAX_PRIO, MAX_RT_PRIO); ++ reset_prio_levels(rq); ++ } ++ queue = array->queue + idx; ++ next = list_entry(queue->next, struct task_struct, run_list); ++ if (unlikely(next->time_slice <= 0 && !(iso_task(next) && ++ isoprio_suitable(next)))) { ++ /* ++ * Unlucky enough that this task ran out of time_slice ++ * before it hit a scheduler_tick so it should have its ++ * priority reassessed and choose another task (possibly ++ * the same one) ++ */ ++ task_expired_entitlement(rq, next); ++ idx = find_next_bit(rq->dyn_bitmap, MAX_PRIO, MAX_RT_PRIO); ++ goto retry; ++ } ++ next->rotation = rq->prio_rotation; ++ nstatic = next->static_prio; ++ if (nstatic < array->best_static_prio) ++ array->best_static_prio = nstatic; ++ if (idx > rq->prio_level[USER_PRIO(nstatic)]) ++ rq->prio_level[USER_PRIO(nstatic)] = idx; ++ return next; + } + + /* +@@ -3281,13 +3506,11 @@ static inline int interactive_sleep(enum + asmlinkage void __sched schedule(void) + { + struct task_struct *prev, *next; +- struct prio_array *array; + struct list_head *queue; + unsigned long long now; +- unsigned long run_time; +- int cpu, idx, new_prio; + long *switch_count; + struct rq *rq; ++ int cpu, idx; + + /* + * Test if we are atomic. Since do_exit() needs to call into +@@ -3323,18 +3546,6 @@ need_resched_nonpreemptible: + + schedstat_inc(rq, sched_cnt); + now = sched_clock(); +- if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) { +- run_time = now - prev->timestamp; +- if (unlikely((long long)(now - prev->timestamp) < 0)) +- run_time = 0; +- } else +- run_time = NS_MAX_SLEEP_AVG; +- +- /* +- * Tasks charged proportionately less run_time at high sleep_avg to +- * delay them losing their interactive status +- */ +- run_time /= (CURRENT_BONUS(prev) ? : 1); + + spin_lock_irq(&rq->lock); + +@@ -3345,8 +3556,10 @@ need_resched_nonpreemptible: + unlikely(signal_pending(prev)))) + prev->state = TASK_RUNNING; + else { +- if (prev->state == TASK_UNINTERRUPTIBLE) ++ if (prev->state == TASK_UNINTERRUPTIBLE) { ++ prev->flags |= PF_NONSLEEP; + rq->nr_uninterruptible++; ++ } + deactivate_task(prev, rq); + } + } +@@ -3356,59 +3569,29 @@ need_resched_nonpreemptible: + idle_balance(cpu, rq); + if (!rq->nr_running) { + next = rq->idle; +- rq->expired_timestamp = 0; + goto switch_tasks; + } + } + +- array = rq->active; +- if (unlikely(!array->nr_active)) { +- /* +- * Switch the active and expired arrays. 
+- */ +- schedstat_inc(rq, sched_switch); +- rq->active = rq->expired; +- rq->expired = array; +- array = rq->active; +- rq->expired_timestamp = 0; +- rq->best_expired_prio = MAX_PRIO; ++ idx = sched_find_first_bit(rq->dyn_bitmap); ++ if (likely(idx > ISO_PRIO)) ++ next = next_dynamic_task(rq, idx); ++ else { ++ queue = rq->active->queue + idx; ++ next = list_entry(queue->next, struct task_struct, run_list); + } +- +- idx = sched_find_first_bit(array->bitmap); +- queue = array->queue + idx; +- next = list_entry(queue->next, struct task_struct, run_list); +- +- if (!rt_task(next) && interactive_sleep(next->sleep_type)) { +- unsigned long long delta = now - next->timestamp; +- if (unlikely((long long)(now - next->timestamp) < 0)) +- delta = 0; +- +- if (next->sleep_type == SLEEP_INTERACTIVE) +- delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; +- +- array = next->array; +- new_prio = recalc_task_prio(next, next->timestamp + delta); +- +- if (unlikely(next->prio != new_prio)) { +- dequeue_task(next, array); +- next->prio = new_prio; +- enqueue_task(next, array); +- } +- } +- next->sleep_type = SLEEP_NORMAL; + switch_tasks: +- if (next == rq->idle) ++ if (next == rq->idle) { ++ reset_prio_levels(rq); ++ rq->prio_rotation++; + schedstat_inc(rq, sched_goidle); ++ } + prefetch(next); + prefetch_stack(next); + clear_tsk_need_resched(prev); + rcu_qsctr_inc(task_cpu(prev)); + +- update_cpu_clock(prev, rq, now); +- +- prev->sleep_avg -= run_time; +- if ((long)prev->sleep_avg <= 0) +- prev->sleep_avg = 0; ++ update_cpu_clock(prev, rq, now, 0); + prev->timestamp = prev->last_ran = now; + + sched_info_switch(prev, next); +@@ -3844,29 +4027,22 @@ EXPORT_SYMBOL(sleep_on_timeout); + */ + void rt_mutex_setprio(struct task_struct *p, int prio) + { +- struct prio_array *array; + unsigned long flags; ++ int queued, oldprio; + struct rq *rq; +- int oldprio; + + BUG_ON(prio < 0 || prio > MAX_PRIO); + + rq = task_rq_lock(p, &flags); + + oldprio = p->prio; +- array = p->array; +- if (array) +- dequeue_task(p, array); ++ queued = task_queued(p); ++ if (queued) ++ dequeue_task(p, rq); + p->prio = prio; + +- if (array) { +- /* +- * If changing to an RT priority then queue it +- * in the active array! 
+- */ +- if (rt_task(p)) +- array = rq->active; +- enqueue_task(p, array); ++ if (queued) { ++ enqueue_task(p, rq); + /* + * Reschedule if we are currently running on this runqueue and + * our priority decreased, or if we are not currently running on +@@ -3875,8 +4051,8 @@ void rt_mutex_setprio(struct task_struct + if (task_running(rq, p)) { + if (p->prio > oldprio) + resched_task(rq->curr); +- } else if (TASK_PREEMPTS_CURR(p, rq)) +- resched_task(rq->curr); ++ } else ++ try_preempt(p, rq); + } + task_rq_unlock(rq, &flags); + } +@@ -3885,8 +4061,7 @@ void rt_mutex_setprio(struct task_struct + + void set_user_nice(struct task_struct *p, long nice) + { +- struct prio_array *array; +- int old_prio, delta; ++ int queued, old_prio,delta; + unsigned long flags; + struct rq *rq; + +@@ -3907,26 +4082,27 @@ void set_user_nice(struct task_struct *p + p->static_prio = NICE_TO_PRIO(nice); + goto out_unlock; + } +- array = p->array; +- if (array) { +- dequeue_task(p, array); ++ queued = task_queued(p); ++ if (queued) { ++ dequeue_task(p, rq); + dec_raw_weighted_load(rq, p); + } + + p->static_prio = NICE_TO_PRIO(nice); +- set_load_weight(p); + old_prio = p->prio; + p->prio = effective_prio(p); ++ set_quota(p); + delta = p->prio - old_prio; + +- if (array) { +- enqueue_task(p, array); ++ if (queued) { ++ enqueue_task(p, rq); + inc_raw_weighted_load(rq, p); + /* + * If the task increased its priority or is running and + * lowered its priority, then reschedule its CPU: + */ +- if (delta < 0 || (delta > 0 && task_running(rq, p))) ++ if (delta < 0 || ((delta > 0 || idleprio_task(p)) && ++ task_running(rq, p))) + resched_task(rq->curr); + } + out_unlock: +@@ -3996,7 +4172,7 @@ asmlinkage long sys_nice(int increment) + * + * This is the priority value as seen by users in /proc. + * RT tasks are offset by -200. Normal tasks are centered +- * around 0, value goes from -16 to +15. ++ * around 0, value goes from 0 to +39. + */ + int task_prio(const struct task_struct *p) + { +@@ -4043,19 +4219,14 @@ static inline struct task_struct *find_p + /* Actually do priority change: must hold rq lock. */ + static void __setscheduler(struct task_struct *p, int policy, int prio) + { +- BUG_ON(p->array); ++ BUG_ON(task_queued(p)); + + p->policy = policy; + p->rt_priority = prio; + p->normal_prio = normal_prio(p); + /* we are holding p->pi_lock already */ + p->prio = rt_mutex_getprio(p); +- /* +- * SCHED_BATCH tasks are treated as perpetual CPU hogs: +- */ +- if (policy == SCHED_BATCH) +- p->sleep_avg = 0; +- set_load_weight(p); ++ set_quota(p); + } + + /** +@@ -4069,19 +4240,27 @@ static void __setscheduler(struct task_s + int sched_setscheduler(struct task_struct *p, int policy, + struct sched_param *param) + { +- int retval, oldprio, oldpolicy = -1; +- struct prio_array *array; ++ struct sched_param zero_param = { .sched_priority = 0 }; ++ int queued, retval, oldprio, oldpolicy = -1; + unsigned long flags; + struct rq *rq; + + /* may grab non-irq protected spin_locks */ + BUG_ON(in_interrupt()); ++ if (is_rt_policy(policy) && !capable(CAP_SYS_NICE)) { ++ /* ++ * If the caller requested an RT policy without having the ++ * necessary rights, we downgrade the policy to SCHED_ISO. ++ * We also set the parameter to zero to pass the checks. 
++ */ ++ policy = SCHED_ISO; ++ param = &zero_param; ++ } + recheck: + /* double check policy once rq lock held */ + if (policy < 0) + policy = oldpolicy = p->policy; +- else if (policy != SCHED_FIFO && policy != SCHED_RR && +- policy != SCHED_NORMAL && policy != SCHED_BATCH) ++ else if (!SCHED_RANGE(policy)) + return -EINVAL; + /* + * Valid priorities for SCHED_FIFO and SCHED_RR are +@@ -4116,6 +4295,31 @@ recheck: + if (param->sched_priority > p->rt_priority && + param->sched_priority > rlim_rtprio) + return -EPERM; ++ } else { ++ switch (p->policy) { ++ /* ++ * Can only downgrade policies but not back to ++ * SCHED_NORMAL ++ */ ++ case SCHED_ISO: ++ if (policy == SCHED_ISO) ++ goto out; ++ if (policy == SCHED_NORMAL) ++ return -EPERM; ++ break; ++ case SCHED_BATCH: ++ if (policy == SCHED_BATCH) ++ goto out; ++ if (policy != SCHED_IDLEPRIO) ++ return -EPERM; ++ break; ++ case SCHED_IDLEPRIO: ++ if (policy == SCHED_IDLEPRIO) ++ goto out; ++ return -EPERM; ++ default: ++ break; ++ } + } + + /* can't change other user's priorities */ +@@ -4124,6 +4328,11 @@ recheck: + return -EPERM; + } + ++ if (!(p->mm) && policy == SCHED_IDLEPRIO) { ++ /* Don't allow kernel threads to be SCHED_IDLEPRIO. */ ++ return -EINVAL; ++ } ++ + retval = security_task_setscheduler(p, policy, param); + if (retval) + return retval; +@@ -4144,12 +4353,12 @@ recheck: + spin_unlock_irqrestore(&p->pi_lock, flags); + goto recheck; + } +- array = p->array; +- if (array) ++ queued = task_queued(p); ++ if (queued) + deactivate_task(p, rq); + oldprio = p->prio; + __setscheduler(p, policy, param->sched_priority); +- if (array) { ++ if (queued) { + __activate_task(p, rq); + /* + * Reschedule if we are currently running on this runqueue and +@@ -4159,14 +4368,15 @@ recheck: + if (task_running(rq, p)) { + if (p->prio > oldprio) + resched_task(rq->curr); +- } else if (TASK_PREEMPTS_CURR(p, rq)) +- resched_task(rq->curr); ++ } else ++ try_preempt(p, rq); + } + __task_rq_unlock(rq); + spin_unlock_irqrestore(&p->pi_lock, flags); + + rt_mutex_adjust_pi(p); + ++out: + return 0; + } + EXPORT_SYMBOL_GPL(sched_setscheduler); +@@ -4433,41 +4643,34 @@ asmlinkage long sys_sched_getaffinity(pi + * sys_sched_yield - yield the current processor to other threads. + * + * This function yields the current CPU by moving the calling thread +- * to the expired array. If there are no other threads running on this +- * CPU then this function will return. ++ * to the expired array if SCHED_NORMAL or the end of its current priority ++ * queue if a realtime task. If there are no other threads running on this ++ * cpu this function will return. + */ + asmlinkage long sys_sched_yield(void) + { + struct rq *rq = this_rq_lock(); +- struct prio_array *array = current->array, *target = rq->expired; ++ struct task_struct *p = current; + + schedstat_inc(rq, yld_cnt); +- /* +- * We implement yielding by moving the task into the expired +- * queue. +- * +- * (special rule: RT tasks will just roundrobin in the active +- * array.) +- */ +- if (rt_task(current)) +- target = rq->active; +- +- if (array->nr_active == 1) { +- schedstat_inc(rq, yld_act_empty); +- if (!rq->expired->nr_active) +- schedstat_inc(rq, yld_both_empty); +- } else if (!rq->expired->nr_active) +- schedstat_inc(rq, yld_exp_empty); +- +- if (array != target) { +- dequeue_task(current, array); +- enqueue_task(current, target); +- } else +- /* +- * requeue_task is cheaper so perform that if possible. 
+- */ +- requeue_task(current, array); ++ if (rq->nr_running == 1) ++ schedstat_inc(rq, yld_both_empty); ++ else { ++ struct prio_array *old_array = p->array; ++ int old_prio = p->prio; ++ ++ if (idleprio_task(p)) { ++ dequeue_task(p, rq); ++ enqueue_task(p, rq); ++ goto out_release; ++ } ++ /* p->prio will be updated in requeue_task via queue_expired */ ++ if (!rt_task(p)) ++ p->array = rq->expired; ++ requeue_task(p, rq, old_array, old_prio); ++ } + ++out_release: + /* + * Since we are going to call schedule() anyway, there's + * no need to preempt or enable interrupts: +@@ -4619,6 +4822,8 @@ asmlinkage long sys_sched_get_priority_m + break; + case SCHED_NORMAL: + case SCHED_BATCH: ++ case SCHED_ISO: ++ case SCHED_IDLEPRIO: + ret = 0; + break; + } +@@ -4643,6 +4848,8 @@ asmlinkage long sys_sched_get_priority_m + break; + case SCHED_NORMAL: + case SCHED_BATCH: ++ case SCHED_ISO: ++ case SCHED_IDLEPRIO: + ret = 0; + } + return ret; +@@ -4676,8 +4883,8 @@ long sys_sched_rr_get_interval(pid_t pid + if (retval) + goto out_unlock; + +- jiffies_to_timespec(p->policy == SCHED_FIFO ? +- 0 : task_timeslice(p), &t); ++ t = ns_to_timespec(p->policy == SCHED_FIFO ? 0 : ++ MS_TO_NS(task_timeslice(p))); + read_unlock(&tasklist_lock); + retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; + out_nounlock: +@@ -4771,10 +4978,10 @@ void __cpuinit init_idle(struct task_str + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + +- idle->timestamp = sched_clock(); +- idle->sleep_avg = 0; +- idle->array = NULL; +- idle->prio = idle->normal_prio = MAX_PRIO; ++ bitmap_zero(idle->bitmap, PRIO_RANGE); ++ idle->timestamp = idle->last_ran = sched_clock(); ++ idle->array = rq->active; ++ idle->prio = idle->normal_prio = NICE_TO_PRIO(0); + idle->state = TASK_RUNNING; + idle->cpus_allowed = cpumask_of_cpu(cpu); + set_task_cpu(idle, cpu); +@@ -4893,7 +5100,7 @@ static int __migrate_task(struct task_st + goto out; + + set_task_cpu(p, dest_cpu); +- if (p->array) { ++ if (task_queued(p)) { + /* + * Sync timestamp with rq_dest's before activating. + * The same thing could be achieved by doing this step +@@ -4904,8 +5111,7 @@ static int __migrate_task(struct task_st + + rq_dest->most_recent_timestamp; + deactivate_task(p, rq_src); + __activate_task(p, rq_dest); +- if (TASK_PREEMPTS_CURR(p, rq_dest)) +- resched_task(rq_dest->curr); ++ try_preempt(p, rq_dest); + } + ret = 1; + out: +@@ -5194,7 +5400,7 @@ migration_call(struct notifier_block *nf + /* Idle task back to normal (off runqueue, low prio) */ + rq = task_rq_lock(rq->idle, &flags); + deactivate_task(rq->idle, rq); +- rq->idle->static_prio = MAX_PRIO; ++ rq->idle->static_prio = NICE_TO_PRIO(0); + __setscheduler(rq->idle, SCHED_NORMAL, 0); + migrate_dead_tasks(cpu); + task_rq_unlock(rq, &flags); +@@ -6706,6 +6912,13 @@ void __init sched_init_smp(void) + /* Move init over to a non-isolated CPU */ + if (set_cpus_allowed(current, non_isolated_cpus) < 0) + BUG(); ++ ++ /* ++ * Assume that every added cpu gives us slightly less overall latency ++ * allowing us to increase the base rr_interval, but in a non linear ++ * fashion. 
++ */ ++ rr_interval *= 1 + ilog2(num_online_cpus()); + } + #else + void __init sched_init_smp(void) +@@ -6727,6 +6940,16 @@ void __init sched_init(void) + { + int i, j, k; + ++ /* Generate the priority matrix */ ++ for (i = 0; i < PRIO_RANGE; i++) { ++ bitmap_fill(prio_matrix[i], PRIO_RANGE); ++ j = PRIO_RANGE * PRIO_RANGE / (PRIO_RANGE - i); ++ for (k = 0; k <= PRIO_RANGE * (PRIO_RANGE - 1); k += j) { ++ __clear_bit(PRIO_RANGE - 1 - (k / PRIO_RANGE), ++ prio_matrix[i]); ++ } ++ } ++ + for_each_possible_cpu(i) { + struct prio_array *array; + struct rq *rq; +@@ -6734,12 +6957,20 @@ void __init sched_init(void) + rq = cpu_rq(i); + spin_lock_init(&rq->lock); + lockdep_set_class(&rq->lock, &rq->rq_lock_key); ++ rq->iso_ticks = 0; + rq->nr_running = 0; ++ rq->nr_idleprio = 0; ++ rq->prio_rotation = 0; + rq->active = rq->arrays; ++ rq->idleprio = rq->active; + rq->expired = rq->arrays + 1; +- rq->best_expired_prio = MAX_PRIO; ++ reset_prio_levels(rq); ++ rq->dyn_bitmap = rq->active->prio_bitmap; ++ rq->exp_bitmap = rq->expired->prio_bitmap; + + #ifdef CONFIG_SMP ++ rq->active->rq = rq; ++ rq->expired->rq = rq; + rq->sd = NULL; + for (j = 1; j < 3; j++) + rq->cpu_load[j] = 0; +@@ -6752,16 +6983,16 @@ void __init sched_init(void) + atomic_set(&rq->nr_iowait, 0); + + for (j = 0; j < 2; j++) { ++ + array = rq->arrays + j; +- for (k = 0; k < MAX_PRIO; k++) { ++ for (k = 0; k <= MAX_PRIO; k++) + INIT_LIST_HEAD(array->queue + k); +- __clear_bit(k, array->bitmap); +- } +- // delimiter for bitsearch +- __set_bit(MAX_PRIO, array->bitmap); ++ bitmap_zero(array->prio_bitmap, MAX_PRIO); ++ /* delimiter for bitsearch */ ++ __set_bit(MAX_PRIO, array->prio_bitmap); + } +- } + ++ } + set_load_weight(&init_task); + + #ifdef CONFIG_SMP +@@ -6815,24 +7046,24 @@ EXPORT_SYMBOL(__might_sleep); + #ifdef CONFIG_MAGIC_SYSRQ + void normalize_rt_tasks(void) + { +- struct prio_array *array; + struct task_struct *p; + unsigned long flags; + struct rq *rq; ++ int queued; + + read_lock_irq(&tasklist_lock); + for_each_process(p) { +- if (!rt_task(p)) ++ if (!rt_task(p) && !iso_task(p)) + continue; + + spin_lock_irqsave(&p->pi_lock, flags); + rq = __task_rq_lock(p); + +- array = p->array; +- if (array) ++ queued = task_queued(p); ++ if (queued) + deactivate_task(p, task_rq(p)); + __setscheduler(p, SCHED_NORMAL, 0); +- if (array) { ++ if (queued) { + __activate_task(p, task_rq(p)); + resched_task(rq->curr); + } +Index: linux-2.6.21-ck1/Documentation/sysctl/kernel.txt +=================================================================== +--- linux-2.6.21-ck1.orig/Documentation/sysctl/kernel.txt 2007-05-04 12:10:52.000000000 +1000 ++++ linux-2.6.21-ck1/Documentation/sysctl/kernel.txt 2007-05-04 12:10:55.000000000 +1000 +@@ -25,6 +25,9 @@ show up in /proc/sys/kernel: + - domainname + - hostname + - hotplug ++- interactive ++- iso_cpu ++- iso_period + - java-appletviewer [ binfmt_java, obsolete ] + - java-interpreter [ binfmt_java, obsolete ] + - kstack_depth_to_print [ X86 only ] +@@ -43,6 +46,7 @@ show up in /proc/sys/kernel: + - printk + - real-root-dev ==> Documentation/initrd.txt + - reboot-cmd [ SPARC only ] ++- rr_interval + - rtsig-max + - rtsig-nr + - sem +@@ -164,6 +168,40 @@ Default value is "/sbin/hotplug". 
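The interactive, iso_cpu, iso_period and rr_interval entries documented below are plain integer sysctls; the kern_table additions later in this patch expose them under /proc/sys/kernel/. A minimal user-space sketch that just reads and prints them on a patched kernel (illustrative only; the file names come from the sysctl table, everything else here is assumed):

    #include <stdio.h>

    /* Dump the scheduler sysctls added by the -ck1 patch. */
    static void show(const char *name)
    {
        char path[128];
        FILE *f;
        int val;

        snprintf(path, sizeof(path), "/proc/sys/kernel/%s", name);
        f = fopen(path, "r");
        if (f && fscanf(f, "%d", &val) == 1)
            printf("%-12s %d\n", name, val);
        else
            printf("%-12s (not available - kernel not patched?)\n", name);
        if (f)
            fclose(f);
    }

    int main(void)
    {
        show("rr_interval");   /* ms; smallest slice any task runs for */
        show("interactive");   /* 0/1; per-process history smoothing */
        show("iso_cpu");       /* percent cpu SCHED_ISO may use as pseudo-RT */
        show("iso_period");    /* seconds over which iso_cpu is averaged */
        return 0;
    }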
+ + ============================================================== + ++interactive: ++ ++The staircase-deadline cpu scheduler can be set in either purely ++forward-looking mode for absolutely rigid fairness and cpu distribution ++according to nice level, or it can allow a small per-process history ++to smooth out cpu usage perturbations common in interactive tasks by ++enabling this sysctl. While small fairness issues can arise with this ++enabled, overall fairness is usually still strongly maintained and ++starvation is never possible. Enabling this can significantly smooth ++out 3d graphics and games. ++ ++Default value is 1 (enabled). ++ ++============================================================== ++ ++iso_cpu: ++ ++This sets the percentage cpu that the unprivileged SCHED_ISO tasks can ++run effectively at realtime priority, averaged over a rolling iso_period ++seconds. ++ ++Set to 80 (percent) by default. ++ ++============================================================== ++ ++iso_period: ++ ++This sets the number of seconds over which SCHED_ISO cpu usage is averaged ++to see if it exceeds its allocated cpu bandwidth. ++ ++Set to 5 (seconds) by default. ++ ++============================================================== ++ + l2cr: (PPC only) + + This flag controls the L2 cache of G3 processor boards. If +@@ -288,6 +326,19 @@ rebooting. ??? + + ============================================================== + ++rr_interval: ++ ++This is the smallest duration that any cpu process scheduling unit ++will run for. Increasing this value can increase throughput of cpu ++bound tasks substantially but at the expense of increased latencies ++overall. This value is in milliseconds and the default value chosen ++depends on the number of cpus available at scheduler initialisation ++with a minimum of 8. ++ ++Valid values are from 1-5000. ++ ++============================================================== ++ + rtsig-max & rtsig-nr: + + The file rtsig-max can be used to tune the maximum number +Index: linux-2.6.21-ck1/kernel/sysctl.c +=================================================================== +--- linux-2.6.21-ck1.orig/kernel/sysctl.c 2007-05-04 12:10:52.000000000 +1000 ++++ linux-2.6.21-ck1/kernel/sysctl.c 2007-05-04 12:24:21.000000000 +1000 +@@ -22,6 +22,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -70,12 +71,17 @@ extern int suid_dumpable; + extern char core_pattern[]; + extern int pid_max; + extern int min_free_kbytes; ++extern int vm_tail_largefiles; + extern int printk_ratelimit_jiffies; + extern int printk_ratelimit_burst; + extern int pid_max_min, pid_max_max; + extern int sysctl_drop_caches; + extern int percpu_pagelist_fraction; + extern int compat_log; ++extern int rr_interval; ++extern int sched_interactive; ++extern int sched_iso_cpu; ++extern int sched_iso_period; + + /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ + static int maxolduid = 65535; +@@ -159,6 +165,14 @@ int sysctl_legacy_va_layout; + #endif + + ++/* Constants for minimum and maximum testing. ++ We use these as one-element integer vectors. 
*/ ++static int __read_mostly zero; ++static int __read_mostly one = 1; ++static int __read_mostly one_hundred = 100; ++static int __read_mostly five_thousand = 5000; ++ ++ + /* The default sysctl tables: */ + + static ctl_table root_table[] = { +@@ -499,6 +513,47 @@ static ctl_table kern_table[] = { + .mode = 0444, + .proc_handler = &proc_dointvec, + }, ++ { ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = "rr_interval", ++ .data = &rr_interval, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .strategy = &sysctl_intvec, ++ .extra1 = &one, ++ .extra2 = &five_thousand, ++ }, ++ { ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = "interactive", ++ .data = &sched_interactive, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, ++ { ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = "iso_cpu", ++ .data = &sched_iso_cpu, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .strategy = &sysctl_intvec, ++ .extra1 = &zero, ++ .extra2 = &one_hundred, ++ }, ++ { ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = "iso_period", ++ .data = &sched_iso_period, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .strategy = &sysctl_intvec, ++ .extra1 = &one, ++ .extra2 = &one_hundred, ++ }, + #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) + { + .ctl_name = KERN_UNKNOWN_NMI_PANIC, +@@ -607,12 +662,6 @@ static ctl_table kern_table[] = { + { .ctl_name = 0 } + }; + +-/* Constants for minimum and maximum testing in vm_table. +- We use these as one-element integer vectors. */ +-static int zero; +-static int one_hundred = 100; +- +- + static ctl_table vm_table[] = { + { + .ctl_name = VM_OVERCOMMIT_MEMORY, +@@ -693,16 +742,32 @@ static ctl_table vm_table[] = { + .proc_handler = &proc_dointvec, + }, + { +- .ctl_name = VM_SWAPPINESS, +- .procname = "swappiness", +- .data = &vm_swappiness, +- .maxlen = sizeof(vm_swappiness), ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = "mapped", ++ .data = &vm_mapped, ++ .maxlen = sizeof(vm_mapped), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &one_hundred, + }, ++ { ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = "hardmaplimit", ++ .data = &vm_hardmaplimit, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, ++ { ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = "tail_largefiles", ++ .data = &vm_tail_largefiles, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, + #ifdef CONFIG_HUGETLB_PAGE + { + .ctl_name = VM_HUGETLB_PAGES, +@@ -859,6 +924,16 @@ static ctl_table vm_table[] = { + .extra1 = &zero, + }, + #endif ++#ifdef CONFIG_SWAP_PREFETCH ++ { ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = "swap_prefetch", ++ .data = &swap_prefetch, ++ .maxlen = sizeof(swap_prefetch), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, ++#endif + { .ctl_name = 0 } + }; + +Index: linux-2.6.21-ck1/fs/pipe.c +=================================================================== +--- linux-2.6.21-ck1.orig/fs/pipe.c 2007-05-04 12:10:52.000000000 +1000 ++++ linux-2.6.21-ck1/fs/pipe.c 2007-05-04 12:10:54.000000000 +1000 +@@ -41,12 +41,7 @@ void pipe_wait(struct pipe_inode_info *p + { + DEFINE_WAIT(wait); + +- /* +- * Pipes are system-local resources, so sleeping on them +- * is considered a noninteractive wait: +- */ +- prepare_to_wait(&pipe->wait, &wait, +- TASK_INTERRUPTIBLE | TASK_NONINTERACTIVE); ++ prepare_to_wait(&pipe->wait, 
&wait, TASK_INTERRUPTIBLE); + if (pipe->inode) + mutex_unlock(&pipe->inode->i_mutex); + schedule(); +Index: linux-2.6.21-ck1/Documentation/sched-design.txt +=================================================================== +--- linux-2.6.21-ck1.orig/Documentation/sched-design.txt 2007-05-04 12:10:52.000000000 +1000 ++++ linux-2.6.21-ck1/Documentation/sched-design.txt 2007-05-04 12:10:54.000000000 +1000 +@@ -1,11 +1,14 @@ +- Goals, Design and Implementation of the +- new ultra-scalable O(1) scheduler ++ Goals, Design and Implementation of the ultra-scalable O(1) scheduler by ++ Ingo Molnar and theStaircase Deadline cpu scheduler policy designed by ++ Con Kolivas. + + +- This is an edited version of an email Ingo Molnar sent to +- lkml on 4 Jan 2002. It describes the goals, design, and +- implementation of Ingo's new ultra-scalable O(1) scheduler. +- Last Updated: 18 April 2002. ++ This was originally an edited version of an email Ingo Molnar sent to ++ lkml on 4 Jan 2002. It describes the goals, design, and implementation ++ of Ingo's ultra-scalable O(1) scheduler. It now contains a description ++ of the Staircase Deadline priority scheduler that was built on this ++ design. ++ Last Updated: Fri, 4 May 2007 + + + Goal +@@ -163,3 +166,222 @@ certain code paths and data constructs. + code is smaller than the old one. + + Ingo ++ ++ ++Staircase Deadline cpu scheduler policy ++================================================ ++ ++Design summary ++============== ++ ++A novel design which incorporates a foreground-background descending priority ++system (the staircase) via a bandwidth allocation matrix according to nice ++level. ++ ++ ++Features ++======== ++ ++A starvation free, strict fairness O(1) scalable design with interactivity ++as good as the above restrictions can provide. There is no interactivity ++estimator, no sleep/run measurements and only simple fixed accounting. ++The design has strict enough a design and accounting that task behaviour ++can be modelled and maximum scheduling latencies can be predicted by ++the virtual deadline mechanism that manages runqueues. The prime concern ++in this design is to maintain fairness at all costs determined by nice level, ++yet to maintain as good interactivity as can be allowed within the ++constraints of strict fairness. ++ ++ ++Design description ++================== ++ ++SD works off the principle of providing each task a quota of runtime that it is ++allowed to run at a number of priority levels determined by its static priority ++(ie. its nice level). If the task uses up its quota it has its priority ++decremented to the next level determined by a priority matrix. Once every ++runtime quota has been consumed of every priority level, a task is queued on the ++"expired" array. When no other tasks exist with quota, the expired array is ++activated and fresh quotas are handed out. This is all done in O(1). ++ ++Design details ++============== ++ ++Each task keeps a record of its own entitlement of cpu time. Most of the rest of ++these details apply to non-realtime tasks as rt task management is straight ++forward. ++ ++Each runqueue keeps a record of what major epoch it is up to in the ++rq->prio_rotation field which is incremented on each major epoch. It also ++keeps a record of the current prio_level for each static priority task. ++ ++Each task keeps a record of what major runqueue epoch it was last running ++on in p->rotation. 
It also keeps a record of what priority levels it has ++already been allocated quota from during this epoch in a bitmap p->bitmap. ++ ++The only tunable that determines all other details is the RR_INTERVAL. This ++is set to 8ms, and is scaled gently upwards with more cpus. This value is ++tunable via a /proc interface. ++ ++All tasks are initially given a quota based on RR_INTERVAL. This is equal to ++RR_INTERVAL between nice values of -6 and 0, half that size above nice 0, and ++progressively larger for nice values from -1 to -20. This is assigned to ++p->quota and only changes with changes in nice level. ++ ++As a task is first queued, it checks in recalc_task_prio to see if it has run at ++this runqueue's current priority rotation. If it has not, it will have its ++p->prio level set according to the first slot in a "priority matrix" and will be ++given a p->time_slice equal to the p->quota, and has its allocation bitmap bit ++set in p->bitmap for this prio level. It is then queued on the current active ++priority array. ++ ++If a task has already been running during this major epoch, and it has ++p->time_slice left and the rq->prio_quota for the task's p->prio still ++has quota, it will be placed back on the active array, but no more quota ++will be added. ++ ++If a task has been running during this major epoch, but does not have ++p->time_slice left, it will find the next lowest priority in its bitmap that it ++has not been allocated quota from. It then gets the a full quota in ++p->time_slice. It is then queued on the current active priority array at the ++newly determined lower priority. ++ ++If a task has been running during this major epoch, and does not have ++any entitlement left in p->bitmap and no time_slice left, it will have its ++bitmap cleared, and be queued at its best prio again, but on the expired ++priority array. ++ ++When a task is queued, it has its relevant bit set in the array->prio_bitmap. ++ ++p->time_slice is stored in nanosconds and is updated via update_cpu_clock on ++schedule() and scheduler_tick. If p->time_slice is below zero then the ++recalc_task_prio is readjusted and the task rescheduled. ++ ++ ++Priority Matrix ++=============== ++ ++In order to minimise the latencies between tasks of different nice levels ++running concurrently, the dynamic priority slots where different nice levels ++are queued are dithered instead of being sequential. What this means is that ++there are 40 priority slots where a task may run during one major rotation, ++and the allocation of slots is dependant on nice level. In the ++following table, a zero represents a slot where the task may run. ++ ++PRIORITY:0..................20.................39 ++nice -20 0000000000000000000000000000000000000000 ++nice -10 1000100010001000100010001000100010010000 ++nice 0 1010101010101010101010101010101010101010 ++nice 5 1011010110110101101101011011010110110110 ++nice 10 1110111011101110111011101110111011101110 ++nice 15 1111111011111110111111101111111011111110 ++nice 19 1111111111111111111111111111111111111110 ++ ++As can be seen, a nice -20 task runs in every priority slot whereas a nice 19 ++task only runs one slot per major rotation. This dithered table allows for the ++smallest possible maximum latencies between tasks of varying nice levels, thus ++allowing vastly different nice levels to be used. ++ ++SCHED_BATCH tasks are managed slightly differently, receiving only the top ++slots from its priority bitmap giving it equal cpu as SCHED_NORMAL, but ++slightly higher latencies. 
++ ++ ++Modelling deadline behaviour ++============================ ++ ++As the accounting in this design is hard and not modified by sleep average ++calculations or interactivity modifiers, it is possible to accurately ++predict the maximum latency that a task may experience under different ++conditions. This is a virtual deadline mechanism enforced by mandatory ++timeslice expiration and not outside bandwidth measurement. ++ ++The maximum duration a task can run during one major epoch is determined by its ++nice value. Nice 0 tasks can run at 19 different priority levels for RR_INTERVAL ++duration during each epoch. Nice 10 tasks can run at 9 priority levels for each ++epoch, and so on. The table in the priority matrix above demonstrates how this ++is enforced. ++ ++Therefore the maximum duration a runqueue epoch can take is determined by ++the number of tasks running, and their nice level. After that, the maximum ++duration it can take before a task can wait before it get scheduled is ++determined by the position of its first slot on the matrix. ++ ++In the following examples, these are _worst case scenarios_ and would rarely ++occur, but can be modelled nonetheless to determine the maximum possible ++latency. ++ ++So for example, if two nice 0 tasks are running, and one has just expired as ++another is activated for the first time receiving a full quota for this ++runqueue rotation, the first task will wait: ++ ++nr_tasks * max_duration + nice_difference * rr_interval ++1 * 19 * RR_INTERVAL + 0 = 152ms ++ ++In the presence of a nice 10 task, a nice 0 task would wait a maximum of ++1 * 10 * RR_INTERVAL + 0 = 80ms ++ ++In the presence of a nice 0 task, a nice 10 task would wait a maximum of ++1 * 19 * RR_INTERVAL + 1 * RR_INTERVAL = 160ms ++ ++More useful than these values, though, are the average latencies which are ++a matter of determining the average distance between priority slots of ++different nice values and multiplying them by the tasks' quota. For example ++in the presence of a nice -10 task, a nice 0 task will wait either one or ++two slots. Given that nice -10 tasks have a quota 2.5 times the RR_INTERVAL, ++this means the latencies will alternate between 2.5 and 5 RR_INTERVALs or ++20 and 40ms respectively (on uniprocessor at 1000HZ). ++ ++ ++Achieving interactivity ++======================= ++ ++A requirement of this scheduler design was to achieve good interactivity ++despite being a completely fair deadline based design. The disadvantage of ++designs that try to achieve interactivity is that they usually do so at ++the expense of maintaining fairness. As cpu speeds increase, the requirement ++for some sort of metered unfairness towards interactive tasks becomes a less ++desirable phenomenon, but low latency and fairness remains mandatory to ++good interactive performance. ++ ++This design relies on the fact that interactive tasks, by their nature, ++sleep often. Most fair scheduling designs end up penalising such tasks ++indirectly giving them less than their fair possible share because of the ++sleep, and have to use a mechanism of bonusing their priority to offset ++this based on the duration they sleep. This becomes increasingly inaccurate ++as the number of running tasks rises and more tasks spend time waiting on ++runqueues rather than sleeping, and it is impossible to tell whether the ++task that's waiting on a runqueue only intends to run for a short period and ++then sleep again after than runqueue wait. 
Furthermore, all such designs rely ++on a period of time to pass to accumulate some form of statistic on the task ++before deciding on how much to give them preference. The shorter this period, ++the more rapidly bursts of cpu ruin the interactive tasks behaviour. The ++longer this period, the longer it takes for interactive tasks to get low ++scheduling latencies and fair cpu. ++ ++This design does not measure sleep time at all. Interactive tasks that sleep ++often will wake up having consumed very little if any of their quota for ++the current major priority rotation. The longer they have slept, the less ++likely they are to even be on the current major priority rotation. Once ++woken up, though, they get to use up a their full quota for that epoch, ++whether part of a quota remains or a full quota. Overall, however, they ++can still only run as much cpu time for that epoch as any other task of the ++same nice level. This means that two tasks behaving completely differently ++from fully cpu bound to waking/sleeping extremely frequently will still ++get the same quota of cpu, but the latter will be using its quota for that ++epoch in bursts rather than continuously. This guarantees that interactive ++tasks get the same amount of cpu as cpu bound ones. ++ ++The other requirement of interactive tasks is also to obtain low latencies ++for when they are scheduled. Unlike fully cpu bound tasks and the maximum ++latencies possible described in the modelling deadline behaviour section ++above, tasks that sleep will wake up with quota available usually at the ++current runqueue's priority_level or better. This means that the most latency ++they are likely to see is one RR_INTERVAL, and often they will preempt the ++current task if it is not of a sleeping nature. This then guarantees very ++low latency for interactive tasks, and the lowest latencies for the least ++cpu bound tasks. 
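A side effect of the new policy range worth illustrating: in the sched_setscheduler() hunk earlier in this patch, a realtime policy request from a task without CAP_SYS_NICE is no longer refused but silently downgraded to SCHED_ISO with priority 0. A minimal user-space sketch of that behaviour (illustrative only; the printed policy number is whatever the patched headers assign to SCHED_ISO, which is deliberately not assumed here):

    #include <sched.h>
    #include <stdio.h>

    int main(void)
    {
        struct sched_param sp = { .sched_priority = 50 };

        /* Unprivileged: a stock kernel would fail this with EPERM.
         * With this patch applied the request is downgraded to
         * SCHED_ISO (priority 0) and the call succeeds. */
        if (sched_setscheduler(0, SCHED_FIFO, &sp) != 0)
            perror("sched_setscheduler");
        else
            printf("policy is now %d\n", sched_getscheduler(0));
        return 0;
    }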
++ ++ ++Fri, 4 May 2007 ++Con Kolivas +Index: linux-2.6.21-ck1/kernel/softirq.c +=================================================================== +--- linux-2.6.21-ck1.orig/kernel/softirq.c 2007-05-04 12:10:52.000000000 +1000 ++++ linux-2.6.21-ck1/kernel/softirq.c 2007-05-04 12:10:54.000000000 +1000 +@@ -488,7 +488,7 @@ void __init softirq_init(void) + + static int ksoftirqd(void * __bind_cpu) + { +- set_user_nice(current, 19); ++ set_user_nice(current, 15); + current->flags |= PF_NOFREEZE; + + set_current_state(TASK_INTERRUPTIBLE); +Index: linux-2.6.21-ck1/kernel/fork.c +=================================================================== +--- linux-2.6.21-ck1.orig/kernel/fork.c 2007-05-04 12:24:01.000000000 +1000 ++++ linux-2.6.21-ck1/kernel/fork.c 2007-05-04 12:24:19.000000000 +1000 +@@ -1060,6 +1060,7 @@ static struct task_struct *copy_process( + p->io_context = NULL; + p->io_wait = NULL; + p->audit_context = NULL; ++ p->mutexes_held = 0; + cpuset_fork(p); + #ifdef CONFIG_NUMA + p->mempolicy = mpol_copy(p->mempolicy); +Index: linux-2.6.21-ck1/kernel/mutex.c +=================================================================== +--- linux-2.6.21-ck1.orig/kernel/mutex.c 2007-05-04 12:24:01.000000000 +1000 ++++ linux-2.6.21-ck1/kernel/mutex.c 2007-05-04 12:24:19.000000000 +1000 +@@ -60,6 +60,16 @@ EXPORT_SYMBOL(__mutex_init); + static void fastcall noinline __sched + __mutex_lock_slowpath(atomic_t *lock_count); + ++static inline void inc_mutex_count(void) ++{ ++ current->mutexes_held++; ++} ++ ++static inline void dec_mutex_count(void) ++{ ++ current->mutexes_held--; ++} ++ + /*** + * mutex_lock - acquire the mutex + * @lock: the mutex to be acquired +@@ -89,6 +99,7 @@ void inline fastcall __sched mutex_lock( + * 'unlocked' into 'locked' state. + */ + __mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath); ++ inc_mutex_count(); + } + + EXPORT_SYMBOL(mutex_lock); +@@ -114,6 +125,7 @@ void fastcall __sched mutex_unlock(struc + * into 'unlocked' state: + */ + __mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath); ++ dec_mutex_count(); + } + + EXPORT_SYMBOL(mutex_unlock); +@@ -283,9 +295,14 @@ __mutex_lock_interruptible_slowpath(atom + */ + int fastcall __sched mutex_lock_interruptible(struct mutex *lock) + { ++ int ret; ++ + might_sleep(); +- return __mutex_fastpath_lock_retval ++ ret = __mutex_fastpath_lock_retval + (&lock->count, __mutex_lock_interruptible_slowpath); ++ if (likely(!ret)) ++ inc_mutex_count(); ++ return ret; + } + + EXPORT_SYMBOL(mutex_lock_interruptible); +@@ -340,8 +357,12 @@ static inline int __mutex_trylock_slowpa + */ + int fastcall __sched mutex_trylock(struct mutex *lock) + { +- return __mutex_fastpath_trylock(&lock->count, ++ int ret = __mutex_fastpath_trylock(&lock->count, + __mutex_trylock_slowpath); ++ ++ if (likely(ret)) ++ inc_mutex_count(); ++ return ret; + } + + EXPORT_SYMBOL(mutex_trylock); +Index: linux-2.6.21-ck1/block/cfq-iosched.c +=================================================================== +--- linux-2.6.21-ck1.orig/block/cfq-iosched.c 2007-05-04 12:24:01.000000000 +1000 ++++ linux-2.6.21-ck1/block/cfq-iosched.c 2007-05-04 12:24:19.000000000 +1000 +@@ -1258,10 +1258,12 @@ static void cfq_init_prio_data(struct cf + printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class); + case IOPRIO_CLASS_NONE: + /* +- * no prio set, place us in the middle of the BE classes ++ * Select class and ioprio according to policy and nice + */ ++ cfqq->ioprio_class = task_policy_ioprio_class(tsk); + cfqq->ioprio = task_nice_ioprio(tsk); +- cfqq->ioprio_class = 
IOPRIO_CLASS_BE; ++ if (cfqq->ioprio_class == IOPRIO_CLASS_IDLE) ++ cfq_clear_cfqq_idle_window(cfqq); + break; + case IOPRIO_CLASS_RT: + cfqq->ioprio = task_ioprio(tsk); +Index: linux-2.6.21-ck1/include/linux/ioprio.h +=================================================================== +--- linux-2.6.21-ck1.orig/include/linux/ioprio.h 2007-05-04 12:24:01.000000000 +1000 ++++ linux-2.6.21-ck1/include/linux/ioprio.h 2007-05-04 12:24:19.000000000 +1000 +@@ -22,7 +22,7 @@ + * class, the default for any process. IDLE is the idle scheduling class, it + * is only served when no one else is using the disk. + */ +-enum { ++enum ioprio_class { + IOPRIO_CLASS_NONE, + IOPRIO_CLASS_RT, + IOPRIO_CLASS_BE, +@@ -51,8 +51,25 @@ static inline int task_ioprio(struct tas + return IOPRIO_PRIO_DATA(task->ioprio); + } + ++static inline enum ioprio_class ++ task_policy_ioprio_class(struct task_struct *task) ++{ ++ if (rt_task(task)) ++ return IOPRIO_CLASS_RT; ++ if (idleprio_task(task)) ++ return IOPRIO_CLASS_IDLE; ++ return IOPRIO_CLASS_BE; ++} ++ + static inline int task_nice_ioprio(struct task_struct *task) + { ++ if (rt_task(task)) ++ return (MAX_RT_PRIO - task->rt_priority) * IOPRIO_BE_NR / ++ (MAX_RT_PRIO + 1); ++ if (iso_task(task)) ++ return 0; ++ if (idleprio_task(task)) ++ return IOPRIO_BE_NR - 1; + return (task_nice(task) + 20) / 5; + } + +Index: linux-2.6.21-ck1/Documentation/sysctl/vm.txt +=================================================================== +--- linux-2.6.21-ck1.orig/Documentation/sysctl/vm.txt 2007-05-04 12:24:01.000000000 +1000 ++++ linux-2.6.21-ck1/Documentation/sysctl/vm.txt 2007-05-04 12:24:21.000000000 +1000 +@@ -22,6 +22,8 @@ Currently, these files are in /proc/sys/ + - dirty_background_ratio + - dirty_expire_centisecs + - dirty_writeback_centisecs ++- hardmaplimit ++- mapped + - max_map_count + - min_free_kbytes + - laptop_mode +@@ -31,12 +33,13 @@ Currently, these files are in /proc/sys/ + - min_unmapped_ratio + - min_slab_ratio + - panic_on_oom ++- swap_prefetch + + ============================================================== + + dirty_ratio, dirty_background_ratio, dirty_expire_centisecs, + dirty_writeback_centisecs, vfs_cache_pressure, laptop_mode, +-block_dump, swap_token_timeout, drop-caches: ++block_dump, swap_token_timeout, drop-caches, tail_largefiles: + + See Documentation/filesystems/proc.txt + +@@ -86,6 +89,27 @@ for swap because we only cluster swap da + + ============================================================== + ++hardmaplimit: ++ ++This flag makes the vm adhere to the mapped value as closely as possible ++except in the most extreme vm stress where doing so would provoke an out ++of memory condition (see mapped below). ++ ++Enabled by default. ++ ++============================================================== ++ ++mapped: ++ ++This is the percentage ram that is filled with mapped pages (applications) ++before the vm will start reclaiming mapped pages by moving them to swap. ++It is altered by the relative stress of the vm at the time so is not ++strictly adhered to to prevent provoking out of memory kills. ++ ++Set to 66 by default. ++ ++============================================================== ++ + max_map_count: + + This file contains the maximum number of memory map areas a process +@@ -205,3 +229,14 @@ rather than killing rogue processes, set + + The default value is 0. + ++============================================================== ++ ++swap_prefetch ++ ++This enables or disables the swap prefetching feature. 
When the virtual ++memory subsystem has been extremely idle for at least 5 seconds it will start ++copying back pages from swap into the swapcache and keep a copy in swap. In ++practice it can take many minutes before the vm is idle enough. ++ ++The default value is 1. ++ +Index: linux-2.6.21-ck1/include/linux/swap.h +=================================================================== +--- linux-2.6.21-ck1.orig/include/linux/swap.h 2007-05-04 12:24:01.000000000 +1000 ++++ linux-2.6.21-ck1/include/linux/swap.h 2007-05-04 12:24:20.000000000 +1000 +@@ -180,6 +180,7 @@ extern unsigned int nr_free_pagecache_pa + /* linux/mm/swap.c */ + extern void FASTCALL(lru_cache_add(struct page *)); + extern void FASTCALL(lru_cache_add_active(struct page *)); ++extern void FASTCALL(lru_cache_add_tail(struct page *)); + extern void FASTCALL(activate_page(struct page *)); + extern void FASTCALL(mark_page_accessed(struct page *)); + extern void lru_add_drain(void); +@@ -188,9 +189,11 @@ extern int rotate_reclaimable_page(struc + extern void swap_setup(void); + + /* linux/mm/vmscan.c */ +-extern unsigned long try_to_free_pages(struct zone **, gfp_t); ++extern unsigned long try_to_free_pages(struct zone **, gfp_t, ++ struct task_struct *p); + extern unsigned long shrink_all_memory(unsigned long nr_pages); +-extern int vm_swappiness; ++extern int vm_mapped; ++extern int vm_hardmaplimit; + extern int remove_mapping(struct address_space *mapping, struct page *page); + extern long vm_total_pages; + +@@ -237,6 +240,7 @@ extern void free_pages_and_swap_cache(st + extern struct page * lookup_swap_cache(swp_entry_t); + extern struct page * read_swap_cache_async(swp_entry_t, struct vm_area_struct *vma, + unsigned long addr); ++extern int add_to_swap_cache(struct page *page, swp_entry_t entry); + /* linux/mm/swapfile.c */ + extern long total_swap_pages; + extern unsigned int nr_swapfiles; +Index: linux-2.6.21-ck1/init/Kconfig +=================================================================== +--- linux-2.6.21-ck1.orig/init/Kconfig 2007-05-04 12:24:01.000000000 +1000 ++++ linux-2.6.21-ck1/init/Kconfig 2007-05-04 12:24:20.000000000 +1000 +@@ -101,6 +101,28 @@ config SWAP + used to provide more virtual memory than the actual RAM present + in your computer. If unsure say Y. + ++config SWAP_PREFETCH ++ bool "Support for prefetching swapped memory" ++ depends on SWAP ++ default y ++ ---help--- ++ This option will allow the kernel to prefetch swapped memory pages ++ when idle. The pages will be kept on both swap and in swap_cache ++ thus avoiding the need for further I/O if either ram or swap space ++ is required. ++ ++ What this will do on workstations is slowly bring back applications ++ that have swapped out after memory intensive workloads back into ++ physical ram if you have free ram at a later stage and the machine ++ is relatively idle. This means that when you come back to your ++ computer after leaving it idle for a while, applications will come ++ to life faster. Note that your swap usage will appear to increase ++ but these are cached pages, can be dropped freely by the vm, and it ++ should stabilise around 50% swap usage maximum. ++ ++ Workstations and multiuser workstation servers will most likely want ++ to say Y. 
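At runtime the feature is controlled by the swap_prefetch entry added to vm_table earlier in this patch, which appears as /proc/sys/vm/swap_prefetch. A minimal sketch for switching it off temporarily without a rebuild (illustrative only; assumes the patched kernel and sufficient privileges):

    #include <stdio.h>

    int main(void)
    {
        /* 1 = prefetch when idle (default), 0 = disabled. */
        FILE *f = fopen("/proc/sys/vm/swap_prefetch", "w");

        if (!f) {
            perror("/proc/sys/vm/swap_prefetch");
            return 1;
        }
        fprintf(f, "0\n");
        return fclose(f) ? 1 : 0;
    }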
++ + config SYSVIPC + bool "System V IPC" + ---help--- +Index: linux-2.6.21-ck1/mm/Makefile +=================================================================== +--- linux-2.6.21-ck1.orig/mm/Makefile 2007-05-04 12:24:01.000000000 +1000 ++++ linux-2.6.21-ck1/mm/Makefile 2007-05-04 12:24:20.000000000 +1000 +@@ -17,6 +17,7 @@ ifeq ($(CONFIG_MMU)$(CONFIG_BLOCK),yy) + obj-y += bounce.o + endif + obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o ++obj-$(CONFIG_SWAP_PREFETCH) += swap_prefetch.o + obj-$(CONFIG_HUGETLBFS) += hugetlb.o + obj-$(CONFIG_NUMA) += mempolicy.o + obj-$(CONFIG_SPARSEMEM) += sparse.o +Index: linux-2.6.21-ck1/mm/swap.c +=================================================================== +--- linux-2.6.21-ck1.orig/mm/swap.c 2007-05-04 12:24:01.000000000 +1000 ++++ linux-2.6.21-ck1/mm/swap.c 2007-05-04 12:24:21.000000000 +1000 +@@ -17,6 +17,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -176,6 +177,7 @@ EXPORT_SYMBOL(mark_page_accessed); + */ + static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, }; + static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, }; ++static DEFINE_PER_CPU(struct pagevec, lru_add_tail_pvecs) = { 0, }; + + void fastcall lru_cache_add(struct page *page) + { +@@ -197,6 +199,31 @@ void fastcall lru_cache_add_active(struc + put_cpu_var(lru_add_active_pvecs); + } + ++static void __pagevec_lru_add_tail(struct pagevec *pvec) ++{ ++ int i; ++ struct zone *zone = NULL; ++ ++ for (i = 0; i < pagevec_count(pvec); i++) { ++ struct page *page = pvec->pages[i]; ++ struct zone *pagezone = page_zone(page); ++ ++ if (pagezone != zone) { ++ if (zone) ++ spin_unlock_irq(&zone->lru_lock); ++ zone = pagezone; ++ spin_lock_irq(&zone->lru_lock); ++ } ++ BUG_ON(PageLRU(page)); ++ SetPageLRU(page); ++ add_page_to_inactive_list_tail(zone, page); ++ } ++ if (zone) ++ spin_unlock_irq(&zone->lru_lock); ++ release_pages(pvec->pages, pvec->nr, pvec->cold); ++ pagevec_reinit(pvec); ++} ++ + static void __lru_add_drain(int cpu) + { + struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu); +@@ -207,6 +234,9 @@ static void __lru_add_drain(int cpu) + pvec = &per_cpu(lru_add_active_pvecs, cpu); + if (pagevec_count(pvec)) + __pagevec_lru_add_active(pvec); ++ pvec = &per_cpu(lru_add_tail_pvecs, cpu); ++ if (pagevec_count(pvec)) ++ __pagevec_lru_add_tail(pvec); + } + + void lru_add_drain(void) +@@ -403,6 +433,20 @@ void __pagevec_lru_add_active(struct pag + } + + /* ++ * Function used uniquely to put pages back to the lru at the end of the ++ * inactive list to preserve the lru order. 
++ */ ++void fastcall lru_cache_add_tail(struct page *page) ++{ ++ struct pagevec *pvec = &get_cpu_var(lru_add_tail_pvecs); ++ ++ page_cache_get(page); ++ if (!pagevec_add(pvec, page)) ++ __pagevec_lru_add_tail(pvec); ++ put_cpu_var(lru_add_pvecs); ++} ++ ++/* + * Try to drop buffers from the pages in a pagevec + */ + void pagevec_strip(struct pagevec *pvec) +@@ -514,6 +558,9 @@ void __init swap_setup(void) + * Right now other parts of the system means that we + * _really_ don't want to cluster much more + */ ++ ++ prepare_swap_prefetch(); ++ + #ifdef CONFIG_HOTPLUG_CPU + hotcpu_notifier(cpu_swap_callback, 0); + #endif +Index: linux-2.6.21-ck1/mm/swap_prefetch.c +=================================================================== +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ linux-2.6.21-ck1/mm/swap_prefetch.c 2007-05-04 12:24:20.000000000 +1000 +@@ -0,0 +1,581 @@ ++/* ++ * linux/mm/swap_prefetch.c ++ * ++ * Copyright (C) 2005-2006 Con Kolivas ++ * ++ * Written by Con Kolivas ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * Time to delay prefetching if vm is busy or prefetching unsuccessful. There ++ * needs to be at least this duration of idle time meaning in practice it can ++ * be much longer ++ */ ++#define PREFETCH_DELAY (HZ * 5) ++ ++/* sysctl - enable/disable swap prefetching */ ++int swap_prefetch __read_mostly = 1; ++ ++struct swapped_root { ++ unsigned long busy; /* vm busy */ ++ spinlock_t lock; /* protects all data */ ++ struct list_head list; /* MRU list of swapped pages */ ++ struct radix_tree_root swap_tree; /* Lookup tree of pages */ ++ unsigned int count; /* Number of entries */ ++ unsigned int maxcount; /* Maximum entries allowed */ ++ struct kmem_cache *cache; /* Of struct swapped_entry */ ++}; ++ ++static struct swapped_root swapped = { ++ .lock = SPIN_LOCK_UNLOCKED, ++ .list = LIST_HEAD_INIT(swapped.list), ++ .swap_tree = RADIX_TREE_INIT(GFP_ATOMIC), ++}; ++ ++static struct task_struct *kprefetchd_task; ++ ++/* ++ * We check to see no part of the vm is busy. If it is this will interrupt ++ * trickle_swap and wait another PREFETCH_DELAY. Purposefully racy. ++ */ ++inline void delay_swap_prefetch(void) ++{ ++ if (!test_bit(0, &swapped.busy)) ++ __set_bit(0, &swapped.busy); ++} ++ ++/* ++ * Drop behind accounting which keeps a list of the most recently used swap ++ * entries. ++ */ ++void add_to_swapped_list(struct page *page) ++{ ++ struct swapped_entry *entry; ++ unsigned long index, flags; ++ int wakeup; ++ ++ if (!swap_prefetch) ++ return; ++ ++ wakeup = 0; ++ ++ spin_lock_irqsave(&swapped.lock, flags); ++ if (swapped.count >= swapped.maxcount) { ++ /* ++ * We limit the number of entries to 2/3 of physical ram. ++ * Once the number of entries exceeds this we start removing ++ * the least recently used entries. 
++ */ ++ entry = list_entry(swapped.list.next, ++ struct swapped_entry, swapped_list); ++ radix_tree_delete(&swapped.swap_tree, entry->swp_entry.val); ++ list_del(&entry->swapped_list); ++ swapped.count--; ++ } else { ++ entry = kmem_cache_alloc(swapped.cache, GFP_ATOMIC); ++ if (unlikely(!entry)) ++ /* bad, can't allocate more mem */ ++ goto out_locked; ++ } ++ ++ index = page_private(page); ++ entry->swp_entry.val = index; ++ /* ++ * On numa we need to store the node id to ensure that we prefetch to ++ * the same node it came from. ++ */ ++ store_swap_entry_node(entry, page); ++ ++ if (likely(!radix_tree_insert(&swapped.swap_tree, index, entry))) { ++ /* ++ * If this is the first entry, kprefetchd needs to be ++ * (re)started. ++ */ ++ if (!swapped.count) ++ wakeup = 1; ++ list_add(&entry->swapped_list, &swapped.list); ++ swapped.count++; ++ } ++ ++out_locked: ++ spin_unlock_irqrestore(&swapped.lock, flags); ++ ++ /* Do the wakeup outside the lock to shorten lock hold time. */ ++ if (wakeup) ++ wake_up_process(kprefetchd_task); ++ ++ return; ++} ++ ++/* ++ * Removes entries from the swapped_list. The radix tree allows us to quickly ++ * look up the entry from the index without having to iterate over the whole ++ * list. ++ */ ++void remove_from_swapped_list(const unsigned long index) ++{ ++ struct swapped_entry *entry; ++ unsigned long flags; ++ ++ if (list_empty(&swapped.list)) ++ return; ++ ++ spin_lock_irqsave(&swapped.lock, flags); ++ entry = radix_tree_delete(&swapped.swap_tree, index); ++ if (likely(entry)) { ++ list_del_init(&entry->swapped_list); ++ swapped.count--; ++ kmem_cache_free(swapped.cache, entry); ++ } ++ spin_unlock_irqrestore(&swapped.lock, flags); ++} ++ ++enum trickle_return { ++ TRICKLE_SUCCESS, ++ TRICKLE_FAILED, ++ TRICKLE_DELAY, ++}; ++ ++struct node_stats { ++ unsigned long last_free; ++ /* Free ram after a cycle of prefetching */ ++ unsigned long current_free; ++ /* Free ram on this cycle of checking prefetch_suitable */ ++ unsigned long prefetch_watermark; ++ /* Maximum amount we will prefetch to */ ++ unsigned long highfree[MAX_NR_ZONES]; ++ /* The amount of free ram before we start prefetching */ ++ unsigned long lowfree[MAX_NR_ZONES]; ++ /* The amount of free ram where we will stop prefetching */ ++ unsigned long *pointfree[MAX_NR_ZONES]; ++ /* highfree or lowfree depending on whether we've hit a watermark */ ++}; ++ ++/* ++ * prefetch_stats stores the free ram data of each node and this is used to ++ * determine if a node is suitable for prefetching into. ++ */ ++struct prefetch_stats { ++ nodemask_t prefetch_nodes; ++ /* Which nodes are currently suited to prefetching */ ++ unsigned long prefetched_pages; ++ /* Total pages we've prefetched on this wakeup of kprefetchd */ ++ struct node_stats node[MAX_NUMNODES]; ++}; ++ ++static struct prefetch_stats sp_stat; ++ ++/* ++ * This tries to read a swp_entry_t into swap cache for swap prefetching. ++ * If it returns TRICKLE_DELAY we should delay further prefetching. ++ */ ++static enum trickle_return trickle_swap_cache_async(const swp_entry_t entry, ++ const int node) ++{ ++ enum trickle_return ret = TRICKLE_FAILED; ++ struct page *page; ++ ++ read_lock_irq(&swapper_space.tree_lock); ++ /* Entry may already exist */ ++ page = radix_tree_lookup(&swapper_space.page_tree, entry.val); ++ read_unlock_irq(&swapper_space.tree_lock); ++ if (page) { ++ remove_from_swapped_list(entry.val); ++ goto out; ++ } ++ ++ /* ++ * Get a new page to read from swap. 
We have already checked the ++ * watermarks so __alloc_pages will not call on reclaim. ++ */ ++ page = alloc_pages_node(node, GFP_HIGHUSER & ~__GFP_WAIT, 0); ++ if (unlikely(!page)) { ++ ret = TRICKLE_DELAY; ++ goto out; ++ } ++ ++ if (add_to_swap_cache(page, entry)) { ++ /* Failed to add to swap cache */ ++ goto out_release; ++ } ++ ++ /* Add them to the tail of the inactive list to preserve LRU order */ ++ lru_cache_add_tail(page); ++ if (unlikely(swap_readpage(NULL, page))) { ++ ret = TRICKLE_DELAY; ++ goto out_release; ++ } ++ ++ sp_stat.prefetched_pages++; ++ sp_stat.node[node].last_free--; ++ ++ ret = TRICKLE_SUCCESS; ++out_release: ++ page_cache_release(page); ++out: ++ return ret; ++} ++ ++static void clear_last_prefetch_free(void) ++{ ++ int node; ++ ++ /* ++ * Reset the nodes suitable for prefetching to all nodes. We could ++ * update the data to take into account memory hotplug if desired.. ++ */ ++ sp_stat.prefetch_nodes = node_online_map; ++ for_each_node_mask(node, sp_stat.prefetch_nodes) { ++ struct node_stats *ns = &sp_stat.node[node]; ++ ++ ns->last_free = 0; ++ } ++} ++ ++static void clear_current_prefetch_free(void) ++{ ++ int node; ++ ++ sp_stat.prefetch_nodes = node_online_map; ++ for_each_node_mask(node, sp_stat.prefetch_nodes) { ++ struct node_stats *ns = &sp_stat.node[node]; ++ ++ ns->current_free = 0; ++ } ++} ++ ++/* ++ * This updates the high and low watermarks of amount of free ram in each ++ * node used to start and stop prefetching. We prefetch from pages_high * 4 ++ * down to pages_high * 3. ++ */ ++static void examine_free_limits(void) ++{ ++ struct zone *z; ++ ++ for_each_zone(z) { ++ struct node_stats *ns; ++ int idx; ++ ++ if (!populated_zone(z)) ++ continue; ++ ++ ns = &sp_stat.node[z->zone_pgdat->node_id]; ++ idx = zone_idx(z); ++ ns->lowfree[idx] = z->pages_high * 3; ++ ns->highfree[idx] = ns->lowfree[idx] + z->pages_high; ++ ++ if (zone_page_state(z, NR_FREE_PAGES) > ns->highfree[idx]) { ++ /* ++ * We've gotten above the high watermark of free pages ++ * so we can start prefetching till we get to the low ++ * watermark. ++ */ ++ ns->pointfree[idx] = &ns->lowfree[idx]; ++ } ++ } ++} ++ ++/* ++ * We want to be absolutely certain it's ok to start prefetching. ++ */ ++static int prefetch_suitable(void) ++{ ++ unsigned long limit; ++ struct zone *z; ++ int node, ret = 0, test_pagestate = 0; ++ ++ /* Purposefully racy */ ++ if (test_bit(0, &swapped.busy)) { ++ __clear_bit(0, &swapped.busy); ++ goto out; ++ } ++ ++ /* ++ * get_page_state and above_background_load are expensive so we only ++ * perform them every SWAP_CLUSTER_MAX prefetched_pages. ++ * We test to see if we're above_background_load as disk activity ++ * even at low priority can cause interrupt induced scheduling ++ * latencies. ++ */ ++ if (!(sp_stat.prefetched_pages % SWAP_CLUSTER_MAX)) { ++ if (above_background_load()) ++ goto out; ++ test_pagestate = 1; ++ } ++ ++ clear_current_prefetch_free(); ++ ++ /* ++ * Have some hysteresis between where page reclaiming and prefetching ++ * will occur to prevent ping-ponging between them. ++ */ ++ for_each_zone(z) { ++ struct node_stats *ns; ++ unsigned long free; ++ int idx; ++ ++ if (!populated_zone(z)) ++ continue; ++ ++ node = z->zone_pgdat->node_id; ++ ns = &sp_stat.node[node]; ++ idx = zone_idx(z); ++ ++ free = zone_page_state(z, NR_FREE_PAGES); ++ if (free < *ns->pointfree[idx]) { ++ /* ++ * Free pages have dropped below the low watermark so ++ * we won't start prefetching again till we hit the ++ * high watermark of free pages. 
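The watermark handling above is a hysteresis loop: examine_free_limits() arms prefetching once a zone's free memory climbs past roughly pages_high * 4, and prefetch_suitable() disarms it again when free memory falls under pages_high * 3, by flipping the pointfree pointer between the two limits. A small userspace sketch of that toggle, with the two functions condensed into one predicate and made-up page counts; nothing beyond what the hunks show is assumed.

    #include <stdio.h>

    struct zone_state {
        unsigned long lowfree;       /* stop prefetching below this */
        unsigned long highfree;      /* restart only above this */
        unsigned long *pointfree;    /* whichever limit is currently in force */
    };

    static int may_prefetch(struct zone_state *z, unsigned long free)
    {
        if (free < *z->pointfree) {
            /* Dropped below the active limit: now demand the high mark. */
            z->pointfree = &z->highfree;
            return 0;
        }
        /* At or above the active limit: relax back to the low mark. */
        z->pointfree = &z->lowfree;
        return 1;
    }

    int main(void)
    {
        unsigned long pages_high = 100;
        struct zone_state z = {
            .lowfree  = pages_high * 3,                 /* 300 */
            .highfree = pages_high * 3 + pages_high,    /* 400 */
        };
        unsigned long samples[] = { 450, 350, 290, 350, 390, 410, 350 };
        unsigned int i;

        z.pointfree = &z.highfree;                      /* conservative start */
        for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
            printf("free=%lu -> %s\n", samples[i],
                   may_prefetch(&z, samples[i]) ? "prefetch" : "hold off");
        return 0;
    }

Running it shows prefetching staying off through the 350 and 390 samples after the dip below 300, and only resuming once free pages cross the 400 mark again, which is the ping-pong avoidance the comments describe.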
++ */ ++ ns->pointfree[idx] = &ns->highfree[idx]; ++ node_clear(node, sp_stat.prefetch_nodes); ++ continue; ++ } ++ ns->current_free += free; ++ } ++ ++ /* ++ * We iterate over each node testing to see if it is suitable for ++ * prefetching and clear the nodemask if it is not. ++ */ ++ for_each_node_mask(node, sp_stat.prefetch_nodes) { ++ struct node_stats *ns = &sp_stat.node[node]; ++ ++ /* ++ * We check to see that pages are not being allocated ++ * elsewhere at any significant rate implying any ++ * degree of memory pressure (eg during file reads) ++ */ ++ if (ns->last_free) { ++ if (ns->current_free + SWAP_CLUSTER_MAX < ++ ns->last_free) { ++ ns->last_free = ns->current_free; ++ node_clear(node, ++ sp_stat.prefetch_nodes); ++ continue; ++ } ++ } else ++ ns->last_free = ns->current_free; ++ ++ if (!test_pagestate) ++ continue; ++ ++ /* We shouldn't prefetch when we are doing writeback */ ++ if (node_page_state(node, NR_WRITEBACK)) { ++ node_clear(node, sp_stat.prefetch_nodes); ++ continue; ++ } ++ ++ /* ++ * >2/3 of the ram on this node is mapped, slab, swapcache or ++ * dirty, we need to leave some free for pagecache. ++ */ ++ limit = node_page_state(node, NR_FILE_PAGES); ++ limit += node_page_state(node, NR_SLAB_RECLAIMABLE); ++ limit += node_page_state(node, NR_SLAB_UNRECLAIMABLE); ++ limit += node_page_state(node, NR_FILE_DIRTY); ++ limit += node_page_state(node, NR_UNSTABLE_NFS); ++ limit += total_swapcache_pages; ++ if (limit > ns->prefetch_watermark) { ++ node_clear(node, sp_stat.prefetch_nodes); ++ continue; ++ } ++ } ++ ++ if (nodes_empty(sp_stat.prefetch_nodes)) ++ goto out; ++ ++ /* Survived all that? Hooray we can prefetch! */ ++ ret = 1; ++out: ++ return ret; ++} ++ ++/* ++ * Get previous swapped entry when iterating over all entries. swapped.lock ++ * should be held and we should already ensure that entry exists. ++ */ ++static inline struct swapped_entry *prev_swapped_entry ++ (struct swapped_entry *entry) ++{ ++ return list_entry(entry->swapped_list.prev->prev, ++ struct swapped_entry, swapped_list); ++} ++ ++/* ++ * trickle_swap is the main function that initiates the swap prefetching. It ++ * first checks to see if the busy flag is set, and does not prefetch if it ++ * is, as the flag implied we are low on memory or swapping in currently. ++ * Otherwise it runs until prefetch_suitable fails which occurs when the ++ * vm is busy, we prefetch to the watermark, or the list is empty or we have ++ * iterated over all entries ++ */ ++static enum trickle_return trickle_swap(void) ++{ ++ enum trickle_return ret = TRICKLE_DELAY; ++ struct swapped_entry *entry; ++ unsigned long flags; ++ ++ /* ++ * If laptop_mode is enabled don't prefetch to avoid hard drives ++ * doing unnecessary spin-ups ++ */ ++ if (!swap_prefetch || laptop_mode) ++ return ret; ++ ++ examine_free_limits(); ++ entry = NULL; ++ ++ for ( ; ; ) { ++ swp_entry_t swp_entry; ++ int node; ++ ++ if (!prefetch_suitable()) ++ break; ++ ++ spin_lock_irqsave(&swapped.lock, flags); ++ if (list_empty(&swapped.list)) { ++ ret = TRICKLE_FAILED; ++ spin_unlock_irqrestore(&swapped.lock, flags); ++ break; ++ } ++ ++ if (!entry) { ++ /* ++ * This sets the entry for the first iteration. It ++ * also is a safeguard against the entry disappearing ++ * while the lock is not held. 
++ */ ++ entry = list_entry(swapped.list.prev, ++ struct swapped_entry, swapped_list); ++ } else if (entry->swapped_list.prev == swapped.list.next) { ++ /* ++ * If we have iterated over all entries and there are ++ * still entries that weren't swapped out there may ++ * be a reason we could not swap them back in so ++ * delay attempting further prefetching. ++ */ ++ spin_unlock_irqrestore(&swapped.lock, flags); ++ break; ++ } ++ ++ node = get_swap_entry_node(entry); ++ if (!node_isset(node, sp_stat.prefetch_nodes)) { ++ /* ++ * We found an entry that belongs to a node that is ++ * not suitable for prefetching so skip it. ++ */ ++ entry = prev_swapped_entry(entry); ++ spin_unlock_irqrestore(&swapped.lock, flags); ++ continue; ++ } ++ swp_entry = entry->swp_entry; ++ entry = prev_swapped_entry(entry); ++ spin_unlock_irqrestore(&swapped.lock, flags); ++ ++ if (trickle_swap_cache_async(swp_entry, node) == TRICKLE_DELAY) ++ break; ++ } ++ ++ if (sp_stat.prefetched_pages) { ++ lru_add_drain(); ++ sp_stat.prefetched_pages = 0; ++ } ++ return ret; ++} ++ ++static int kprefetchd(void *__unused) ++{ ++ struct sched_param param = { .sched_priority = 0 }; ++ ++ sched_setscheduler(current, SCHED_BATCH, ¶m); ++ set_user_nice(current, 19); ++ /* Set ioprio to lowest if supported by i/o scheduler */ ++ sys_ioprio_set(IOPRIO_WHO_PROCESS, 0, IOPRIO_CLASS_IDLE); ++ ++ /* kprefetchd has nothing to do until it is woken up the first time */ ++ set_current_state(TASK_INTERRUPTIBLE); ++ schedule(); ++ ++ do { ++ try_to_freeze(); ++ ++ /* ++ * TRICKLE_FAILED implies no entries left - we do not schedule ++ * a wakeup, and further delay the next one. ++ */ ++ if (trickle_swap() == TRICKLE_FAILED) { ++ set_current_state(TASK_INTERRUPTIBLE); ++ schedule(); ++ } ++ clear_last_prefetch_free(); ++ schedule_timeout_interruptible(PREFETCH_DELAY); ++ } while (!kthread_should_stop()); ++ ++ return 0; ++} ++ ++/* ++ * Create kmem cache for swapped entries ++ */ ++void __init prepare_swap_prefetch(void) ++{ ++ struct zone *zone; ++ ++ swapped.cache = kmem_cache_create("swapped_entry", ++ sizeof(struct swapped_entry), 0, SLAB_PANIC, NULL, NULL); ++ ++ /* ++ * Set max number of entries to 2/3 the size of physical ram as we ++ * only ever prefetch to consume 2/3 of the ram. 
++ */ ++ swapped.maxcount = nr_free_pagecache_pages() / 3 * 2; ++ ++ for_each_zone(zone) { ++ unsigned long present; ++ struct node_stats *ns; ++ int idx; ++ ++ present = zone->present_pages; ++ if (!present) ++ continue; ++ ++ ns = &sp_stat.node[zone->zone_pgdat->node_id]; ++ ns->prefetch_watermark += present / 3 * 2; ++ idx = zone_idx(zone); ++ ns->pointfree[idx] = &ns->highfree[idx]; ++ } ++} ++ ++static int __init kprefetchd_init(void) ++{ ++ kprefetchd_task = kthread_run(kprefetchd, NULL, "kprefetchd"); ++ ++ return 0; ++} ++ ++static void __exit kprefetchd_exit(void) ++{ ++ kthread_stop(kprefetchd_task); ++} ++ ++module_init(kprefetchd_init); ++module_exit(kprefetchd_exit); +Index: linux-2.6.21-ck1/mm/swap_state.c +=================================================================== +--- linux-2.6.21-ck1.orig/mm/swap_state.c 2007-05-04 12:24:01.000000000 +1000 ++++ linux-2.6.21-ck1/mm/swap_state.c 2007-05-04 12:24:20.000000000 +1000 +@@ -10,6 +10,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -82,6 +83,7 @@ static int __add_to_swap_cache(struct pa + error = radix_tree_insert(&swapper_space.page_tree, + entry.val, page); + if (!error) { ++ remove_from_swapped_list(entry.val); + page_cache_get(page); + SetPageLocked(page); + SetPageSwapCache(page); +@@ -95,11 +97,12 @@ static int __add_to_swap_cache(struct pa + return error; + } + +-static int add_to_swap_cache(struct page *page, swp_entry_t entry) ++int add_to_swap_cache(struct page *page, swp_entry_t entry) + { + int error; + + if (!swap_duplicate(entry)) { ++ remove_from_swapped_list(entry.val); + INC_CACHE_INFO(noent_race); + return -ENOENT; + } +@@ -148,6 +151,9 @@ int add_to_swap(struct page * page, gfp_ + swp_entry_t entry; + int err; + ++ /* Swap prefetching is delayed if we're swapping pages */ ++ delay_swap_prefetch(); ++ + BUG_ON(!PageLocked(page)); + + for (;;) { +@@ -320,6 +326,9 @@ struct page *read_swap_cache_async(swp_e + struct page *found_page, *new_page = NULL; + int err; + ++ /* Swap prefetching is delayed if we're already reading from swap */ ++ delay_swap_prefetch(); ++ + do { + /* + * First check the swap cache. Since this is normally +Index: linux-2.6.21-ck1/mm/vmscan.c +=================================================================== +--- linux-2.6.21-ck1.orig/mm/vmscan.c 2007-05-04 12:24:01.000000000 +1000 ++++ linux-2.6.21-ck1/mm/vmscan.c 2007-05-04 12:24:21.000000000 +1000 +@@ -16,6 +16,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -36,6 +37,7 @@ + #include + #include + #include ++#include + #include + + #include +@@ -63,7 +65,7 @@ struct scan_control { + * whole list at once. */ + int swap_cluster_max; + +- int swappiness; ++ int mapped; + + int all_unreclaimable; + }; +@@ -110,9 +112,10 @@ struct shrinker { + #endif + + /* +- * From 0 .. 100. Higher means more swappy. ++ * From 0 .. 100. Lower means more swappy. 
+ */ +-int vm_swappiness = 60; ++int vm_mapped __read_mostly = 66; ++int vm_hardmaplimit __read_mostly = 1; + long vm_total_pages; /* The total number of pages which the VM controls */ + + static LIST_HEAD(shrinker_list); +@@ -424,6 +427,7 @@ int remove_mapping(struct address_space + + if (PageSwapCache(page)) { + swp_entry_t swap = { .val = page_private(page) }; ++ add_to_swapped_list(page); + __delete_from_swap_cache(page); + write_unlock_irq(&mapping->tree_lock); + swap_free(swap); +@@ -807,10 +811,14 @@ static void shrink_active_list(unsigned + * The distress ratio is important - we don't want to start + * going oom. + * +- * A 100% value of vm_swappiness overrides this algorithm +- * altogether. ++ * This distress value is ignored if we apply a hardmaplimit except ++ * in extreme distress. ++ * ++ * A 0% value of vm_mapped overrides this algorithm altogether. + */ +- swap_tendency = mapped_ratio / 2 + distress + sc->swappiness; ++ swap_tendency = mapped_ratio * 100 / (sc->mapped + 1); ++ if (!vm_hardmaplimit || distress == 100) ++ swap_tendency += distress; + + /* + * Now use this metric to decide whether to start moving mapped +@@ -959,6 +967,41 @@ static unsigned long shrink_zone(int pri + } + + /* ++ * Helper functions to adjust nice level of kswapd, based on the priority of ++ * the task (p) that called it. If it is already higher priority we do not ++ * demote its nice level since it is still working on behalf of a higher ++ * priority task. With kernel threads we leave it at nice 0. ++ * ++ * We don't ever run kswapd real time, so if a real time task calls kswapd we ++ * set it to highest SCHED_NORMAL priority. ++ */ ++static int effective_sc_prio(struct task_struct *p) ++{ ++ if (likely(p->mm)) { ++ if (rt_task(p)) ++ return -20; ++ if (idleprio_task(p)) ++ return 19; ++ return task_nice(p); ++ } ++ return 0; ++} ++ ++static void set_kswapd_nice(struct task_struct *kswapd, struct task_struct *p, ++ int active) ++{ ++ long nice = effective_sc_prio(p); ++ ++ if (task_nice(kswapd) > nice || !active) ++ set_user_nice(kswapd, nice); ++} ++ ++static int sc_priority(struct task_struct *p) ++{ ++ return (DEF_PRIORITY + (DEF_PRIORITY * effective_sc_prio(p) / 40)); ++} ++ ++/* + * This is the direct reclaim path, for page-allocating processes. We only + * try to reclaim pages from zones which will satisfy the caller's allocation + * request. +@@ -1015,7 +1058,8 @@ static unsigned long shrink_zones(int pr + * holds filesystem locks which prevent writeout this might not work, and the + * allocation attempt will fail. 
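The shrink_active_list() hunk above inverts the old knob: instead of vm_swappiness (higher means swap more), vm_mapped is the percentage of mapped pages the kernel tries to keep, and distress only feeds into the tendency when there is no hard map limit or distress has hit 100. A worked comparison using the defaults set above (vm_mapped = 66, vm_hardmaplimit = 1); the threshold the result is eventually compared against lives outside this hunk, so only the relative magnitudes matter here.

    #include <stdio.h>

    /* Old mainline heuristic: higher vm_swappiness => more swapping. */
    static int tendency_swappiness(int mapped_ratio, int distress, int swappiness)
    {
        return mapped_ratio / 2 + distress + swappiness;
    }

    /* -ck heuristic: lower vm_mapped => more swapping; distress only counts
     * when there is no hard limit on mapped pages or distress is maximal. */
    static int tendency_mapped(int mapped_ratio, int distress, int mapped,
                               int hardmaplimit)
    {
        int t = mapped_ratio * 100 / (mapped + 1);

        if (!hardmaplimit || distress == 100)
            t += distress;
        return t;
    }

    int main(void)
    {
        /* Sample: 50% of scanned pages are mapped, moderate distress. */
        int mapped_ratio = 50, distress = 25;

        printf("old (swappiness=60): %d\n",
               tendency_swappiness(mapped_ratio, distress, 60));    /* 110 */
        printf("new (mapped=66, hard limit): %d\n",
               tendency_mapped(mapped_ratio, distress, 66, 1));     /* 74 */
        printf("new (mapped=66, no hard limit): %d\n",
               tendency_mapped(mapped_ratio, distress, 66, 0));     /* 99 */
        printf("new (mapped=0): %d\n",
               tendency_mapped(mapped_ratio, distress, 0, 1));      /* 5000 */
        return 0;
    }

With half the scanned pages mapped, the old formula produces a much larger tendency than the -ck formula does under the hard map limit; forcing mapped to 0 makes the tendency huge, which is exactly what the later shrink_all_memory() hunk does (sc.mapped = 0) to force mapped pages out during the final suspend passes.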
+ */ +-unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask) ++unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask, ++ struct task_struct *p) + { + int priority; + int ret = 0; +@@ -1023,15 +1067,20 @@ unsigned long try_to_free_pages(struct z + unsigned long nr_reclaimed = 0; + struct reclaim_state *reclaim_state = current->reclaim_state; + unsigned long lru_pages = 0; +- int i; ++ int i, scan_priority = DEF_PRIORITY; + struct scan_control sc = { + .gfp_mask = gfp_mask, + .may_writepage = !laptop_mode, + .swap_cluster_max = SWAP_CLUSTER_MAX, + .may_swap = 1, +- .swappiness = vm_swappiness, ++ .mapped = vm_mapped, + }; + ++ if (p) ++ scan_priority = sc_priority(p); ++ ++ delay_swap_prefetch(); ++ + count_vm_event(ALLOCSTALL); + + for (i = 0; zones[i] != NULL; i++) { +@@ -1044,7 +1093,7 @@ unsigned long try_to_free_pages(struct z + + zone_page_state(zone, NR_INACTIVE); + } + +- for (priority = DEF_PRIORITY; priority >= 0; priority--) { ++ for (priority = scan_priority; priority >= 0; priority--) { + sc.nr_scanned = 0; + if (!priority) + disable_swap_token(); +@@ -1074,7 +1123,7 @@ unsigned long try_to_free_pages(struct z + } + + /* Take a nap, wait for some writeback to complete */ +- if (sc.nr_scanned && priority < DEF_PRIORITY - 2) ++ if (sc.nr_scanned && priority < scan_priority - 2) + congestion_wait(WRITE, HZ/10); + } + /* top priority shrink_caches still had more to do? don't OOM, then */ +@@ -1124,9 +1173,9 @@ out: + */ + static unsigned long balance_pgdat(pg_data_t *pgdat, int order) + { +- int all_zones_ok; ++ int all_zones_ok = 0; + int priority; +- int i; ++ int i, scan_priority; + unsigned long total_scanned; + unsigned long nr_reclaimed; + struct reclaim_state *reclaim_state = current->reclaim_state; +@@ -1134,7 +1183,7 @@ static unsigned long balance_pgdat(pg_da + .gfp_mask = GFP_KERNEL, + .may_swap = 1, + .swap_cluster_max = SWAP_CLUSTER_MAX, +- .swappiness = vm_swappiness, ++ .mapped = vm_mapped, + }; + /* + * temp_priority is used to remember the scanning priority at which +@@ -1142,6 +1191,8 @@ static unsigned long balance_pgdat(pg_da + */ + int temp_priority[MAX_NR_ZONES]; + ++ scan_priority = sc_priority(pgdat->kswapd); ++ + loop_again: + total_scanned = 0; + nr_reclaimed = 0; +@@ -1149,9 +1200,9 @@ loop_again: + count_vm_event(PAGEOUTRUN); + + for (i = 0; i < pgdat->nr_zones; i++) +- temp_priority[i] = DEF_PRIORITY; ++ temp_priority[i] = scan_priority; + +- for (priority = DEF_PRIORITY; priority >= 0; priority--) { ++ for (priority = scan_priority; priority >= 0; priority--) { + int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ + unsigned long lru_pages = 0; + +@@ -1167,15 +1218,22 @@ loop_again: + */ + for (i = pgdat->nr_zones - 1; i >= 0; i--) { + struct zone *zone = pgdat->node_zones + i; ++ unsigned long watermark; + + if (!populated_zone(zone)) + continue; + +- if (zone->all_unreclaimable && priority != DEF_PRIORITY) ++ if (zone->all_unreclaimable && priority != scan_priority) + continue; + +- if (!zone_watermark_ok(zone, order, zone->pages_high, +- 0, 0)) { ++ /* ++ * The watermark is relaxed depending on the ++ * level of "priority" till it drops to ++ * pages_high. 
++ */ ++ watermark = zone->pages_high + (zone->pages_high * ++ priority / scan_priority); ++ if (!zone_watermark_ok(zone, order, watermark, 0, 0)) { + end_zone = i; + break; + } +@@ -1202,14 +1260,18 @@ loop_again: + for (i = 0; i <= end_zone; i++) { + struct zone *zone = pgdat->node_zones + i; + int nr_slab; ++ unsigned long watermark; + + if (!populated_zone(zone)) + continue; + +- if (zone->all_unreclaimable && priority != DEF_PRIORITY) ++ if (zone->all_unreclaimable && priority != scan_priority) + continue; + +- if (!zone_watermark_ok(zone, order, zone->pages_high, ++ watermark = zone->pages_high + (zone->pages_high * ++ priority / scan_priority); ++ ++ if (!zone_watermark_ok(zone, order, watermark, + end_zone, 0)) + all_zones_ok = 0; + temp_priority[i] = priority; +@@ -1242,7 +1304,7 @@ loop_again: + * OK, kswapd is getting into trouble. Take a nap, then take + * another pass across the zones. + */ +- if (total_scanned && priority < DEF_PRIORITY - 2) ++ if (total_scanned && priority < scan_priority - 2) + congestion_wait(WRITE, HZ/10); + + /* +@@ -1276,6 +1338,8 @@ out: + return nr_reclaimed; + } + ++#define WT_EXPIRY (HZ * 5) /* Time to wakeup watermark_timer */ ++ + /* + * The background pageout daemon, started as a kernel thread + * from the init process. +@@ -1325,6 +1389,8 @@ static int kswapd(void *p) + + try_to_freeze(); + ++ /* kswapd has been busy so delay watermark_timer */ ++ mod_timer(&pgdat->watermark_timer, jiffies + WT_EXPIRY); + prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); + new_order = pgdat->kswapd_max_order; + pgdat->kswapd_max_order = 0; +@@ -1335,6 +1401,7 @@ static int kswapd(void *p) + */ + order = new_order; + } else { ++ set_user_nice(tsk, 0); + schedule(); + order = pgdat->kswapd_max_order; + } +@@ -1348,9 +1415,10 @@ static int kswapd(void *p) + /* + * A zone is low on free memory, so wake its kswapd task to service it. 
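Two related scalings are at work in the reclaim hunks above: try_to_free_pages() now starts direct reclaim at a priority derived from the caller's nice value via sc_priority(), and balance_pgdat() relaxes each zone's target watermark from about twice pages_high down to pages_high as the scan priority falls. The sketch below only evaluates those two integer expressions, taking DEF_PRIORITY as 12 (its value in this kernel series) and treating the caller as an ordinary task so effective_sc_prio() reduces to its nice value.

    #include <stdio.h>

    #define DEF_PRIORITY 12    /* mm/vmscan.c value in this kernel series */

    /* sc_priority() for an ordinary user task: the effective priority used
     * is simply the task's nice value. */
    static int sc_priority(int nice)
    {
        return DEF_PRIORITY + DEF_PRIORITY * nice / 40;
    }

    int main(void)
    {
        unsigned long pages_high = 1000;
        int nice, priority;

        for (nice = -20; nice <= 19; nice += 13)
            printf("nice %3d -> starting scan priority %d\n",
                   nice, sc_priority(nice));

        /* Watermark relaxation over one balance_pgdat() run; scan_priority
         * is DEF_PRIORITY for kswapd itself, which has no mm. */
        for (priority = sc_priority(0); priority >= 0; priority--)
            printf("priority %2d: zone watermark %lu pages\n", priority,
                   pages_high + pages_high * priority / sc_priority(0));
        return 0;
    }

So a nice 19 allocator starts direct reclaim at priority 17 (scanning smaller fractions of the lists first), a nice -20 allocator at priority 6, while kswapd keeps the familiar DEF_PRIORITY starting point and only tightens the zone watermark from 2000 towards 1000 pages as its scan priority drops.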
+ */ +-void wakeup_kswapd(struct zone *zone, int order) ++void wakeup_kswapd(struct zone *zone, int order, struct task_struct *p) + { + pg_data_t *pgdat; ++ int active; + + if (!populated_zone(zone)) + return; +@@ -1362,7 +1430,9 @@ void wakeup_kswapd(struct zone *zone, in + pgdat->kswapd_max_order = order; + if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) + return; +- if (!waitqueue_active(&pgdat->kswapd_wait)) ++ active = waitqueue_active(&pgdat->kswapd_wait); ++ set_kswapd_nice(pgdat->kswapd, p, active); ++ if (!active) + return; + wake_up_interruptible(&pgdat->kswapd_wait); + } +@@ -1381,6 +1451,8 @@ static unsigned long shrink_all_zones(un + struct zone *zone; + unsigned long nr_to_scan, ret = 0; + ++ delay_swap_prefetch(); ++ + for_each_zone(zone) { + + if (!populated_zone(zone)) +@@ -1440,7 +1512,7 @@ unsigned long shrink_all_memory(unsigned + .may_swap = 0, + .swap_cluster_max = nr_pages, + .may_writepage = 1, +- .swappiness = vm_swappiness, ++ .mapped = vm_mapped, + }; + + current->reclaim_state = &reclaim_state; +@@ -1475,7 +1547,7 @@ unsigned long shrink_all_memory(unsigned + /* Force reclaiming mapped pages in the passes #3 and #4 */ + if (pass > 2) { + sc.may_swap = 1; +- sc.swappiness = 100; ++ sc.mapped = 0; + } + + for (prio = DEF_PRIORITY; prio >= 0; prio--) { +@@ -1539,20 +1611,57 @@ static int __devinit cpu_callback(struct + } + + /* ++ * We wake up kswapd every WT_EXPIRY till free ram is above pages_lots ++ */ ++static void watermark_wakeup(unsigned long data) ++{ ++ pg_data_t *pgdat = (pg_data_t *)data; ++ struct timer_list *wt = &pgdat->watermark_timer; ++ int i; ++ ++ if (!waitqueue_active(&pgdat->kswapd_wait) || above_background_load()) ++ goto out; ++ for (i = pgdat->nr_zones - 1; i >= 0; i--) { ++ struct zone *z = pgdat->node_zones + i; ++ ++ if (!populated_zone(z) || is_highmem(z)) { ++ /* We are better off leaving highmem full */ ++ continue; ++ } ++ if (!zone_watermark_ok(z, 0, z->pages_lots, 0, 0)) { ++ wake_up_interruptible(&pgdat->kswapd_wait); ++ goto out; ++ } ++ } ++out: ++ mod_timer(wt, jiffies + WT_EXPIRY); ++ return; ++} ++ ++/* + * This kswapd start function will be called by init and node-hot-add. + * On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added. 
+ */ + int kswapd_run(int nid) + { + pg_data_t *pgdat = NODE_DATA(nid); ++ struct timer_list *wt; + int ret = 0; + + if (pgdat->kswapd) + return 0; + ++ wt = &pgdat->watermark_timer; ++ init_timer(wt); ++ wt->data = (unsigned long)pgdat; ++ wt->function = watermark_wakeup; ++ wt->expires = jiffies + WT_EXPIRY; ++ add_timer(wt); ++ + pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid); + if (IS_ERR(pgdat->kswapd)) { + /* failure at boot is fatal */ ++ del_timer(wt); + BUG_ON(system_state == SYSTEM_BOOTING); + printk("Failed to start kswapd on node %d\n",nid); + ret = -1; +@@ -1623,7 +1732,7 @@ static int __zone_reclaim(struct zone *z + .swap_cluster_max = max_t(unsigned long, nr_pages, + SWAP_CLUSTER_MAX), + .gfp_mask = gfp_mask, +- .swappiness = vm_swappiness, ++ .mapped = vm_mapped, + }; + unsigned long slab_reclaimable; + +Index: linux-2.6.21-ck1/include/linux/mm_inline.h +=================================================================== +--- linux-2.6.21-ck1.orig/include/linux/mm_inline.h 2007-05-04 12:24:01.000000000 +1000 ++++ linux-2.6.21-ck1/include/linux/mm_inline.h 2007-05-04 12:24:20.000000000 +1000 +@@ -13,6 +13,13 @@ add_page_to_inactive_list(struct zone *z + } + + static inline void ++add_page_to_inactive_list_tail(struct zone *zone, struct page *page) ++{ ++ list_add_tail(&page->lru, &zone->inactive_list); ++ __inc_zone_state(zone, NR_INACTIVE); ++} ++ ++static inline void + del_page_from_active_list(struct zone *zone, struct page *page) + { + list_del(&page->lru); +Index: linux-2.6.21-ck1/include/linux/swap-prefetch.h +=================================================================== +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ linux-2.6.21-ck1/include/linux/swap-prefetch.h 2007-05-04 12:24:20.000000000 +1000 +@@ -0,0 +1,55 @@ ++#ifndef SWAP_PREFETCH_H_INCLUDED ++#define SWAP_PREFETCH_H_INCLUDED ++ ++#ifdef CONFIG_SWAP_PREFETCH ++/* mm/swap_prefetch.c */ ++extern int swap_prefetch; ++struct swapped_entry { ++ swp_entry_t swp_entry; /* The actual swap entry */ ++ struct list_head swapped_list; /* Linked list of entries */ ++#if MAX_NUMNODES > 1 ++ int node; /* Node id */ ++#endif ++} __attribute__((packed)); ++ ++static inline void store_swap_entry_node(struct swapped_entry *entry, ++ struct page *page) ++{ ++#if MAX_NUMNODES > 1 ++ entry->node = page_to_nid(page); ++#endif ++} ++ ++static inline int get_swap_entry_node(struct swapped_entry *entry) ++{ ++#if MAX_NUMNODES > 1 ++ return entry->node; ++#else ++ return 0; ++#endif ++} ++ ++extern void add_to_swapped_list(struct page *page); ++extern void remove_from_swapped_list(const unsigned long index); ++extern void delay_swap_prefetch(void); ++extern void prepare_swap_prefetch(void); ++ ++#else /* CONFIG_SWAP_PREFETCH */ ++static inline void add_to_swapped_list(struct page *__unused) ++{ ++} ++ ++static inline void prepare_swap_prefetch(void) ++{ ++} ++ ++static inline void remove_from_swapped_list(const unsigned long __unused) ++{ ++} ++ ++static inline void delay_swap_prefetch(void) ++{ ++} ++#endif /* CONFIG_SWAP_PREFETCH */ ++ ++#endif /* SWAP_PREFETCH_H_INCLUDED */ +Index: linux-2.6.21-ck1/include/linux/sysctl.h +=================================================================== +--- linux-2.6.21-ck1.orig/include/linux/sysctl.h 2007-05-04 12:24:01.000000000 +1000 ++++ linux-2.6.21-ck1/include/linux/sysctl.h 2007-05-04 12:24:20.000000000 +1000 +@@ -190,7 +190,7 @@ enum + VM_OVERCOMMIT_RATIO=16, /* percent of RAM to allow overcommit in */ + VM_PAGEBUF=17, /* struct: Control pagebuf parameters */ + 
VM_HUGETLB_PAGES=18, /* int: Number of available Huge Pages */ +- VM_SWAPPINESS=19, /* Tendency to steal mapped memory */ ++ VM_MAPPED=19, /* percent mapped min while evicting cache */ + VM_LOWMEM_RESERVE_RATIO=20,/* reservation ratio for lower memory zones */ + VM_MIN_FREE_KBYTES=21, /* Minimum free kilobytes to maintain */ + VM_MAX_MAP_COUNT=22, /* int: Maximum number of mmaps/address-space */ +Index: linux-2.6.21-ck1/include/linux/mmzone.h +=================================================================== +--- linux-2.6.21-ck1.orig/include/linux/mmzone.h 2007-05-04 12:24:01.000000000 +1000 ++++ linux-2.6.21-ck1/include/linux/mmzone.h 2007-05-04 12:24:21.000000000 +1000 +@@ -13,6 +13,7 @@ + #include + #include + #include ++#include + #include + #include + +@@ -178,7 +179,7 @@ enum zone_type { + + struct zone { + /* Fields commonly accessed by the page allocator */ +- unsigned long pages_min, pages_low, pages_high; ++ unsigned long pages_min, pages_low, pages_high, pages_lots; + /* + * We don't know if the memory that we're going to allocate will be freeable + * or/and it will be released eventually, so to avoid totally wasting several +@@ -449,6 +450,7 @@ typedef struct pglist_data { + wait_queue_head_t kswapd_wait; + struct task_struct *kswapd; + int kswapd_max_order; ++ struct timer_list watermark_timer; + } pg_data_t; + + #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) +@@ -465,7 +467,7 @@ typedef struct pglist_data { + void get_zone_counts(unsigned long *active, unsigned long *inactive, + unsigned long *free); + void build_all_zonelists(void); +-void wakeup_kswapd(struct zone *zone, int order); ++void wakeup_kswapd(struct zone *zone, int order, struct task_struct *p); + int zone_watermark_ok(struct zone *z, int order, unsigned long mark, + int classzone_idx, int alloc_flags); + enum memmap_context { +Index: linux-2.6.21-ck1/mm/page_alloc.c +=================================================================== +--- linux-2.6.21-ck1.orig/mm/page_alloc.c 2007-05-04 12:24:01.000000000 +1000 ++++ linux-2.6.21-ck1/mm/page_alloc.c 2007-05-04 12:24:20.000000000 +1000 +@@ -1277,7 +1277,7 @@ restart: + goto nopage; + + for (z = zonelist->zones; *z; z++) +- wakeup_kswapd(*z, order); ++ wakeup_kswapd(*z, order, p); + + /* + * OK, we're below the kswapd watermark and have kicked background +@@ -1341,7 +1341,7 @@ nofail_alloc: + reclaim_state.reclaimed_slab = 0; + p->reclaim_state = &reclaim_state; + +- did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask); ++ did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask, p); + + p->reclaim_state = NULL; + p->flags &= ~PF_MEMALLOC; +@@ -1597,6 +1597,7 @@ void show_free_areas(void) + " min:%lukB" + " low:%lukB" + " high:%lukB" ++ " lots:%lukB" + " active:%lukB" + " inactive:%lukB" + " present:%lukB" +@@ -1608,6 +1609,7 @@ void show_free_areas(void) + K(zone->pages_min), + K(zone->pages_low), + K(zone->pages_high), ++ K(zone->pages_lots), + K(zone_page_state(zone, NR_ACTIVE)), + K(zone_page_state(zone, NR_INACTIVE)), + K(zone->present_pages), +@@ -3146,6 +3148,7 @@ void setup_per_zone_pages_min(void) + + zone->pages_low = zone->pages_min + (tmp >> 2); + zone->pages_high = zone->pages_min + (tmp >> 1); ++ zone->pages_lots = zone->pages_min + tmp; + spin_unlock_irqrestore(&zone->lru_lock, flags); + } + +Index: linux-2.6.21-ck1/fs/buffer.c +=================================================================== +--- linux-2.6.21-ck1.orig/fs/buffer.c 2007-05-04 12:24:01.000000000 +1000 ++++ linux-2.6.21-ck1/fs/buffer.c 
2007-05-04 12:24:20.000000000 +1000 +@@ -363,7 +363,7 @@ static void free_more_memory(void) + for_each_online_pgdat(pgdat) { + zones = pgdat->node_zonelists[gfp_zone(GFP_NOFS)].zones; + if (*zones) +- try_to_free_pages(zones, GFP_NOFS); ++ try_to_free_pages(zones, GFP_NOFS, NULL); + } + } + +Index: linux-2.6.21-ck1/mm/filemap.c +=================================================================== +--- linux-2.6.21-ck1.orig/mm/filemap.c 2007-05-04 12:24:01.000000000 +1000 ++++ linux-2.6.21-ck1/mm/filemap.c 2007-05-04 12:24:21.000000000 +1000 +@@ -466,6 +466,16 @@ int add_to_page_cache_lru(struct page *p + return ret; + } + ++int add_to_page_cache_lru_tail(struct page *page, ++ struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask) ++{ ++ int ret = add_to_page_cache(page, mapping, offset, gfp_mask); ++ ++ if (ret == 0) ++ lru_cache_add_tail(page); ++ return ret; ++} ++ + #ifdef CONFIG_NUMA + struct page *__page_cache_alloc(gfp_t gfp) + { +@@ -836,6 +846,34 @@ static void shrink_readahead_size_eio(st + ra->ra_pages /= 4; + } + ++/* ++ * Sysctl which determines whether we should read from large files to the ++ * tail of the inactive lru list. ++ */ ++int vm_tail_largefiles __read_mostly = 1; ++ ++static inline int nr_mapped(void) ++{ ++ return global_page_state(NR_FILE_MAPPED) + ++ global_page_state(NR_ANON_PAGES); ++} ++ ++/* ++ * This examines how large in pages a file size is and returns 1 if it is ++ * more than half the unmapped ram. Avoid doing read_page_state which is ++ * expensive unless we already know it is likely to be large enough. ++ */ ++static int large_isize(unsigned long nr_pages) ++{ ++ if (nr_pages * 6 > vm_total_pages) { ++ unsigned long unmapped_ram = vm_total_pages - nr_mapped(); ++ ++ if (nr_pages * 2 > unmapped_ram) ++ return 1; ++ } ++ return 0; ++} ++ + /** + * do_generic_mapping_read - generic file read routine + * @mapping: address_space to be read +@@ -1044,8 +1082,19 @@ no_cached_page: + goto out; + } + } +- error = add_to_page_cache_lru(cached_page, mapping, +- index, GFP_KERNEL); ++ ++ /* ++ * If we know the file is large we add the pages read to the ++ * end of the lru as we're unlikely to be able to cache the ++ * whole file in ram so make those pages the first to be ++ * dropped if not referenced soon. ++ */ ++ if (vm_tail_largefiles && large_isize(end_index)) ++ error = add_to_page_cache_lru_tail(cached_page, ++ mapping, index, GFP_KERNEL); ++ else ++ error = add_to_page_cache_lru(cached_page, mapping, ++ index, GFP_KERNEL); + if (error) { + if (error == -EEXIST) + goto find_page; +Index: linux-2.6.21-ck1/Documentation/filesystems/proc.txt +=================================================================== +--- linux-2.6.21-ck1.orig/Documentation/filesystems/proc.txt 2007-05-04 12:24:01.000000000 +1000 ++++ linux-2.6.21-ck1/Documentation/filesystems/proc.txt 2007-05-04 12:24:21.000000000 +1000 +@@ -1325,6 +1325,14 @@ To free pagecache, dentries and inodes: + As this is a non-destructive operation and dirty objects are not freeable, the + user should run `sync' first. + ++tail_largefiles ++--------------- ++ ++When enabled reads from large files to the tail end of the inactive lru list. ++This means that any cache from reading large files is dropped very quickly, ++preventing loss of mapped ram and useful pagecache when large files are read. ++This does, however, make caching less effective when working with large files. 
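The filemap.c hunk above gates the new drop-behind path on large_isize(): a file only goes to the tail of the inactive list when it is bigger than a sixth of total RAM (the cheap test) and bigger than half of the currently unmapped RAM (the more expensive test using the page-state counters). A standalone sketch of that decision with illustrative numbers for a 2 GiB machine; the constants are invented for the example, only the two comparisons come from the hunk.

    #include <stdio.h>

    /* Illustrative figures only: a 2 GiB machine with 4 KiB pages. */
    #define TOTAL_PAGES (2UL * 1024 * 1024 * 1024 / 4096)   /* 524288 */

    /* Mirrors large_isize(): "large" means more than half the unmapped ram,
     * guarded by a cheap ">1/6 of total ram" test so the counters are only
     * consulted for plausibly large files. */
    static int large_isize(unsigned long file_pages, unsigned long mapped_pages)
    {
        if (file_pages * 6 > TOTAL_PAGES) {
            unsigned long unmapped = TOTAL_PAGES - mapped_pages;

            if (file_pages * 2 > unmapped)
                return 1;
        }
        return 0;
    }

    int main(void)
    {
        /* Say 1.25 GiB of the 2 GiB is file-mapped or anonymous. */
        unsigned long mapped = 320 * 1024;
        unsigned long sizes_mib[] = { 64, 256, 512, 1024 };
        unsigned int i;

        for (i = 0; i < sizeof(sizes_mib) / sizeof(sizes_mib[0]); i++) {
            unsigned long pages = sizes_mib[i] * 1024 * 1024 / 4096;

            printf("%4lu MiB file -> %s\n", sizes_mib[i],
                   large_isize(pages, mapped) ? "tail of inactive list"
                                              : "normal LRU add");
        }
        return 0;
    }

On this hypothetical box the 512 MiB and 1 GiB reads get queued to the tail of the inactive list, while anything that could still fit comfortably in the unmapped page cache is added the normal way.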
++ + + 2.5 /proc/sys/dev - Device specific parameters + ---------------------------------------------- +Index: linux-2.6.21-ck1/arch/i386/Kconfig +=================================================================== +--- linux-2.6.21-ck1.orig/arch/i386/Kconfig 2007-05-04 12:24:00.000000000 +1000 ++++ linux-2.6.21-ck1/arch/i386/Kconfig 2007-05-04 12:24:21.000000000 +1000 +@@ -546,7 +546,7 @@ endchoice + + choice + depends on EXPERIMENTAL +- prompt "Memory split" if EMBEDDED ++ prompt "Memory split" + default VMSPLIT_3G + help + Select the desired split between kernel and user memory. +@@ -565,14 +565,14 @@ choice + option alone! + + config VMSPLIT_3G +- bool "3G/1G user/kernel split" ++ bool "Default 896MB lowmem (3G/1G user/kernel split)" + config VMSPLIT_3G_OPT + depends on !HIGHMEM +- bool "3G/1G user/kernel split (for full 1G low memory)" ++ bool "1GB lowmem (3G/1G user/kernel split)" + config VMSPLIT_2G +- bool "2G/2G user/kernel split" ++ bool "2GB lowmem (2G/2G user/kernel split)" + config VMSPLIT_1G +- bool "1G/3G user/kernel split" ++ bool "3GB lowmem (1G/3G user/kernel split)" + endchoice + + config PAGE_OFFSET +Index: linux-2.6.21-ck1/kernel/Kconfig.hz +=================================================================== +--- linux-2.6.21-ck1.orig/kernel/Kconfig.hz 2007-05-04 12:24:00.000000000 +1000 ++++ linux-2.6.21-ck1/kernel/Kconfig.hz 2007-05-04 12:24:21.000000000 +1000 +@@ -4,7 +4,7 @@ + + choice + prompt "Timer frequency" +- default HZ_250 ++ default HZ_1000 + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -13,8 +13,7 @@ choice + contention and cacheline bounces as a result of timer interrupts. + Note that the timer interrupt occurs on each processor in an SMP + environment leading to NR_CPUS * HZ number of timer interrupts +- per second. +- ++ per second.Laptops may also show improved battery life. + + config HZ_100 + bool "100 HZ" +@@ -23,13 +22,14 @@ choice + with lots of processors that may show reduced performance if + too many timer interrupts are occurring. + +- config HZ_250 ++ config HZ_250_NODEFAULT + bool "250 HZ" + help +- 250 Hz is a good compromise choice allowing server performance +- while also showing good interactive responsiveness even +- on SMP and NUMA systems. If you are going to be using NTSC video +- or multimedia, selected 300Hz instead. ++ 250 HZ is a lousy compromise choice allowing server interactivity ++ while also showing desktop throughput and no extra power saving on ++ laptops. Good for when you can't make up your mind. ++ ++ Recommend 100 or 1000 instead. + + config HZ_300 + bool "300 HZ" +@@ -45,12 +45,76 @@ choice + 1000 Hz is the preferred choice for desktop systems and other + systems requiring fast interactive responses to events. + ++ config HZ_1500 ++ bool "1500 HZ" ++ help ++ 1500 Hz is an insane value to use to run broken software that is Hz ++ limited. ++ ++ Being over 1000, driver breakage is likely. ++ ++ config HZ_2000 ++ bool "2000 HZ" ++ help ++ 2000 Hz is an insane value to use to run broken software that is Hz ++ limited. ++ ++ Being over 1000, driver breakage is likely. ++ ++ config HZ_3000 ++ bool "3000 HZ" ++ help ++ 3000 Hz is an insane value to use to run broken software that is Hz ++ limited. ++ ++ Being over 1000, driver breakage is likely. ++ ++ config HZ_4000 ++ bool "4000 HZ" ++ help ++ 4000 Hz is an insane value to use to run broken software that is Hz ++ limited. ++ ++ Being over 1000, driver breakage is likely. 
++ ++ config HZ_5000 ++ bool "5000 HZ" ++ help ++ 5000 Hz is an obscene value to use to run broken software that is Hz ++ limited. ++ ++ Being over 1000, driver breakage is likely. ++ ++ config HZ_7500 ++ bool "7500 HZ" ++ help ++ 7500 Hz is an obscene value to use to run broken software that is Hz ++ limited. ++ ++ Being over 1000, driver breakage is likely. ++ ++ config HZ_10000 ++ bool "10000 HZ" ++ help ++ 10000 Hz is an obscene value to use to run broken software that is Hz ++ limited. ++ ++ Being over 1000, driver breakage is likely. ++ ++ + endchoice + + config HZ + int + default 100 if HZ_100 +- default 250 if HZ_250 ++ default 250 if HZ_250_NODEFAULT + default 300 if HZ_300 + default 1000 if HZ_1000 ++ default 1500 if HZ_1500 ++ default 2000 if HZ_2000 ++ default 3000 if HZ_3000 ++ default 4000 if HZ_4000 ++ default 5000 if HZ_5000 ++ default 7500 if HZ_7500 ++ default 10000 if HZ_10000 + +Index: linux-2.6.21-ck1/arch/i386/defconfig +=================================================================== +--- linux-2.6.21-ck1.orig/arch/i386/defconfig 2007-05-04 12:24:00.000000000 +1000 ++++ linux-2.6.21-ck1/arch/i386/defconfig 2007-05-04 12:24:21.000000000 +1000 +@@ -214,10 +214,10 @@ CONFIG_MTRR=y + # CONFIG_IRQBALANCE is not set + CONFIG_SECCOMP=y + # CONFIG_HZ_100 is not set +-CONFIG_HZ_250=y ++# CONFIG_HZ_250 is not set + # CONFIG_HZ_300 is not set +-# CONFIG_HZ_1000 is not set +-CONFIG_HZ=250 ++CONFIG_HZ_1000=y ++CONFIG_HZ=1000 + # CONFIG_KEXEC is not set + # CONFIG_CRASH_DUMP is not set + CONFIG_PHYSICAL_START=0x100000 +Index: linux-2.6.21-ck1/arch/x86_64/defconfig +=================================================================== +--- linux-2.6.21-ck1.orig/arch/x86_64/defconfig 2007-05-04 12:24:00.000000000 +1000 ++++ linux-2.6.21-ck1/arch/x86_64/defconfig 2007-05-04 12:24:21.000000000 +1000 +@@ -178,10 +178,10 @@ CONFIG_PHYSICAL_START=0x200000 + CONFIG_SECCOMP=y + # CONFIG_CC_STACKPROTECTOR is not set + # CONFIG_HZ_100 is not set +-CONFIG_HZ_250=y ++# CONFIG_HZ_250 is not set + # CONFIG_HZ_300 is not set +-# CONFIG_HZ_1000 is not set +-CONFIG_HZ=250 ++CONFIG_HZ_1000=y ++CONFIG_HZ=1000 + # CONFIG_REORDER is not set + CONFIG_K8_NB=y + CONFIG_GENERIC_HARDIRQS=y +Index: linux-2.6.21-ck1/include/linux/jiffies.h +=================================================================== +--- linux-2.6.21-ck1.orig/include/linux/jiffies.h 2007-05-04 12:24:00.000000000 +1000 ++++ linux-2.6.21-ck1/include/linux/jiffies.h 2007-05-04 12:24:21.000000000 +1000 +@@ -29,6 +29,12 @@ + # define SHIFT_HZ 9 + #elif HZ >= 768 && HZ < 1536 + # define SHIFT_HZ 10 ++#elif HZ >= 1536 && HZ < 3072 ++# define SHIFT_HZ 11 ++#elif HZ >= 3072 && HZ < 6144 ++# define SHIFT_HZ 12 ++#elif HZ >= 6144 && HZ < 12288 ++# define SHIFT_HZ 13 + #else + # error You lose. + #endif +Index: linux-2.6.21-ck1/include/net/inet_timewait_sock.h +=================================================================== +--- linux-2.6.21-ck1.orig/include/net/inet_timewait_sock.h 2007-05-04 12:24:00.000000000 +1000 ++++ linux-2.6.21-ck1/include/net/inet_timewait_sock.h 2007-05-04 12:24:21.000000000 +1000 +@@ -38,8 +38,8 @@ struct inet_hashinfo; + * If time > 4sec, it is "slow" path, no recycling is required, + * so that we select tick to get range about 4 seconds. 
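The jiffies.h ladder above extends SHIFT_HZ, the exponent of the power of two nearest to HZ (each bracket runs from 0.75 * 2^n up to, but not including, 1.5 * 2^n), so the new 1500 to 10000 Hz choices no longer hit the #error. A throwaway userspace check of that correspondence; the helper name is hypothetical, and the kernel keeps the chain of #elif ranges because SHIFT_HZ has to be a preprocessor-time constant.

    #include <stdio.h>

    #define PRIO_MAX_HZ 12288   /* upper bound of the extended ladder */

    /* Smallest n with 1.5 * 2^n > hz, i.e. hz in [0.75*2^n, 1.5*2^n). */
    static int shift_hz(unsigned int hz)
    {
        int n = 0;

        while ((3u << n) / 2 <= hz)
            n++;
        return n;
    }

    int main(void)
    {
        unsigned int table[] = { 100, 250, 300, 1000, 1500, 2000, 3000,
                                 4000, 5000, 7500, 10000 };
        unsigned int i;

        for (i = 0; i < sizeof(table) / sizeof(table[0]); i++)
            printf("HZ=%-5u SHIFT_HZ=%d\n", table[i], shift_hz(table[i]));
        return 0;
    }

For the Kconfig choices it prints SHIFT_HZ values from 7 up to 13, matching the #elif brackets, with the three new branches covering everything from 1536 Hz up to just under 12288 Hz.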
+ */ +-#if HZ <= 16 || HZ > 4096 +-# error Unsupported: HZ <= 16 or HZ > 4096 ++#if HZ <= 16 || HZ > 16384 ++# error Unsupported: HZ <= 16 or HZ > 16384 + #elif HZ <= 32 + # define INET_TWDR_RECYCLE_TICK (5 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) + #elif HZ <= 64 +@@ -54,8 +54,12 @@ struct inet_hashinfo; + # define INET_TWDR_RECYCLE_TICK (10 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) + #elif HZ <= 2048 + # define INET_TWDR_RECYCLE_TICK (11 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) +-#else ++#elif HZ <= 4096 + # define INET_TWDR_RECYCLE_TICK (12 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) ++#elif HZ <= 8192 ++# define INET_TWDR_RECYCLE_TICK (13 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) ++#else ++# define INET_TWDR_RECYCLE_TICK (14 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) + #endif + + /* TIME_WAIT reaping mechanism. */ +Index: linux-2.6.21-ck1/include/net/pkt_sched.h +=================================================================== +--- linux-2.6.21-ck1.orig/include/net/pkt_sched.h 2007-05-04 12:24:00.000000000 +1000 ++++ linux-2.6.21-ck1/include/net/pkt_sched.h 2007-05-04 12:24:21.000000000 +1000 +@@ -78,8 +78,14 @@ typedef long psched_tdiff_t; + #define PSCHED_JSCALE 12 + #elif HZ >= 384 && HZ < 768 + #define PSCHED_JSCALE 11 +-#elif HZ >= 768 ++#elif HZ >= 768 && HZ < 1536 + #define PSCHED_JSCALE 10 ++#elif HZ >= 1536 && HZ < 3072 ++#define PSCHED_JSCALE 9 ++#elif HZ >= 3072 && HZ < 6144 ++#define PSCHED_JSCALE 8 ++#else ++#define PSCHED_JSCALE 7 + #endif + + #define PSCHED_GET_TIME(stamp) ((stamp) = (get_jiffies_64()<loops_per_jiffy/(500000/HZ), +- (c->loops_per_jiffy/(5000/HZ)) % 100); ++ (c->loops_per_jiffy * 10/(50000/HZ)) % 100); + seq_printf(m, "clflush size\t: %u\n\n", c->x86_clflush_size); + + return 0; +Index: linux-2.6.21-ck1/arch/i386/kernel/smpboot.c +=================================================================== +--- linux-2.6.21-ck1.orig/arch/i386/kernel/smpboot.c 2007-05-04 12:24:00.000000000 +1000 ++++ linux-2.6.21-ck1/arch/i386/kernel/smpboot.c 2007-05-04 12:24:21.000000000 +1000 +@@ -1134,7 +1134,7 @@ static void __init smp_boot_cpus(unsigne + "Total of %d processors activated (%lu.%02lu BogoMIPS).\n", + cpucount+1, + bogosum/(500000/HZ), +- (bogosum/(5000/HZ))%100); ++ (bogosum * 10/(50000/HZ))%100); + + Dprintk("Before bogocount - setting activated=1.\n"); + +Index: linux-2.6.21-ck1/include/linux/nfsd/stats.h +=================================================================== +--- linux-2.6.21-ck1.orig/include/linux/nfsd/stats.h 2007-05-04 12:24:00.000000000 +1000 ++++ linux-2.6.21-ck1/include/linux/nfsd/stats.h 2007-05-04 12:24:21.000000000 +1000 +@@ -35,8 +35,8 @@ struct nfsd_stats { + + }; + +-/* thread usage wraps very million seconds (approx one fortnight) */ +-#define NFSD_USAGE_WRAP (HZ*1000000) ++/* thread usage wraps every one hundred thousand seconds (approx one day) */ ++#define NFSD_USAGE_WRAP (HZ*100000) + + #ifdef __KERNEL__ + +Index: linux-2.6.21-ck1/arch/x86_64/kernel/setup.c +=================================================================== +--- linux-2.6.21-ck1.orig/arch/x86_64/kernel/setup.c 2007-05-04 12:24:00.000000000 +1000 ++++ linux-2.6.21-ck1/arch/x86_64/kernel/setup.c 2007-05-04 12:24:22.000000000 +1000 +@@ -1053,7 +1053,7 @@ static int show_cpuinfo(struct seq_file + + seq_printf(m, "\nbogomips\t: %lu.%02lu\n", + c->loops_per_jiffy/(500000/HZ), +- (c->loops_per_jiffy/(5000/HZ)) % 100); ++ (c->loops_per_jiffy * 10/(50000/HZ)) % 100); + + if (c->x86_tlbsize > 0) + seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize); diff --git 
a/pkgs/os-specific/linux/kernel/patch-2.6.22-ck1 b/pkgs/os-specific/linux/kernel/patch-2.6.22-ck1 new file mode 100644 index 00000000000..81fa14e2abe --- /dev/null +++ b/pkgs/os-specific/linux/kernel/patch-2.6.22-ck1 @@ -0,0 +1,5167 @@ +Index: linux-2.6.22-ck1/include/linux/sched.h +=================================================================== +--- linux-2.6.22-ck1.orig/include/linux/sched.h 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/include/linux/sched.h 2007-07-10 14:55:21.000000000 +1000 +@@ -34,9 +34,14 @@ + #define SCHED_FIFO 1 + #define SCHED_RR 2 + #define SCHED_BATCH 3 ++#define SCHED_ISO 4 ++#define SCHED_IDLEPRIO 5 + + #ifdef __KERNEL__ + ++#define SCHED_MAX SCHED_IDLEPRIO ++#define SCHED_RANGE(policy) ((policy) <= SCHED_MAX) ++ + struct sched_param { + int sched_priority; + }; +@@ -129,7 +134,7 @@ + extern unsigned long nr_active(void); + extern unsigned long nr_iowait(void); + extern unsigned long weighted_cpuload(const int cpu); +- ++extern int above_background_load(void); + + /* + * Task state bitmask. NOTE! These bits are also +@@ -150,8 +155,7 @@ + #define EXIT_ZOMBIE 16 + #define EXIT_DEAD 32 + /* in tsk->state again */ +-#define TASK_NONINTERACTIVE 64 +-#define TASK_DEAD 128 ++#define TASK_DEAD 64 + + #define __set_task_state(tsk, state_value) \ + do { (tsk)->state = (state_value); } while (0) +@@ -537,14 +541,19 @@ + + #define MAX_USER_RT_PRIO 100 + #define MAX_RT_PRIO MAX_USER_RT_PRIO ++#define PRIO_RANGE (40) ++#define ISO_PRIO (MAX_RT_PRIO - 1) + +-#define MAX_PRIO (MAX_RT_PRIO + 40) ++#define MAX_PRIO (MAX_RT_PRIO + PRIO_RANGE) + +-#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO) ++#define rt_prio(prio) unlikely((prio) < ISO_PRIO) + #define rt_task(p) rt_prio((p)->prio) + #define batch_task(p) (unlikely((p)->policy == SCHED_BATCH)) +-#define is_rt_policy(p) ((p) != SCHED_NORMAL && (p) != SCHED_BATCH) ++#define is_rt_policy(policy) ((policy) == SCHED_FIFO || \ ++ (policy) == SCHED_RR) + #define has_rt_policy(p) unlikely(is_rt_policy((p)->policy)) ++#define iso_task(p) unlikely((p)->policy == SCHED_ISO) ++#define idleprio_task(p) unlikely((p)->policy == SCHED_IDLEPRIO) + + /* + * Some day this will be a full-fledged user tracking system.. +@@ -809,13 +818,6 @@ + struct pipe_inode_info; + struct uts_namespace; + +-enum sleep_type { +- SLEEP_NORMAL, +- SLEEP_NONINTERACTIVE, +- SLEEP_INTERACTIVE, +- SLEEP_INTERRUPTED, +-}; +- + struct prio_array; + + struct task_struct { +@@ -835,20 +837,33 @@ + int load_weight; /* for niceness load balancing purposes */ + int prio, static_prio, normal_prio; + struct list_head run_list; ++ /* ++ * This bitmap shows what priorities this task has received quota ++ * from for this major priority rotation on its current runqueue. ++ */ ++ DECLARE_BITMAP(bitmap, PRIO_RANGE + 1); + struct prio_array *array; ++ /* Which major runqueue rotation did this task run */ ++ unsigned long rotation; + + unsigned short ioprio; + #ifdef CONFIG_BLK_DEV_IO_TRACE + unsigned int btrace_seq; + #endif +- unsigned long sleep_avg; + unsigned long long timestamp, last_ran; + unsigned long long sched_time; /* sched_clock time spent running */ +- enum sleep_type sleep_type; + + unsigned int policy; + cpumask_t cpus_allowed; +- unsigned int time_slice, first_time_slice; ++ /* ++ * How much this task is entitled to run at the current priority ++ * before being requeued at a lower priority. ++ */ ++ int time_slice; ++ /* Is this the very first time_slice this task has ever run. 
*/ ++ unsigned int first_time_slice; ++ /* How much this task receives at each priority level */ ++ int quota; + + #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) + struct sched_info sched_info; +@@ -1013,6 +1028,7 @@ + struct held_lock held_locks[MAX_LOCK_DEPTH]; + unsigned int lockdep_recursion; + #endif ++ unsigned long mutexes_held; + + /* journalling filesystem info */ + void *journal_info; +@@ -1181,9 +1197,11 @@ + #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ + #define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */ + #define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */ ++#define PF_ISOREF 0x04000000 /* SCHED_ISO task has used up quota */ + #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ + #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ + #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezeable */ ++#define PF_NONSLEEP 0x80000000 /* Waiting on in-kernel activity */ + + /* + * Only the _current_ task can read/write to tsk->flags, but other +@@ -1253,7 +1271,7 @@ + #endif + + extern void set_user_nice(struct task_struct *p, long nice); +-extern int task_prio(const struct task_struct *p); ++extern int task_prio(struct task_struct *p); + extern int task_nice(const struct task_struct *p); + extern int can_nice(const struct task_struct *p, const int nice); + extern int task_curr(const struct task_struct *p); +Index: linux-2.6.22-ck1/kernel/sched.c +=================================================================== +--- linux-2.6.22-ck1.orig/kernel/sched.c 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/kernel/sched.c 2007-07-10 14:55:24.000000000 +1000 +@@ -16,6 +16,7 @@ + * by Davide Libenzi, preemptible kernel bits by Robert Love. + * 2003-09-03 Interactivity tuning by Con Kolivas. + * 2004-04-02 Scheduler domains code by Nick Piggin ++ * 2007-03-02 Staircase deadline scheduling policy by Con Kolivas + */ + + #include +@@ -53,8 +54,9 @@ + #include + #include + #include +- ++#include + #include ++ + #include + + /* +@@ -84,147 +86,85 @@ + #define USER_PRIO(p) ((p)-MAX_RT_PRIO) + #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) + #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) ++#define SCHED_PRIO(p) ((p)+MAX_RT_PRIO) + +-/* +- * Some helpers for converting nanosecond timing to jiffy resolution +- */ +-#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) ++/* Some helpers for converting to/from various scales.*/ + #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) +- +-/* +- * These are the 'tuning knobs' of the scheduler: +- * +- * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger), +- * default timeslice is 100 msecs, maximum timeslice is 800 msecs. +- * Timeslices get refilled after they expire. +- */ +-#define MIN_TIMESLICE max(5 * HZ / 1000, 1) +-#define DEF_TIMESLICE (100 * HZ / 1000) +-#define ON_RUNQUEUE_WEIGHT 30 +-#define CHILD_PENALTY 95 +-#define PARENT_PENALTY 100 +-#define EXIT_WEIGHT 3 +-#define PRIO_BONUS_RATIO 25 +-#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100) +-#define INTERACTIVE_DELTA 2 +-#define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS) +-#define STARVATION_LIMIT (MAX_SLEEP_AVG) +-#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) +- +-/* +- * If a task is 'interactive' then we reinsert it in the active +- * array after it has expired its current timeslice. (it will not +- * continue to run immediately, it will still roundrobin with +- * other interactive tasks.) 
+- * +- * This part scales the interactivity limit depending on niceness. +- * +- * We scale it linearly, offset by the INTERACTIVE_DELTA delta. +- * Here are a few examples of different nice levels: +- * +- * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0] +- * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0] +- * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0] +- * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0] +- * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0] +- * +- * (the X axis represents the possible -5 ... 0 ... +5 dynamic +- * priority range a task can explore, a value of '1' means the +- * task is rated interactive.) +- * +- * Ie. nice +19 tasks can never get 'interactive' enough to be +- * reinserted into the active array. And only heavily CPU-hog nice -20 +- * tasks will be expired. Default nice 0 tasks are somewhere between, +- * it takes some effort for them to get interactive, but it's not +- * too hard. +- */ +- +-#define CURRENT_BONUS(p) \ +- (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \ +- MAX_SLEEP_AVG) +- +-#define GRANULARITY (10 * HZ / 1000 ? : 1) +- +-#ifdef CONFIG_SMP +-#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ +- (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \ +- num_online_cpus()) +-#else +-#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ +- (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1))) +-#endif +- +-#define SCALE(v1,v1_max,v2_max) \ +- (v1) * (v2_max) / (v1_max) +- +-#define DELTA(p) \ +- (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \ +- INTERACTIVE_DELTA) +- +-#define TASK_INTERACTIVE(p) \ +- ((p)->prio <= (p)->static_prio - DELTA(p)) +- +-#define INTERACTIVE_SLEEP(p) \ +- (JIFFIES_TO_NS(MAX_SLEEP_AVG * \ +- (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) +- +-#define TASK_PREEMPTS_CURR(p, rq) \ +- ((p)->prio < (rq)->curr->prio) +- +-#define SCALE_PRIO(x, prio) \ +- max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) +- +-static unsigned int static_prio_timeslice(int static_prio) +-{ +- if (static_prio < NICE_TO_PRIO(0)) +- return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio); +- else +- return SCALE_PRIO(DEF_TIMESLICE, static_prio); +-} +- +-#ifdef CONFIG_SMP +-/* +- * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) +- * Since cpu_power is a 'constant', we can use a reciprocal divide. ++#define MS_TO_NS(TIME) ((TIME) * 1000000) ++#define MS_TO_US(TIME) ((TIME) * 1000) ++#define US_TO_MS(TIME) ((TIME) / 1000) ++ ++#define TASK_PREEMPTS_CURR(p, curr) ((p)->prio < (curr)->prio) ++ ++/* ++ * This is the time all tasks within the same priority round robin. ++ * Value is in ms and set to a minimum of 10ms. Scales with number of cpus. ++ * Tunable via /proc interface. ++ */ ++int rr_interval __read_mostly = 6; ++int sched_interactive __read_mostly = 1; ++ ++/* ++ * sched_iso_cpu - sysctl which determines the cpu percentage SCHED_ISO tasks ++ * are allowed to run (over ISO_PERIOD seconds) as real time tasks. ++ * sched_iso_period - sysctl which determines the number of seconds over ++ * which cpu usage of SCHED_ISO tasks is averaged to determine if they are ++ * exceeding their allowable bandwidth. ++*/ ++int sched_iso_cpu __read_mostly = 80; ++int sched_iso_period __read_mostly = 5; ++ ++#define ISO_PERIOD ((sched_iso_period * HZ) + 1) ++ ++/* ++ * This contains a bitmap for each dynamic priority level with empty slots ++ * for the valid priorities each different nice level can have. 
It allows ++ * us to stagger the slots where differing priorities run in a way that ++ * keeps latency differences between different nice levels at a minimum. ++ * The purpose of a pre-generated matrix is for rapid lookup of next slot in ++ * O(1) time without having to recalculate every time priority gets demoted. ++ * All nice levels use priority slot 39 as this allows less niced tasks to ++ * get all priority slots better than that before expiration is forced. ++ * ie, where 0 means a slot for that priority, priority running from left to ++ * right is from prio 0 to prio 39: ++ * nice -20 0000000000000000000000000000000000000000 ++ * nice -10 1000100010001000100010001000100010010000 ++ * nice 0 1010101010101010101010101010101010101010 ++ * nice 5 1011010110110101101101011011010110110110 ++ * nice 10 1110111011101110111011101110111011101110 ++ * nice 15 1111111011111110111111101111111011111110 ++ * nice 19 1111111111111111111111111111111111111110 + */ +-static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load) +-{ +- return reciprocal_divide(load, sg->reciprocal_cpu_power); +-} ++static unsigned long prio_matrix[PRIO_RANGE][BITS_TO_LONGS(PRIO_RANGE)] ++ __read_mostly; + +-/* +- * Each time a sched group cpu_power is changed, +- * we must compute its reciprocal value +- */ +-static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val) +-{ +- sg->__cpu_power += val; +- sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power); +-} +-#endif ++struct rq; + + /* +- * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] +- * to time slice values: [800ms ... 100ms ... 5ms] +- * +- * The higher a thread's priority, the bigger timeslices +- * it gets during one round of execution. But even the lowest +- * priority thread gets MIN_TIMESLICE worth of execution time. ++ * These are the runqueue data structures: + */ ++struct prio_array { ++ /* Tasks queued at each priority */ ++ struct list_head queue[MAX_PRIO + 1]; + +-static inline unsigned int task_timeslice(struct task_struct *p) +-{ +- return static_prio_timeslice(p->static_prio); +-} ++ /* ++ * The bitmap of priorities queued for this array. While the expired ++ * array will never have realtime tasks on it, it is simpler to have ++ * equal sized bitmaps for a cheap array swap. Include 1 bit for ++ * delimiter. ++ */ ++ DECLARE_BITMAP(prio_bitmap, MAX_PRIO + 1); + +-/* +- * These are the runqueue data structures: +- */ ++ /* ++ * The best static priority (of the dynamic priority tasks) queued ++ * this array. ++ */ ++ int best_static_prio; + +-struct prio_array { +- unsigned int nr_active; +- DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */ +- struct list_head queue[MAX_PRIO]; ++#ifdef CONFIG_SMP ++ /* For convenience looks back at rq */ ++ struct rq *rq; ++#endif + }; + + /* +@@ -260,14 +200,28 @@ + */ + unsigned long nr_uninterruptible; + +- unsigned long expired_timestamp; + /* Cached timestamp set by update_cpu_clock() */ + unsigned long long most_recent_timestamp; + struct task_struct *curr, *idle; + unsigned long next_balance; + struct mm_struct *prev_mm; +- struct prio_array *active, *expired, arrays[2]; +- int best_expired_prio; ++ ++ struct prio_array *active, *expired, *idleprio, arrays[2]; ++ unsigned long *dyn_bitmap, *exp_bitmap; ++ ++ /* ++ * The current dynamic priority level this runqueue is at per static ++ * priority level. 
++ */ ++ int prio_level[PRIO_RANGE]; ++ ++ /* How many times we have rotated the priority queue */ ++ unsigned long prio_rotation; ++ unsigned long iso_ticks; ++ unsigned short iso_refractory; ++ ++ /* Number of idleprio tasks running */ ++ unsigned long nr_idleprio; + atomic_t nr_iowait; + + #ifdef CONFIG_SMP +@@ -606,12 +560,9 @@ + #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) + /* + * Called when a process is dequeued from the active array and given +- * the cpu. We should note that with the exception of interactive +- * tasks, the expired queue will become the active queue after the active +- * queue is empty, without explicitly dequeuing and requeuing tasks in the +- * expired queue. (Interactive tasks may be requeued directly to the +- * active queue, thus delaying tasks in the expired queue from running; +- * see scheduler_tick()). ++ * the cpu. We should note that the expired queue will become the active ++ * queue after the active queue is empty, without explicitly dequeuing and ++ * requeuing tasks in the expired queue. + * + * This function is only called from sched_info_arrive(), rather than + * dequeue_task(). Even though a task may be queued and dequeued multiple +@@ -709,71 +660,304 @@ + #define sched_info_switch(t, next) do { } while (0) + #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ + ++static int idleprio_suitable(struct task_struct *p) ++{ ++ return (!p->mutexes_held && !freezing(p) && !signal_pending(p) && ++ !(p->flags & (PF_NONSLEEP | PF_EXITING))); ++} ++ ++static int idleprio(const struct task_struct *p) ++{ ++ return (p->prio == MAX_PRIO); ++} ++ ++static inline int task_queued(struct task_struct *task) ++{ ++ return !list_empty(&task->run_list); ++} ++ ++static inline void set_dynamic_bit(struct task_struct *p, struct rq *rq) ++{ ++ __set_bit(p->prio, p->array->prio_bitmap); ++} ++ + /* +- * Adding/removing a task to/from a priority array: ++ * Removing from a runqueue. + */ +-static void dequeue_task(struct task_struct *p, struct prio_array *array) ++static void dequeue_task(struct task_struct *p, struct rq *rq) + { +- array->nr_active--; +- list_del(&p->run_list); +- if (list_empty(array->queue + p->prio)) +- __clear_bit(p->prio, array->bitmap); ++ list_del_init(&p->run_list); ++ if (idleprio_task(p) && idleprio(p)) ++ rq->nr_idleprio--; ++ else if (list_empty(p->array->queue + p->prio)) ++ __clear_bit(p->prio, p->array->prio_bitmap); + } + +-static void enqueue_task(struct task_struct *p, struct prio_array *array) ++static void reset_first_time_slice(struct task_struct *p) + { +- sched_info_queued(p); +- list_add_tail(&p->run_list, array->queue + p->prio); +- __set_bit(p->prio, array->bitmap); +- array->nr_active++; ++ if (unlikely(p->first_time_slice)) ++ p->first_time_slice = 0; ++} ++ ++/* ++ * The task is being queued on a fresh array so it has its entitlement ++ * bitmap cleared. 
++ */ ++static void task_new_array(struct task_struct *p, struct rq *rq, ++ struct prio_array *array) ++{ ++ bitmap_zero(p->bitmap, PRIO_RANGE); ++ p->rotation = rq->prio_rotation; ++ p->time_slice = p->quota; + p->array = array; ++ reset_first_time_slice(p); ++} ++ ++/* Find the first slot from the relevant prio_matrix entry */ ++static int first_prio_slot(struct task_struct *p) ++{ ++ if (unlikely(p->policy == SCHED_BATCH)) ++ return p->static_prio; ++ return SCHED_PRIO(find_first_zero_bit( ++ prio_matrix[USER_PRIO(p->static_prio)], PRIO_RANGE)); + } + + /* +- * Put task to the end of the run list without the overhead of dequeue +- * followed by enqueue. ++ * In sched_interactive mode priority allocation occurs per process per rq ++ * array swap. In !sched_interactive mode all waking tasks must obey the ++ * current prio level of all other tasks running per array swap. + */ +-static void requeue_task(struct task_struct *p, struct prio_array *array) ++static int minprio(struct rq *rq, int uprio) + { +- list_move_tail(&p->run_list, array->queue + p->prio); ++ if (sched_interactive) ++ return MAX_RT_PRIO; ++ return rq->prio_level[uprio]; + } + +-static inline void +-enqueue_task_head(struct task_struct *p, struct prio_array *array) ++/* ++ * Find the first unused slot by this task that is also in its prio_matrix ++ * level. SCHED_BATCH tasks do not use the priority matrix. They only take ++ * priority slots from their static_prio and above. ++ */ ++static int next_entitled_slot(struct task_struct *p, struct rq *rq) + { +- list_add(&p->run_list, array->queue + p->prio); +- __set_bit(p->prio, array->bitmap); +- array->nr_active++; +- p->array = array; ++ int search_prio = MAX_RT_PRIO, uprio = USER_PRIO(p->static_prio); ++ struct prio_array *array = rq->active; ++ DECLARE_BITMAP(tmp, PRIO_RANGE); ++ ++ /* ++ * Go straight to expiration if there are higher priority tasks ++ * already expired. ++ */ ++ if (p->static_prio > rq->expired->best_static_prio) ++ return MAX_PRIO; ++ if (!rq->prio_level[uprio]) ++ rq->prio_level[uprio] = MAX_RT_PRIO; ++ /* ++ * Only priorities equal to the prio_level and above for their ++ * static_prio are acceptable, and only if it's not better than ++ * a queued better static_prio's prio_level. 
++ */ ++ if (p->static_prio < array->best_static_prio) { ++ if (likely(p->policy != SCHED_BATCH)) ++ array->best_static_prio = p->static_prio; ++ } else if (p->static_prio == array->best_static_prio) { ++ search_prio = minprio(rq, uprio); ++ } else { ++ int i; ++ ++ search_prio = minprio(rq, uprio); ++ /* A bound O(n) function, worst case n is 40 */ ++ for (i = array->best_static_prio; i <= p->static_prio ; i++) { ++ if (!rq->prio_level[USER_PRIO(i)]) ++ rq->prio_level[USER_PRIO(i)] = MAX_RT_PRIO; ++ search_prio = max(search_prio, ++ rq->prio_level[USER_PRIO(i)]); ++ } ++ } ++ if (unlikely(p->policy == SCHED_BATCH)) { ++ search_prio = max(search_prio, p->static_prio); ++ return SCHED_PRIO(find_next_zero_bit(p->bitmap, PRIO_RANGE, ++ USER_PRIO(search_prio))); ++ } ++ bitmap_or(tmp, p->bitmap, prio_matrix[uprio], PRIO_RANGE); ++ return SCHED_PRIO(find_next_zero_bit(tmp, PRIO_RANGE, ++ USER_PRIO(search_prio))); ++} ++ ++static void queue_expired(struct task_struct *p, struct rq *rq) ++{ ++ task_new_array(p, rq, rq->expired); ++ p->prio = p->normal_prio = first_prio_slot(p); ++ if (p->static_prio < rq->expired->best_static_prio) ++ rq->expired->best_static_prio = p->static_prio; ++ reset_first_time_slice(p); + } + ++#ifdef CONFIG_SMP + /* +- * __normal_prio - return the priority that is based on the static +- * priority but is modified by bonuses/penalties. +- * +- * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] +- * into the -5 ... 0 ... +5 bonus/penalty range. +- * +- * We use 25% of the full 0...39 priority range so that: +- * +- * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. +- * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. +- * +- * Both properties are important to certain workloads. ++ * If we're waking up a task that was previously on a different runqueue, ++ * update its data appropriately. Note we may be reading data from src_rq-> ++ * outside of lock, but the occasional inaccurate result should be harmless. + */ ++ static void update_if_moved(struct task_struct *p, struct rq *rq) ++{ ++ struct rq *src_rq = p->array->rq; ++ ++ if (src_rq == rq) ++ return; ++ /* ++ * Only need to set p->array when p->rotation == rq->prio_rotation as ++ * they will be set in recalc_task_prio when != rq->prio_rotation. ++ */ ++ if (p->rotation == src_rq->prio_rotation) { ++ p->rotation = rq->prio_rotation; ++ if (p->array == src_rq->expired) ++ p->array = rq->expired; ++ else ++ p->array = rq->active; ++ } else ++ p->rotation = 0; ++} ++#else ++static inline void update_if_moved(struct task_struct *p, struct rq *rq) ++{ ++} ++#endif + +-static inline int __normal_prio(struct task_struct *p) ++static inline int isoprio_suitable(struct task_struct *p) + { +- int bonus, prio; ++ return !(p->flags & PF_ISOREF); ++} + +- bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; ++static int task_timeslice(struct task_struct *p); + +- prio = p->static_prio - bonus; +- if (prio < MAX_RT_PRIO) +- prio = MAX_RT_PRIO; +- if (prio > MAX_PRIO-1) +- prio = MAX_PRIO-1; +- return prio; ++/* ++ * recalc_task_prio determines what priority a non rt_task will be ++ * queued at. If the task has already been running during this runqueue's ++ * major rotation (rq->prio_rotation) then it continues at the same ++ * priority if it has tick entitlement left. If it does not have entitlement ++ * left, it finds the next priority slot according to its nice value that it ++ * has not extracted quota from. 
If it has not run during this major ++ * rotation, it starts at the next_entitled_slot and has its bitmap quota ++ * cleared. If it does not have any slots left it has all its slots reset and ++ * is queued on the expired at its first_prio_slot. ++ */ ++static void recalc_task_prio(struct task_struct *p, struct rq *rq) ++{ ++ struct prio_array *array = rq->active; ++ int queue_prio; ++ ++ if (iso_task(p)) { ++ if (isoprio_suitable(p)) { ++ /* ++ * If SCHED_ISO tasks have not used up their real time ++ * quota they have run just better than highest ++ * SCHED_NORMAL priority. Otherwise they run as ++ * SCHED_NORMAL. ++ */ ++ p->prio = p->normal_prio = ISO_PRIO; ++ p->array = rq->active; ++ if (p->time_slice <= 0) ++ p->time_slice = p->quota; ++ return; ++ } else if (p->prio == ISO_PRIO) { ++ /* Just about to be demoted to SCHED_NORMAL */ ++ p->time_slice = 0; ++ } ++ } else if (idleprio_task(p)) { ++ if (idleprio_suitable(p)) { ++ /* ++ * If suitable idleprio_tasks are queued at MAX_PRIO ++ * only on the idleprio array. Their time_slice is ++ * their full task_timeslice as they cooperatively ++ * multitask. ++ */ ++ p->prio = p->normal_prio = MAX_PRIO; ++ p->array = rq->idleprio; ++ if (p->time_slice <= 0) ++ p->time_slice = task_timeslice(p); ++ return; ++ } ++ /* ++ * If unsuitable idleprio_tasks are queued equivalent to ++ * nice 19 tasks on the expired array. ++ */ ++ p->flags &= ~PF_NONSLEEP; ++ p->prio = p->normal_prio = MAX_PRIO - 1; ++ p->array = rq->expired; ++ if (p->time_slice <= 0 || p->time_slice > p->quota) ++ p->time_slice = p->quota; ++ return; ++ } ++ ++ update_if_moved(p, rq); ++ if (p->rotation == rq->prio_rotation) { ++ if (p->array == array) { ++ if (p->time_slice > 0) ++ return; ++ p->time_slice = p->quota; ++ } else if (p->array == rq->expired) { ++ queue_expired(p, rq); ++ return; ++ } else ++ task_new_array(p, rq, array); ++ } else ++ task_new_array(p, rq, array); ++ ++ queue_prio = next_entitled_slot(p, rq); ++ if (queue_prio >= MAX_PRIO) { ++ queue_expired(p, rq); ++ return; ++ } ++ p->prio = p->normal_prio = queue_prio; ++ __set_bit(USER_PRIO(p->prio), p->bitmap); ++} ++ ++/* ++ * Adding to a runqueue. The dynamic priority queue that it is added to is ++ * determined by recalc_task_prio() above. ++ */ ++static inline void __enqueue_task(struct task_struct *p, struct rq *rq) ++{ ++ if (rt_task(p)) ++ p->array = rq->active; ++ else ++ recalc_task_prio(p, rq); ++ ++ if (idleprio_task(p) && idleprio(p)) ++ rq->nr_idleprio++; ++ sched_info_queued(p); ++ set_dynamic_bit(p, rq); ++} ++ ++static void enqueue_task(struct task_struct *p, struct rq *rq) ++{ ++ __enqueue_task(p, rq); ++ list_add_tail(&p->run_list, p->array->queue + p->prio); ++} ++ ++static inline void enqueue_task_head(struct task_struct *p, struct rq *rq) ++{ ++ __enqueue_task(p, rq); ++ list_add(&p->run_list, p->array->queue + p->prio); ++} ++ ++/* ++ * requeue_task is only called when p->static_prio does not change. p->prio ++ * can change with dynamic tasks. 
++ */ ++static void requeue_task(struct task_struct *p, struct rq *rq, ++ struct prio_array *old_array, int old_prio) ++{ ++ if (p->array == rq->expired) ++ queue_expired(p, rq); ++ list_move_tail(&p->run_list, p->array->queue + p->prio); ++ if (!rt_task(p)) { ++ if (list_empty(old_array->queue + old_prio)) ++ __clear_bit(old_prio, old_array->prio_bitmap); ++ set_dynamic_bit(p, rq); ++ } + } + + /* +@@ -786,20 +970,29 @@ + */ + + /* +- * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE +- * If static_prio_timeslice() is ever changed to break this assumption then +- * this code will need modification +- */ +-#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE +-#define LOAD_WEIGHT(lp) \ +- (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO) +-#define PRIO_TO_LOAD_WEIGHT(prio) \ +- LOAD_WEIGHT(static_prio_timeslice(prio)) +-#define RTPRIO_TO_LOAD_WEIGHT(rp) \ +- (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp)) ++ * task_timeslice - the total duration a task can run during one major ++ * rotation. Returns value in milliseconds as the smallest value can be 1. ++ */ ++static int task_timeslice(struct task_struct *p) ++{ ++ int slice = p->quota; /* quota is in us */ ++ ++ if (!rt_task(p)) ++ slice += (PRIO_RANGE - 1 - TASK_USER_PRIO(p)) * slice; ++ return US_TO_MS(slice); ++} ++ ++/* ++ * The load weight is basically the task_timeslice in ms. Realtime tasks are ++ * special cased to be proportionately larger than nice -20 by their ++ * rt_priority. The weight for rt tasks can only be arbitrary at best. ++ */ ++#define RTPRIO_TO_LOAD_WEIGHT(rp) (rr_interval * 20 * (40 + rp)) + + static void set_load_weight(struct task_struct *p) + { ++ int load_weight; ++ + if (has_rt_policy(p)) { + #ifdef CONFIG_SMP + if (p == task_rq(p)->migration_thread) +@@ -808,12 +1001,19 @@ + * Giving its load any weight will skew balancing + * adversely. + */ +- p->load_weight = 0; ++ load_weight = 0; + else + #endif +- p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority); ++ load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority); + } else +- p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio); ++ load_weight = task_timeslice(p); ++ /* ++ * idleprio tasks have much lower weight than SCHED_NORMAL tasks but ++ * still need to be weighted to allow balancing to occur. ++ */ ++ if (likely(!idleprio_task(p))) ++ load_weight *= PRIO_RANGE; ++ p->load_weight = load_weight; + } + + static inline void +@@ -841,28 +1041,38 @@ + } + + /* +- * Calculate the expected normal priority: i.e. priority +- * without taking RT-inheritance into account. Might be +- * boosted by interactivity modifiers. Changes upon fork, +- * setprio syscalls, and whenever the interactivity +- * estimator recalculates. ++ * __activate_task - move a task to the runqueue. + */ +-static inline int normal_prio(struct task_struct *p) ++static inline void __activate_task(struct task_struct *p, struct rq *rq) + { +- int prio; ++ enqueue_task(p, rq); ++ inc_nr_running(p, rq); ++} + ++/* ++ * __activate_idle_task - move idle task to the _front_ of runqueue. 
++ */ ++static inline void __activate_idle_task(struct task_struct *p, struct rq *rq) ++{ ++ enqueue_task_head(p, rq); ++ inc_nr_running(p, rq); ++} ++ ++static inline int normal_prio(struct task_struct *p) ++{ + if (has_rt_policy(p)) +- prio = MAX_RT_PRIO-1 - p->rt_priority; ++ return MAX_RT_PRIO-1 - p->rt_priority; ++ /* Other tasks all have normal_prio set in recalc_task_prio */ ++ if (likely(p->prio >= MAX_RT_PRIO && p->prio < MAX_PRIO)) ++ return p->prio; + else +- prio = __normal_prio(p); +- return prio; ++ return p->static_prio; + } + + /* + * Calculate the current priority, i.e. the priority + * taken into account by the scheduler. This value might +- * be boosted by RT tasks, or might be boosted by +- * interactivity modifiers. Will be RT if the task got ++ * be boosted by RT tasks as it will be RT if the task got + * RT-boosted. If not then it returns p->normal_prio. + */ + static int effective_prio(struct task_struct *p) +@@ -878,112 +1088,70 @@ + return p->prio; + } + +-/* +- * __activate_task - move a task to the runqueue. +- */ +-static void __activate_task(struct task_struct *p, struct rq *rq) ++static inline unsigned int nice_quota_ms(int nice) + { +- struct prio_array *target = rq->active; ++ unsigned int rr = rr_interval; + +- if (batch_task(p)) +- target = rq->expired; +- enqueue_task(p, target); +- inc_nr_running(p, rq); ++ if (nice < -6) { ++ rr *= nice * nice; ++ rr /= 40; ++ } else if (nice > 0) ++ rr = rr / 2 ? : 1; ++ return rr; + } + ++#define DEFAULT_WEIGHT (nice_quota_ms(0) * 20 * PRIO_RANGE) ++ + /* +- * __activate_idle_task - move idle task to the _front_ of runqueue. ++ * A runqueue laden with a single nice 0 task scores a weighted_cpuload of ++ * SCHED_LOAD_SCALE. This function returns 1 if any cpu is laden with a ++ * task of nice 0 or enough lower priority tasks to bring up the ++ * weighted_cpuload + */ +-static inline void __activate_idle_task(struct task_struct *p, struct rq *rq) ++int above_background_load(void) + { +- enqueue_task_head(p, rq->active); +- inc_nr_running(p, rq); ++ unsigned long cpu; ++ ++ for_each_online_cpu(cpu) { ++ if (weighted_cpuload(cpu) >= DEFAULT_WEIGHT) ++ return 1; ++ } ++ return 0; + } + + /* +- * Recalculate p->normal_prio and p->prio after having slept, +- * updating the sleep-average too: ++ * All tasks have quotas based on rr_interval. RT tasks all get rr_interval. ++ * From nice 1 to 19 they are smaller than it only if they are at least one ++ * tick still. Below nice 0 they get progressively larger. ++ * ie nice -6..0 = rr_interval. nice -10 = 2.5 * rr_interval ++ * nice -20 = 10 * rr_interval. nice 1-19 = rr_interval / 2. ++ * Value returned is in microseconds. + */ +-static int recalc_task_prio(struct task_struct *p, unsigned long long now) ++static inline unsigned int rr_quota(struct task_struct *p) + { +- /* Caller must always ensure 'now >= p->timestamp' */ +- unsigned long sleep_time = now - p->timestamp; ++ unsigned int quota; + +- if (batch_task(p)) +- sleep_time = 0; +- +- if (likely(sleep_time > 0)) { +- /* +- * This ceiling is set to the lowest priority that would allow +- * a task to be reinserted into the active array on timeslice +- * completion. +- */ +- unsigned long ceiling = INTERACTIVE_SLEEP(p); +- +- if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) { +- /* +- * Prevents user tasks from achieving best priority +- * with one single large enough sleep. 
+- */ +- p->sleep_avg = ceiling; +- /* +- * Using INTERACTIVE_SLEEP() as a ceiling places a +- * nice(0) task 1ms sleep away from promotion, and +- * gives it 700ms to round-robin with no chance of +- * being demoted. This is more than generous, so +- * mark this sleep as non-interactive to prevent the +- * on-runqueue bonus logic from intervening should +- * this task not receive cpu immediately. +- */ +- p->sleep_type = SLEEP_NONINTERACTIVE; +- } else { +- /* +- * Tasks waking from uninterruptible sleep are +- * limited in their sleep_avg rise as they +- * are likely to be waiting on I/O +- */ +- if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) { +- if (p->sleep_avg >= ceiling) +- sleep_time = 0; +- else if (p->sleep_avg + sleep_time >= +- ceiling) { +- p->sleep_avg = ceiling; +- sleep_time = 0; +- } +- } +- +- /* +- * This code gives a bonus to interactive tasks. +- * +- * The boost works by updating the 'average sleep time' +- * value here, based on ->timestamp. The more time a +- * task spends sleeping, the higher the average gets - +- * and the higher the priority boost gets as well. +- */ +- p->sleep_avg += sleep_time; +- +- } +- if (p->sleep_avg > NS_MAX_SLEEP_AVG) +- p->sleep_avg = NS_MAX_SLEEP_AVG; +- } ++ if (rt_task(p)) ++ quota = rr_interval; ++ else ++ quota = nice_quota_ms(TASK_NICE(p)); ++ return MS_TO_US(quota); ++} + +- return effective_prio(p); ++/* Every time we set the quota we need to set the load weight */ ++static void set_quota(struct task_struct *p) ++{ ++ p->quota = rr_quota(p); ++ set_load_weight(p); + } + + /* + * activate_task - move a task to the runqueue and do priority recalculation +- * +- * Update all the scheduling statistics stuff. (sleep average +- * calculation, priority modifiers, etc.) + */ + static void activate_task(struct task_struct *p, struct rq *rq, int local) + { +- unsigned long long now; +- +- if (rt_task(p)) +- goto out; ++ unsigned long long now = sched_clock(); + +- now = sched_clock(); + #ifdef CONFIG_SMP + if (!local) { + /* Compensate for drifting sched_clock */ +@@ -1004,32 +1172,9 @@ + (now - p->timestamp) >> 20); + } + +- p->prio = recalc_task_prio(p, now); +- +- /* +- * This checks to make sure it's not an uninterruptible task +- * that is now waking up. +- */ +- if (p->sleep_type == SLEEP_NORMAL) { +- /* +- * Tasks which were woken up by interrupts (ie. hw events) +- * are most likely of interactive nature. So we give them +- * the credit of extending their sleep time to the period +- * of time they spend on the runqueue, waiting for execution +- * on a CPU, first time around: +- */ +- if (in_interrupt()) +- p->sleep_type = SLEEP_INTERRUPTED; +- else { +- /* +- * Normal first-time wakeups get a credit too for +- * on-runqueue time, but it will be weighted down: +- */ +- p->sleep_type = SLEEP_INTERACTIVE; +- } +- } ++ set_quota(p); ++ p->prio = effective_prio(p); + p->timestamp = now; +-out: + __activate_task(p, rq); + } + +@@ -1039,8 +1184,7 @@ + static void deactivate_task(struct task_struct *p, struct rq *rq) + { + dec_nr_running(p, rq); +- dequeue_task(p, p->array); +- p->array = NULL; ++ dequeue_task(p, rq); + } + + /* +@@ -1133,7 +1277,7 @@ + * If the task is not on a runqueue (and not running), then + * it is sufficient to simply update the task's cpu field. 
+ */ +- if (!p->array && !task_running(rq, p)) { ++ if (!task_queued(p) && !task_running(rq, p)) { + set_task_cpu(p, dest_cpu); + return 0; + } +@@ -1159,7 +1303,6 @@ + { + unsigned long flags; + struct rq *rq; +- struct prio_array *array; + int running; + + repeat: +@@ -1192,7 +1335,6 @@ + */ + rq = task_rq_lock(p, &flags); + running = task_running(rq, p); +- array = p->array; + task_rq_unlock(rq, &flags); + + /* +@@ -1215,7 +1357,7 @@ + * running right now), it's preempted, and we should + * yield - it could be a while. + */ +- if (unlikely(array)) { ++ if (unlikely(task_queued(p))) { + yield(); + goto repeat; + } +@@ -1294,6 +1436,25 @@ + } + + /* ++ * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) ++ * Since cpu_power is a 'constant', we can use a reciprocal divide. ++ */ ++static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load) ++{ ++ return reciprocal_divide(load, sg->reciprocal_cpu_power); ++} ++ ++/* ++ * Each time a sched group cpu_power is changed, ++ * we must compute its reciprocal value ++ */ ++static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val) ++{ ++ sg->__cpu_power += val; ++ sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power); ++} ++ ++/* + * find_idlest_group finds and returns the least busy CPU group within the + * domain. + */ +@@ -1490,6 +1651,31 @@ + } + #endif + ++/* ++ * We need to have a special definition for an idle runqueue when testing ++ * for preemption on CONFIG_HOTPLUG_CPU as the idle task may be scheduled as ++ * a realtime task in sched_idle_next. ++ */ ++#ifdef CONFIG_HOTPLUG_CPU ++#define rq_idle(rq) ((rq)->curr == (rq)->idle && !rt_task((rq)->curr)) ++#else ++#define rq_idle(rq) ((rq)->curr == (rq)->idle) ++#endif ++ ++static inline int task_preempts_curr(struct task_struct *p, struct rq *rq) ++{ ++ struct task_struct *curr = rq->curr; ++ ++ return ((p->array == task_rq(p)->active && ++ TASK_PREEMPTS_CURR(p, curr)) || rq_idle(rq)); ++} ++ ++static inline void try_preempt(struct task_struct *p, struct rq *rq) ++{ ++ if (task_preempts_curr(p, rq)) ++ resched_task(rq->curr); ++} ++ + /*** + * try_to_wake_up - wake up a thread + * @p: the to-be-woken-up thread +@@ -1521,7 +1707,7 @@ + if (!(old_state & state)) + goto out; + +- if (p->array) ++ if (task_queued(p)) + goto out_running; + + cpu = task_cpu(p); +@@ -1614,7 +1800,7 @@ + old_state = p->state; + if (!(old_state & state)) + goto out; +- if (p->array) ++ if (task_queued(p)) + goto out_running; + + this_cpu = smp_processor_id(); +@@ -1623,25 +1809,9 @@ + + out_activate: + #endif /* CONFIG_SMP */ +- if (old_state == TASK_UNINTERRUPTIBLE) { ++ if (old_state == TASK_UNINTERRUPTIBLE) + rq->nr_uninterruptible--; +- /* +- * Tasks on involuntary sleep don't earn +- * sleep_avg beyond just interactive state. +- */ +- p->sleep_type = SLEEP_NONINTERACTIVE; +- } else +- +- /* +- * Tasks that have marked their sleep as noninteractive get +- * woken up with their sleep average not weighted in an +- * interactive way. +- */ +- if (old_state & TASK_NONINTERACTIVE) +- p->sleep_type = SLEEP_NONINTERACTIVE; +- + +- activate_task(p, rq, cpu == this_cpu); + /* + * Sync wakeups (i.e. those types of wakeups where the waker + * has indicated that it will leave the CPU in short order) +@@ -1650,15 +1820,22 @@ + * the waker guarantees that the freshly woken up task is going + * to be considered on this CPU.) 
+ */ +- if (!sync || cpu != this_cpu) { +- if (TASK_PREEMPTS_CURR(p, rq)) +- resched_task(rq->curr); +- } ++ activate_task(p, rq, cpu == this_cpu); ++ if (!sync || cpu != this_cpu) ++ try_preempt(p, rq); + success = 1; + + out_running: + p->state = TASK_RUNNING; + out: ++ /* ++ * Special case when freezing we need to reschedule idleprio tasks ++ * as SCHED_NORMAL or else they'll never freeze ++ */ ++ if (idleprio_task(p) && freezing(p) && idleprio(p)) { ++ dequeue_task(p, rq); ++ enqueue_task(p, rq); ++ } + task_rq_unlock(rq, &flags); + + return success; +@@ -1676,7 +1853,6 @@ + return try_to_wake_up(p, state, 0); + } + +-static void task_running_tick(struct rq *rq, struct task_struct *p); + /* + * Perform scheduler related setup for a newly forked process p. + * p is forked by current. +@@ -1704,7 +1880,6 @@ + p->prio = current->normal_prio; + + INIT_LIST_HEAD(&p->run_list); +- p->array = NULL; + #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) + if (unlikely(sched_info_on())) + memset(&p->sched_info, 0, sizeof(p->sched_info)); +@@ -1716,30 +1891,31 @@ + /* Want to start with kernel preemption disabled. */ + task_thread_info(p)->preempt_count = 1; + #endif ++ if (unlikely(p->policy == SCHED_FIFO)) ++ goto out; + /* + * Share the timeslice between parent and child, thus the + * total amount of pending timeslices in the system doesn't change, + * resulting in more scheduling fairness. + */ + local_irq_disable(); +- p->time_slice = (current->time_slice + 1) >> 1; +- /* +- * The remainder of the first timeslice might be recovered by +- * the parent if the child exits early enough. +- */ +- p->first_time_slice = 1; +- current->time_slice >>= 1; +- p->timestamp = sched_clock(); +- if (unlikely(!current->time_slice)) { ++ if (current->time_slice > 0) { ++ current->time_slice /= 2; ++ if (current->time_slice) ++ p->time_slice = current->time_slice; ++ else ++ p->time_slice = 1; + /* +- * This case is rare, it happens when the parent has only +- * a single jiffy left from its timeslice. Taking the +- * runqueue lock is not a problem. ++ * The remainder of the first timeslice might be recovered by ++ * the parent if the child exits early enough. + */ +- current->time_slice = 1; +- task_running_tick(cpu_rq(cpu), current); +- } ++ p->first_time_slice = 1; ++ } else ++ p->time_slice = 0; ++ ++ p->timestamp = sched_clock(); + local_irq_enable(); ++out: + put_cpu(); + } + +@@ -1761,38 +1937,16 @@ + this_cpu = smp_processor_id(); + cpu = task_cpu(p); + +- /* +- * We decrease the sleep average of forking parents +- * and children as well, to keep max-interactive tasks +- * from forking tasks that are max-interactive. The parent +- * (current) is done further down, under its lock. +- */ +- p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * +- CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); +- +- p->prio = effective_prio(p); +- + if (likely(cpu == this_cpu)) { ++ activate_task(p, rq, 1); + if (!(clone_flags & CLONE_VM)) { + /* + * The VM isn't cloned, so we're in a good position to + * do child-runs-first in anticipation of an exec. This + * usually avoids a lot of COW overhead. 
+ */ +- if (unlikely(!current->array)) +- __activate_task(p, rq); +- else { +- p->prio = current->prio; +- p->normal_prio = current->normal_prio; +- list_add_tail(&p->run_list, ¤t->run_list); +- p->array = current->array; +- p->array->nr_active++; +- inc_nr_running(p, rq); +- } + set_need_resched(); +- } else +- /* Run child last */ +- __activate_task(p, rq); ++ } + /* + * We skip the following code due to cpu == this_cpu + * +@@ -1809,19 +1963,16 @@ + */ + p->timestamp = (p->timestamp - this_rq->most_recent_timestamp) + + rq->most_recent_timestamp; +- __activate_task(p, rq); +- if (TASK_PREEMPTS_CURR(p, rq)) +- resched_task(rq->curr); ++ activate_task(p, rq, 0); ++ try_preempt(p, rq); + + /* + * Parent and child are on different CPUs, now get the +- * parent runqueue to update the parent's ->sleep_avg: ++ * parent runqueue to update the parent's ->flags: + */ + task_rq_unlock(rq, &flags); + this_rq = task_rq_lock(current, &flags); + } +- current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * +- PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); + task_rq_unlock(this_rq, &flags); + } + +@@ -1836,23 +1987,17 @@ + */ + void fastcall sched_exit(struct task_struct *p) + { ++ struct task_struct *parent; + unsigned long flags; + struct rq *rq; + +- /* +- * If the child was a (relative-) CPU hog then decrease +- * the sleep_avg of the parent as well. +- */ +- rq = task_rq_lock(p->parent, &flags); +- if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) { +- p->parent->time_slice += p->time_slice; +- if (unlikely(p->parent->time_slice > task_timeslice(p))) +- p->parent->time_slice = task_timeslice(p); +- } +- if (p->sleep_avg < p->parent->sleep_avg) +- p->parent->sleep_avg = p->parent->sleep_avg / +- (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg / +- (EXIT_WEIGHT + 1); ++ parent = p->parent; ++ rq = task_rq_lock(parent, &flags); ++ if (p->first_time_slice > 0 && task_cpu(p) == task_cpu(parent)) { ++ parent->time_slice += p->time_slice; ++ if (unlikely(parent->time_slice > parent->quota)) ++ parent->time_slice = parent->quota; ++ } + task_rq_unlock(rq, &flags); + } + +@@ -2184,23 +2329,17 @@ + * pull_task - move a task from a remote runqueue to the local runqueue. + * Both runqueues must be locked. + */ +-static void pull_task(struct rq *src_rq, struct prio_array *src_array, +- struct task_struct *p, struct rq *this_rq, +- struct prio_array *this_array, int this_cpu) ++static void pull_task(struct rq *src_rq, struct task_struct *p, ++ struct rq *this_rq, int this_cpu) + { +- dequeue_task(p, src_array); ++ dequeue_task(p, src_rq); + dec_nr_running(p, src_rq); + set_task_cpu(p, this_cpu); + inc_nr_running(p, this_rq); +- enqueue_task(p, this_array); ++ enqueue_task(p, this_rq); + p->timestamp = (p->timestamp - src_rq->most_recent_timestamp) + + this_rq->most_recent_timestamp; +- /* +- * Note that idle threads have a prio of MAX_PRIO, for this test +- * to be always true for them. 
+- */ +- if (TASK_PREEMPTS_CURR(p, this_rq)) +- resched_task(this_rq->curr); ++ try_preempt(p, this_rq); + } + + /* +@@ -2243,7 +2382,16 @@ + return 1; + } + +-#define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio) ++static inline int rq_best_prio(struct rq *rq) ++{ ++ int best_prio, exp_prio; ++ ++ best_prio = sched_find_first_bit(rq->dyn_bitmap); ++ exp_prio = find_next_bit(rq->exp_bitmap, MAX_PRIO, MAX_RT_PRIO); ++ if (unlikely(best_prio > exp_prio)) ++ best_prio = exp_prio; ++ return best_prio; ++} + + /* + * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted +@@ -2259,7 +2407,7 @@ + { + int idx, pulled = 0, pinned = 0, this_best_prio, best_prio, + best_prio_seen, skip_for_load; +- struct prio_array *array, *dst_array; ++ struct prio_array *array; + struct list_head *head, *curr; + struct task_struct *tmp; + long rem_load_move; +@@ -2286,31 +2434,29 @@ + * be cache-cold, thus switching CPUs has the least effect + * on them. + */ +- if (busiest->expired->nr_active) { +- array = busiest->expired; +- dst_array = this_rq->expired; +- } else { +- array = busiest->active; +- dst_array = this_rq->active; +- } +- ++ array = busiest->expired; + new_array: +- /* Start searching at priority 0: */ +- idx = 0; ++ /* Expired arrays don't have RT tasks so they're always MAX_RT_PRIO+ */ ++ if (array == busiest->expired) ++ idx = MAX_RT_PRIO; ++ else ++ idx = 0; + skip_bitmap: + if (!idx) +- idx = sched_find_first_bit(array->bitmap); ++ idx = sched_find_first_bit(array->prio_bitmap); + else +- idx = find_next_bit(array->bitmap, MAX_PRIO, idx); +- if (idx >= MAX_PRIO) { +- if (array == busiest->expired && busiest->active->nr_active) { ++ idx = find_next_bit(array->prio_bitmap, MAX_PRIO, idx); ++ if (idx == MAX_PRIO) { ++ if (array == busiest->idleprio && busiest->nr_idleprio) ++ goto found_idleprio; ++ if (array == busiest->expired) { + array = busiest->active; +- dst_array = this_rq->active; + goto new_array; + } + goto out; + } + ++found_idleprio: + head = array->queue + idx; + curr = head->prev; + skip_queue: +@@ -2332,11 +2478,22 @@ + best_prio_seen |= idx == best_prio; + if (curr != head) + goto skip_queue; ++ if (idx == MAX_PRIO) { ++ /* ++ * Occurs either when balancing idleprio tasks or ++ * there really are no more tasks to find. ++ */ ++ if (array == busiest->expired) { ++ array = busiest->active; ++ goto new_array; ++ } ++ goto out; ++ } + idx++; + goto skip_bitmap; + } + +- pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); ++ pull_task(busiest, tmp, this_rq, this_cpu); + pulled++; + rem_load_move -= tmp->load_weight; + +@@ -2349,6 +2506,13 @@ + this_best_prio = idx; + if (curr != head) + goto skip_queue; ++ if (idx == MAX_PRIO) { ++ if (array == busiest->expired) { ++ array = busiest->active; ++ goto new_array; ++ } ++ goto out; ++ } + idx++; + goto skip_bitmap; + } +@@ -3297,11 +3461,36 @@ + /* + * This is called on clock ticks and on context switches. + * Bank in p->sched_time the ns elapsed since the last tick or switch. ++ * CPU scheduler quota accounting is also performed here in microseconds. ++ * The value returned from sched_clock() occasionally gives bogus values so ++ * some sanity checking is required. 
+ */ +-static inline void +-update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now) ++static void ++update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now, ++ int tick) + { +- p->sched_time += now - p->last_ran; ++ long time_diff = now - p->last_ran; ++ ++ if (tick) { ++ /* ++ * Called from scheduler_tick() there should be less than two ++ * jiffies worth, and not negative/overflow. ++ */ ++ if (time_diff > JIFFIES_TO_NS(2) || time_diff < 0) ++ time_diff = JIFFIES_TO_NS(1); ++ } else { ++ /* ++ * Called from context_switch there should be less than one ++ * jiffy worth, and not negative/overflow. There should be ++ * some time banked here so use a nominal 1us. ++ */ ++ if (time_diff > JIFFIES_TO_NS(1) || time_diff < 1) ++ time_diff = 1000; ++ } ++ /* time_slice accounting is done in usecs to avoid overflow on 32bit */ ++ if (p != rq->idle && p->policy != SCHED_FIFO) ++ p->time_slice -= time_diff / 1000; ++ p->sched_time += time_diff; + p->last_ran = rq->most_recent_timestamp = now; + } + +@@ -3322,27 +3511,6 @@ + } + + /* +- * We place interactive tasks back into the active array, if possible. +- * +- * To guarantee that this does not starve expired tasks we ignore the +- * interactivity of a task if the first expired task had to wait more +- * than a 'reasonable' amount of time. This deadline timeout is +- * load-dependent, as the frequency of array switched decreases with +- * increasing number of running tasks. We also ignore the interactivity +- * if a better static_prio task has expired: +- */ +-static inline int expired_starving(struct rq *rq) +-{ +- if (rq->curr->static_prio > rq->best_expired_prio) +- return 1; +- if (!STARVATION_LIMIT || !rq->expired_timestamp) +- return 0; +- if (jiffies - rq->expired_timestamp > STARVATION_LIMIT * rq->nr_running) +- return 1; +- return 0; +-} +- +-/* + * Account user cpu time to a process. + * @p: the process that the cpu time gets accounted to + * @hardirq_offset: the offset to subtract from hardirq_count() +@@ -3357,7 +3525,7 @@ + + /* Add user time to cpustat. */ + tmp = cputime_to_cputime64(cputime); +- if (TASK_NICE(p) > 0) ++ if (TASK_NICE(p) > 0 || idleprio_task(p)) + cpustat->nice = cputime64_add(cpustat->nice, tmp); + else + cpustat->user = cputime64_add(cpustat->user, tmp); +@@ -3415,87 +3583,94 @@ + cpustat->steal = cputime64_add(cpustat->steal, tmp); + } + +-static void task_running_tick(struct rq *rq, struct task_struct *p) ++/* ++ * The task has used up its quota of running in this prio_level so it must be ++ * dropped a priority level, all managed by recalc_task_prio(). ++ */ ++static void task_expired_entitlement(struct rq *rq, struct task_struct *p) + { +- if (p->array != rq->active) { +- /* Task has expired but was not scheduled yet */ +- set_tsk_need_resched(p); ++ int overrun; ++ ++ reset_first_time_slice(p); ++ if (rt_task(p)) { ++ p->time_slice += p->quota; ++ list_move_tail(&p->run_list, p->array->queue + p->prio); + return; + } +- spin_lock(&rq->lock); ++ overrun = p->time_slice; ++ dequeue_task(p, rq); ++ enqueue_task(p, rq); + /* +- * The task was running during this tick - update the +- * time slice counter. Note: we do not update a thread's +- * priority until it either goes to sleep or uses up its +- * timeslice. This makes it possible for interactive tasks +- * to use up their timeslices at their highest priority levels. ++ * Subtract any extra time this task ran over its time_slice; ie ++ * overrun will either be 0 or negative. 
+ */ +- if (rt_task(p)) { +- /* +- * RR tasks need a special form of timeslice management. +- * FIFO tasks have no timeslices. +- */ +- if ((p->policy == SCHED_RR) && !--p->time_slice) { +- p->time_slice = task_timeslice(p); +- p->first_time_slice = 0; +- set_tsk_need_resched(p); ++ p->time_slice += overrun; ++} + +- /* put it at the end of the queue: */ +- requeue_task(p, rq->active); +- } +- goto out_unlock; ++/* ++ * Test if SCHED_ISO tasks have run longer than their alloted period as RT ++ * tasks and set the refractory flag if necessary. There is 10% hysteresis ++ * for unsetting the flag. ++ */ ++static unsigned int test_ret_isorefractory(struct rq *rq) ++{ ++ if (likely(!rq->iso_refractory)) { ++ if (rq->iso_ticks / ISO_PERIOD > sched_iso_cpu) ++ rq->iso_refractory = 1; ++ } else { ++ if (rq->iso_ticks / ISO_PERIOD < (sched_iso_cpu * 90 / 100)) ++ rq->iso_refractory = 0; + } +- if (!--p->time_slice) { +- dequeue_task(p, rq->active); +- set_tsk_need_resched(p); +- p->prio = effective_prio(p); +- p->time_slice = task_timeslice(p); +- p->first_time_slice = 0; ++ return rq->iso_refractory; ++} + +- if (!rq->expired_timestamp) +- rq->expired_timestamp = jiffies; +- if (!TASK_INTERACTIVE(p) || expired_starving(rq)) { +- enqueue_task(p, rq->expired); +- if (p->static_prio < rq->best_expired_prio) +- rq->best_expired_prio = p->static_prio; +- } else +- enqueue_task(p, rq->active); +- } else { +- /* +- * Prevent a too long timeslice allowing a task to monopolize +- * the CPU. We do this by splitting up the timeslice into +- * smaller pieces. +- * +- * Note: this does not mean the task's timeslices expire or +- * get lost in any way, they just might be preempted by +- * another task of equal priority. (one with higher +- * priority would have preempted this task already.) We +- * requeue this task to the end of the list on this priority +- * level, which is in essence a round-robin of tasks with +- * equal priority. +- * +- * This only applies to tasks in the interactive +- * delta range with at least TIMESLICE_GRANULARITY to requeue. +- */ +- if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - +- p->time_slice) % TIMESLICE_GRANULARITY(p)) && +- (p->time_slice >= TIMESLICE_GRANULARITY(p)) && +- (p->array == rq->active)) { ++/* No SCHED_ISO task was running so decrease rq->iso_ticks */ ++static inline void no_iso_tick(struct rq *rq) ++{ ++ rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - 1) / ISO_PERIOD; ++} + +- requeue_task(p, rq->active); +- set_tsk_need_resched(p); +- } ++/* This manages tasks that have run out of timeslice during a scheduler_tick */ ++static void task_running_tick(struct rq *rq, struct task_struct *p) ++{ ++ /* ++ * If a SCHED_ISO task is running we increment the iso_ticks. In ++ * order to prevent SCHED_ISO tasks from causing starvation in the ++ * presence of true RT tasks we account those as iso_ticks as well. ++ */ ++ if ((rt_task(p) || (iso_task(p) && !rq->iso_refractory))) { ++ if (rq->iso_ticks <= (ISO_PERIOD * 100) - 100) ++ rq->iso_ticks += 100; ++ } else ++ no_iso_tick(rq); ++ ++ if (iso_task(p)) { ++ if (unlikely(test_ret_isorefractory(rq))) { ++ if (isoprio_suitable(p)) { ++ /* ++ * SCHED_ISO task is running as RT and limit ++ * has been hit. Set the PF_ISOREF flag and ++ * force it to reschedule as SCHED_NORMAL ++ * by zeroing its time_slice ++ */ ++ p->flags |= PF_ISOREF; ++ p->time_slice = 0; ++ } ++ } else ++ p->flags &= ~PF_ISOREF; + } +-out_unlock: +- spin_unlock(&rq->lock); ++ /* SCHED_FIFO tasks never run out of timeslice. 
*/ ++ if (p->time_slice > 0 || p->policy == SCHED_FIFO) ++ return; ++ /* p->time_slice <= 0 */ ++ set_tsk_need_resched(p); ++ if (likely(task_queued(p))) ++ task_expired_entitlement(rq, p); + } + + /* + * This function gets called by the timer code, with HZ frequency. + * We call it with interrupts disabled. +- * +- * It also gets called by the fork code, when changing the parent's +- * timeslices. + */ + void scheduler_tick(void) + { +@@ -3505,10 +3680,14 @@ + int idle_at_tick = idle_cpu(cpu); + struct rq *rq = cpu_rq(cpu); + +- update_cpu_clock(p, rq, now); ++ update_cpu_clock(p, rq, now, 1); + ++ spin_lock(&rq->lock); + if (!idle_at_tick) + task_running_tick(rq, p); ++ else ++ no_iso_tick(rq); ++ spin_unlock(&rq->lock); + #ifdef CONFIG_SMP + update_load(rq); + rq->idle_at_tick = idle_at_tick; +@@ -3554,10 +3733,80 @@ + + #endif + +-static inline int interactive_sleep(enum sleep_type sleep_type) ++static void reset_prio_levels(struct rq *rq) + { +- return (sleep_type == SLEEP_INTERACTIVE || +- sleep_type == SLEEP_INTERRUPTED); ++ rq->active->best_static_prio = MAX_PRIO - 1; ++ rq->expired->best_static_prio = MAX_PRIO - 1; ++ memset(rq->prio_level, 0, sizeof(int) * PRIO_RANGE); ++} ++ ++/* ++ * Only tasks running are SCHED_IDLEPRIO. Set the active array to the ++ * idleprio array and if it isn't already active ++ */ ++static struct task_struct *next_idleprio_task(struct rq *rq) ++{ ++ struct prio_array *array = rq->active; ++ struct list_head *queue; ++ ++ if (array != rq->idleprio) { ++ rq->active = rq->idleprio; ++ rq->expired = array; ++ array = rq->active; ++ rq->exp_bitmap = rq->expired->prio_bitmap; ++ rq->dyn_bitmap = rq->active->prio_bitmap; ++ } ++ rq->prio_rotation++; ++ reset_prio_levels(rq); ++ queue = array->queue + MAX_PRIO; ++ return list_entry(queue->next, struct task_struct, run_list); ++} ++ ++/* ++ * next_dynamic_task finds the next suitable dynamic task. ++ */ ++static inline struct task_struct *next_dynamic_task(struct rq *rq, int idx) ++{ ++ struct prio_array *array = rq->active; ++ struct task_struct *next; ++ struct list_head *queue; ++ int nstatic; ++ ++retry: ++ if (unlikely(rq->nr_running == rq->nr_idleprio)) ++ return next_idleprio_task(rq); ++ if (idx >= MAX_PRIO) { ++ /* There are no more tasks in the active array. 
Swap arrays */ ++ array = rq->expired; ++ rq->expired = rq->active; ++ rq->active = array; ++ rq->exp_bitmap = rq->expired->prio_bitmap; ++ rq->dyn_bitmap = rq->active->prio_bitmap; ++ rq->prio_rotation++; ++ idx = find_next_bit(rq->dyn_bitmap, MAX_PRIO, MAX_RT_PRIO); ++ reset_prio_levels(rq); ++ } ++ queue = array->queue + idx; ++ next = list_entry(queue->next, struct task_struct, run_list); ++ if (unlikely(next->time_slice <= 0 && !(iso_task(next) && ++ isoprio_suitable(next)))) { ++ /* ++ * Unlucky enough that this task ran out of time_slice ++ * before it hit a scheduler_tick so it should have its ++ * priority reassessed and choose another task (possibly ++ * the same one) ++ */ ++ task_expired_entitlement(rq, next); ++ idx = find_next_bit(rq->dyn_bitmap, MAX_PRIO, MAX_RT_PRIO); ++ goto retry; ++ } ++ next->rotation = rq->prio_rotation; ++ nstatic = next->static_prio; ++ if (nstatic < array->best_static_prio) ++ array->best_static_prio = nstatic; ++ if (idx > rq->prio_level[USER_PRIO(nstatic)]) ++ rq->prio_level[USER_PRIO(nstatic)] = idx; ++ return next; + } + + /* +@@ -3566,13 +3815,11 @@ + asmlinkage void __sched schedule(void) + { + struct task_struct *prev, *next; +- struct prio_array *array; + struct list_head *queue; + unsigned long long now; +- unsigned long run_time; +- int cpu, idx, new_prio; + long *switch_count; + struct rq *rq; ++ int cpu, idx; + + /* + * Test if we are atomic. Since do_exit() needs to call into +@@ -3608,18 +3855,6 @@ + + schedstat_inc(rq, sched_cnt); + now = sched_clock(); +- if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) { +- run_time = now - prev->timestamp; +- if (unlikely((long long)(now - prev->timestamp) < 0)) +- run_time = 0; +- } else +- run_time = NS_MAX_SLEEP_AVG; +- +- /* +- * Tasks charged proportionately less run_time at high sleep_avg to +- * delay them losing their interactive status +- */ +- run_time /= (CURRENT_BONUS(prev) ? : 1); + + spin_lock_irq(&rq->lock); + +@@ -3630,8 +3865,10 @@ + unlikely(signal_pending(prev)))) + prev->state = TASK_RUNNING; + else { +- if (prev->state == TASK_UNINTERRUPTIBLE) ++ if (prev->state == TASK_UNINTERRUPTIBLE) { ++ prev->flags |= PF_NONSLEEP; + rq->nr_uninterruptible++; ++ } + deactivate_task(prev, rq); + } + } +@@ -3641,59 +3878,29 @@ + idle_balance(cpu, rq); + if (!rq->nr_running) { + next = rq->idle; +- rq->expired_timestamp = 0; + goto switch_tasks; + } + } + +- array = rq->active; +- if (unlikely(!array->nr_active)) { +- /* +- * Switch the active and expired arrays. 
+- */ +- schedstat_inc(rq, sched_switch); +- rq->active = rq->expired; +- rq->expired = array; +- array = rq->active; +- rq->expired_timestamp = 0; +- rq->best_expired_prio = MAX_PRIO; +- } +- +- idx = sched_find_first_bit(array->bitmap); +- queue = array->queue + idx; +- next = list_entry(queue->next, struct task_struct, run_list); +- +- if (!rt_task(next) && interactive_sleep(next->sleep_type)) { +- unsigned long long delta = now - next->timestamp; +- if (unlikely((long long)(now - next->timestamp) < 0)) +- delta = 0; +- +- if (next->sleep_type == SLEEP_INTERACTIVE) +- delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; +- +- array = next->array; +- new_prio = recalc_task_prio(next, next->timestamp + delta); +- +- if (unlikely(next->prio != new_prio)) { +- dequeue_task(next, array); +- next->prio = new_prio; +- enqueue_task(next, array); +- } ++ idx = sched_find_first_bit(rq->dyn_bitmap); ++ if (likely(idx > ISO_PRIO)) ++ next = next_dynamic_task(rq, idx); ++ else { ++ queue = rq->active->queue + idx; ++ next = list_entry(queue->next, struct task_struct, run_list); + } +- next->sleep_type = SLEEP_NORMAL; + switch_tasks: +- if (next == rq->idle) ++ if (next == rq->idle) { ++ reset_prio_levels(rq); ++ rq->prio_rotation++; + schedstat_inc(rq, sched_goidle); ++ } + prefetch(next); + prefetch_stack(next); + clear_tsk_need_resched(prev); + rcu_qsctr_inc(task_cpu(prev)); + +- update_cpu_clock(prev, rq, now); +- +- prev->sleep_avg -= run_time; +- if ((long)prev->sleep_avg <= 0) +- prev->sleep_avg = 0; ++ update_cpu_clock(prev, rq, now, 0); + prev->timestamp = prev->last_ran = now; + + sched_info_switch(prev, next); +@@ -4129,29 +4336,22 @@ + */ + void rt_mutex_setprio(struct task_struct *p, int prio) + { +- struct prio_array *array; + unsigned long flags; ++ int queued, oldprio; + struct rq *rq; +- int oldprio; + + BUG_ON(prio < 0 || prio > MAX_PRIO); + + rq = task_rq_lock(p, &flags); + + oldprio = p->prio; +- array = p->array; +- if (array) +- dequeue_task(p, array); ++ queued = task_queued(p); ++ if (queued) ++ dequeue_task(p, rq); + p->prio = prio; + +- if (array) { +- /* +- * If changing to an RT priority then queue it +- * in the active array! 
+- */ +- if (rt_task(p)) +- array = rq->active; +- enqueue_task(p, array); ++ if (queued) { ++ enqueue_task(p, rq); + /* + * Reschedule if we are currently running on this runqueue and + * our priority decreased, or if we are not currently running on +@@ -4160,8 +4360,8 @@ + if (task_running(rq, p)) { + if (p->prio > oldprio) + resched_task(rq->curr); +- } else if (TASK_PREEMPTS_CURR(p, rq)) +- resched_task(rq->curr); ++ } else ++ try_preempt(p, rq); + } + task_rq_unlock(rq, &flags); + } +@@ -4170,8 +4370,7 @@ + + void set_user_nice(struct task_struct *p, long nice) + { +- struct prio_array *array; +- int old_prio, delta; ++ int queued, old_prio,delta; + unsigned long flags; + struct rq *rq; + +@@ -4192,26 +4391,27 @@ + p->static_prio = NICE_TO_PRIO(nice); + goto out_unlock; + } +- array = p->array; +- if (array) { +- dequeue_task(p, array); ++ queued = task_queued(p); ++ if (queued) { ++ dequeue_task(p, rq); + dec_raw_weighted_load(rq, p); + } + + p->static_prio = NICE_TO_PRIO(nice); +- set_load_weight(p); + old_prio = p->prio; + p->prio = effective_prio(p); ++ set_quota(p); + delta = p->prio - old_prio; + +- if (array) { +- enqueue_task(p, array); ++ if (queued) { ++ enqueue_task(p, rq); + inc_raw_weighted_load(rq, p); + /* + * If the task increased its priority or is running and + * lowered its priority, then reschedule its CPU: + */ +- if (delta < 0 || (delta > 0 && task_running(rq, p))) ++ if (delta < 0 || ((delta > 0 || idleprio_task(p)) && ++ task_running(rq, p))) + resched_task(rq->curr); + } + out_unlock: +@@ -4281,11 +4481,23 @@ + * + * This is the priority value as seen by users in /proc. + * RT tasks are offset by -200. Normal tasks are centered +- * around 0, value goes from -16 to +15. ++ * around 1, value goes from 0 to +79. Values higher than ++ * 39 indicate task is on the expired array. This is done ++ * lockless and may rarely return an active instead of ++ * expired value. + */ +-int task_prio(const struct task_struct *p) ++int task_prio(struct task_struct *p) + { +- return p->prio - MAX_RT_PRIO; ++ int prio = p->prio - MAX_RT_PRIO; ++ ++ if (task_queued(p)) { ++ struct rq *rq = task_rq(p); ++ struct prio_array *array = p->array; ++ ++ if (rq && rq->expired == array) ++ prio += PRIO_RANGE; ++ } ++ return prio; + } + + /** +@@ -4328,19 +4540,14 @@ + /* Actually do priority change: must hold rq lock. 
*/ + static void __setscheduler(struct task_struct *p, int policy, int prio) + { +- BUG_ON(p->array); ++ BUG_ON(task_queued(p)); + + p->policy = policy; + p->rt_priority = prio; + p->normal_prio = normal_prio(p); + /* we are holding p->pi_lock already */ + p->prio = rt_mutex_getprio(p); +- /* +- * SCHED_BATCH tasks are treated as perpetual CPU hogs: +- */ +- if (policy == SCHED_BATCH) +- p->sleep_avg = 0; +- set_load_weight(p); ++ set_quota(p); + } + + /** +@@ -4354,19 +4561,36 @@ + int sched_setscheduler(struct task_struct *p, int policy, + struct sched_param *param) + { +- int retval, oldprio, oldpolicy = -1; +- struct prio_array *array; ++ struct sched_param zero_param = { .sched_priority = 0 }; ++ int queued, retval, oldprio, oldpolicy = -1; ++ unsigned long rlim_rtprio = 0; + unsigned long flags; + struct rq *rq; + + /* may grab non-irq protected spin_locks */ + BUG_ON(in_interrupt()); ++ if (is_rt_policy(policy) && !capable(CAP_SYS_NICE)) { ++ unsigned long lflags; ++ ++ if (!lock_task_sighand(p, &lflags)) ++ return -ESRCH; ++ rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur; ++ unlock_task_sighand(p, &lflags); ++ if (rlim_rtprio) ++ goto recheck; ++ /* ++ * If the caller requested an RT policy without having the ++ * necessary rights, we downgrade the policy to SCHED_ISO. ++ * We also set the parameter to zero to pass the checks. ++ */ ++ policy = SCHED_ISO; ++ param = &zero_param; ++ } + recheck: + /* double check policy once rq lock held */ + if (policy < 0) + policy = oldpolicy = p->policy; +- else if (policy != SCHED_FIFO && policy != SCHED_RR && +- policy != SCHED_NORMAL && policy != SCHED_BATCH) ++ else if (!SCHED_RANGE(policy)) + return -EINVAL; + /* + * Valid priorities for SCHED_FIFO and SCHED_RR are +@@ -4385,14 +4609,6 @@ + */ + if (!capable(CAP_SYS_NICE)) { + if (is_rt_policy(policy)) { +- unsigned long rlim_rtprio; +- unsigned long flags; +- +- if (!lock_task_sighand(p, &flags)) +- return -ESRCH; +- rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur; +- unlock_task_sighand(p, &flags); +- + /* can't set/change the rt policy */ + if (policy != p->policy && !rlim_rtprio) + return -EPERM; +@@ -4401,6 +4617,31 @@ + if (param->sched_priority > p->rt_priority && + param->sched_priority > rlim_rtprio) + return -EPERM; ++ } else { ++ switch (p->policy) { ++ /* ++ * Can only downgrade policies but not back to ++ * SCHED_NORMAL ++ */ ++ case SCHED_ISO: ++ if (policy == SCHED_ISO) ++ goto out; ++ if (policy == SCHED_NORMAL) ++ return -EPERM; ++ break; ++ case SCHED_BATCH: ++ if (policy == SCHED_BATCH) ++ goto out; ++ if (policy != SCHED_IDLEPRIO) ++ return -EPERM; ++ break; ++ case SCHED_IDLEPRIO: ++ if (policy == SCHED_IDLEPRIO) ++ goto out; ++ return -EPERM; ++ default: ++ break; ++ } + } + + /* can't change other user's priorities */ +@@ -4409,6 +4650,11 @@ + return -EPERM; + } + ++ if (!(p->mm) && policy == SCHED_IDLEPRIO) { ++ /* Don't allow kernel threads to be SCHED_IDLEPRIO. 
*/ ++ return -EINVAL; ++ } ++ + retval = security_task_setscheduler(p, policy, param); + if (retval) + return retval; +@@ -4429,12 +4675,12 @@ + spin_unlock_irqrestore(&p->pi_lock, flags); + goto recheck; + } +- array = p->array; +- if (array) ++ queued = task_queued(p); ++ if (queued) + deactivate_task(p, rq); + oldprio = p->prio; + __setscheduler(p, policy, param->sched_priority); +- if (array) { ++ if (queued) { + __activate_task(p, rq); + /* + * Reschedule if we are currently running on this runqueue and +@@ -4444,14 +4690,15 @@ + if (task_running(rq, p)) { + if (p->prio > oldprio) + resched_task(rq->curr); +- } else if (TASK_PREEMPTS_CURR(p, rq)) +- resched_task(rq->curr); ++ } else ++ try_preempt(p, rq); + } + __task_rq_unlock(rq); + spin_unlock_irqrestore(&p->pi_lock, flags); + + rt_mutex_adjust_pi(p); + ++out: + return 0; + } + EXPORT_SYMBOL_GPL(sched_setscheduler); +@@ -4718,41 +4965,34 @@ + * sys_sched_yield - yield the current processor to other threads. + * + * This function yields the current CPU by moving the calling thread +- * to the expired array. If there are no other threads running on this +- * CPU then this function will return. ++ * to the expired array if SCHED_NORMAL or the end of its current priority ++ * queue if a realtime task. If there are no other threads running on this ++ * cpu this function will return. + */ + asmlinkage long sys_sched_yield(void) + { + struct rq *rq = this_rq_lock(); +- struct prio_array *array = current->array, *target = rq->expired; ++ struct task_struct *p = current; + + schedstat_inc(rq, yld_cnt); +- /* +- * We implement yielding by moving the task into the expired +- * queue. +- * +- * (special rule: RT tasks will just roundrobin in the active +- * array.) +- */ +- if (rt_task(current)) +- target = rq->active; +- +- if (array->nr_active == 1) { +- schedstat_inc(rq, yld_act_empty); +- if (!rq->expired->nr_active) +- schedstat_inc(rq, yld_both_empty); +- } else if (!rq->expired->nr_active) +- schedstat_inc(rq, yld_exp_empty); +- +- if (array != target) { +- dequeue_task(current, array); +- enqueue_task(current, target); +- } else +- /* +- * requeue_task is cheaper so perform that if possible. +- */ +- requeue_task(current, array); ++ if (rq->nr_running == 1) ++ schedstat_inc(rq, yld_both_empty); ++ else { ++ struct prio_array *old_array = p->array; ++ int old_prio = p->prio; ++ ++ if (idleprio_task(p)) { ++ dequeue_task(p, rq); ++ enqueue_task(p, rq); ++ goto out_release; ++ } ++ /* p->prio will be updated in requeue_task via queue_expired */ ++ if (!rt_task(p)) ++ p->array = rq->expired; ++ requeue_task(p, rq, old_array, old_prio); ++ } + ++out_release: + /* + * Since we are going to call schedule() anyway, there's + * no need to preempt or enable interrupts: +@@ -4902,6 +5142,8 @@ + break; + case SCHED_NORMAL: + case SCHED_BATCH: ++ case SCHED_ISO: ++ case SCHED_IDLEPRIO: + ret = 0; + break; + } +@@ -4926,6 +5168,8 @@ + break; + case SCHED_NORMAL: + case SCHED_BATCH: ++ case SCHED_ISO: ++ case SCHED_IDLEPRIO: + ret = 0; + } + return ret; +@@ -4959,8 +5203,8 @@ + if (retval) + goto out_unlock; + +- jiffies_to_timespec(p->policy == SCHED_FIFO ? +- 0 : task_timeslice(p), &t); ++ t = ns_to_timespec(p->policy == SCHED_FIFO ? 0 : ++ MS_TO_NS(task_timeslice(p))); + read_unlock(&tasklist_lock); + retval = copy_to_user(interval, &t, sizeof(t)) ? 
-EFAULT : 0; + out_nounlock: +@@ -5056,10 +5300,10 @@ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + +- idle->timestamp = sched_clock(); +- idle->sleep_avg = 0; +- idle->array = NULL; +- idle->prio = idle->normal_prio = MAX_PRIO; ++ bitmap_zero(idle->bitmap, PRIO_RANGE); ++ idle->timestamp = idle->last_ran = sched_clock(); ++ idle->array = rq->active; ++ idle->prio = idle->normal_prio = NICE_TO_PRIO(0); + idle->state = TASK_RUNNING; + idle->cpus_allowed = cpumask_of_cpu(cpu); + set_task_cpu(idle, cpu); +@@ -5178,7 +5422,7 @@ + goto out; + + set_task_cpu(p, dest_cpu); +- if (p->array) { ++ if (task_queued(p)) { + /* + * Sync timestamp with rq_dest's before activating. + * The same thing could be achieved by doing this step +@@ -5189,8 +5433,7 @@ + + rq_dest->most_recent_timestamp; + deactivate_task(p, rq_src); + __activate_task(p, rq_dest); +- if (TASK_PREEMPTS_CURR(p, rq_dest)) +- resched_task(rq_dest->curr); ++ try_preempt(p, rq_dest); + } + ret = 1; + out: +@@ -5487,7 +5730,7 @@ + /* Idle task back to normal (off runqueue, low prio) */ + rq = task_rq_lock(rq->idle, &flags); + deactivate_task(rq->idle, rq); +- rq->idle->static_prio = MAX_PRIO; ++ rq->idle->static_prio = NICE_TO_PRIO(0); + __setscheduler(rq->idle, SCHED_NORMAL, 0); + migrate_dead_tasks(cpu); + task_rq_unlock(rq, &flags); +@@ -7013,6 +7256,13 @@ + /* Move init over to a non-isolated CPU */ + if (set_cpus_allowed(current, non_isolated_cpus) < 0) + BUG(); ++ ++ /* ++ * Assume that every added cpu gives us slightly less overall latency ++ * allowing us to increase the base rr_interval, but in a non linear ++ * fashion. ++ */ ++ rr_interval *= 1 + ilog2(num_online_cpus()); + } + #else + void __init sched_init_smp(void) +@@ -7035,6 +7285,16 @@ + int i, j, k; + int highest_cpu = 0; + ++ /* Generate the priority matrix */ ++ for (i = 0; i < PRIO_RANGE; i++) { ++ bitmap_fill(prio_matrix[i], PRIO_RANGE); ++ j = PRIO_RANGE * PRIO_RANGE / (PRIO_RANGE - i); ++ for (k = 0; k <= PRIO_RANGE * (PRIO_RANGE - 1); k += j) { ++ __clear_bit(PRIO_RANGE - 1 - (k / PRIO_RANGE), ++ prio_matrix[i]); ++ } ++ } ++ + for_each_possible_cpu(i) { + struct prio_array *array; + struct rq *rq; +@@ -7042,12 +7302,20 @@ + rq = cpu_rq(i); + spin_lock_init(&rq->lock); + lockdep_set_class(&rq->lock, &rq->rq_lock_key); ++ rq->iso_ticks = 0; + rq->nr_running = 0; ++ rq->nr_idleprio = 0; ++ rq->prio_rotation = 0; + rq->active = rq->arrays; ++ rq->idleprio = rq->active; + rq->expired = rq->arrays + 1; +- rq->best_expired_prio = MAX_PRIO; ++ reset_prio_levels(rq); ++ rq->dyn_bitmap = rq->active->prio_bitmap; ++ rq->exp_bitmap = rq->expired->prio_bitmap; + + #ifdef CONFIG_SMP ++ rq->active->rq = rq; ++ rq->expired->rq = rq; + rq->sd = NULL; + for (j = 1; j < 3; j++) + rq->cpu_load[j] = 0; +@@ -7060,17 +7328,16 @@ + atomic_set(&rq->nr_iowait, 0); + + for (j = 0; j < 2; j++) { ++ + array = rq->arrays + j; +- for (k = 0; k < MAX_PRIO; k++) { ++ for (k = 0; k <= MAX_PRIO; k++) + INIT_LIST_HEAD(array->queue + k); +- __clear_bit(k, array->bitmap); +- } +- // delimiter for bitsearch +- __set_bit(MAX_PRIO, array->bitmap); ++ bitmap_zero(array->prio_bitmap, MAX_PRIO); ++ /* delimiter for bitsearch */ ++ __set_bit(MAX_PRIO, array->prio_bitmap); + } + highest_cpu = i; + } +- + set_load_weight(&init_task); + + #ifdef CONFIG_SMP +@@ -7125,25 +7392,25 @@ + #ifdef CONFIG_MAGIC_SYSRQ + void normalize_rt_tasks(void) + { +- struct prio_array *array; + struct task_struct *g, *p; + unsigned long flags; + struct rq *rq; ++ int queued; + + read_lock_irq(&tasklist_lock); + + 
do_each_thread(g, p) { +- if (!rt_task(p)) ++ if (!rt_task(p) && !iso_task(p)) + continue; + + spin_lock_irqsave(&p->pi_lock, flags); + rq = __task_rq_lock(p); + +- array = p->array; +- if (array) ++ queued = task_queued(p); ++ if (queued) + deactivate_task(p, task_rq(p)); + __setscheduler(p, SCHED_NORMAL, 0); +- if (array) { ++ if (queued) { + __activate_task(p, task_rq(p)); + resched_task(rq->curr); + } +Index: linux-2.6.22-ck1/kernel/sysctl.c +=================================================================== +--- linux-2.6.22-ck1.orig/kernel/sysctl.c 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/kernel/sysctl.c 2007-07-10 14:55:23.000000000 +1000 +@@ -22,6 +22,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -70,6 +71,7 @@ + extern char core_pattern[]; + extern int pid_max; + extern int min_free_kbytes; ++extern int vm_tail_largefiles; + extern int printk_ratelimit_jiffies; + extern int printk_ratelimit_burst; + extern int pid_max_min, pid_max_max; +@@ -78,6 +80,10 @@ + extern int compat_log; + extern int maps_protect; + extern int sysctl_stat_interval; ++extern int rr_interval; ++extern int sched_interactive; ++extern int sched_iso_cpu; ++extern int sched_iso_period; + + /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ + static int maxolduid = 65535; +@@ -161,6 +167,14 @@ + #endif + + ++/* Constants for minimum and maximum testing. ++ We use these as one-element integer vectors. */ ++static int __read_mostly zero; ++static int __read_mostly one = 1; ++static int __read_mostly one_hundred = 100; ++static int __read_mostly five_thousand = 5000; ++ ++ + /* The default sysctl tables: */ + + static ctl_table root_table[] = { +@@ -501,6 +515,47 @@ + .mode = 0444, + .proc_handler = &proc_dointvec, + }, ++ { ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = "rr_interval", ++ .data = &rr_interval, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .strategy = &sysctl_intvec, ++ .extra1 = &one, ++ .extra2 = &five_thousand, ++ }, ++ { ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = "interactive", ++ .data = &sched_interactive, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, ++ { ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = "iso_cpu", ++ .data = &sched_iso_cpu, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .strategy = &sysctl_intvec, ++ .extra1 = &zero, ++ .extra2 = &one_hundred, ++ }, ++ { ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = "iso_period", ++ .data = &sched_iso_period, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .strategy = &sysctl_intvec, ++ .extra1 = &one, ++ .extra2 = &one_hundred, ++ }, + #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) + { + .ctl_name = KERN_UNKNOWN_NMI_PANIC, +@@ -619,14 +674,16 @@ + { .ctl_name = 0 } + }; + +-/* Constants for minimum and maximum testing in vm_table. +- We use these as one-element integer vectors. 
*/ +-static int zero; +-static int one_hundred = 100; +- +- + static ctl_table vm_table[] = { + { ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = "tail_largefiles", ++ .data = &vm_tail_largefiles, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, ++ { + .ctl_name = VM_OVERCOMMIT_MEMORY, + .procname = "overcommit_memory", + .data = &sysctl_overcommit_memory, +@@ -705,16 +762,24 @@ + .proc_handler = &proc_dointvec, + }, + { +- .ctl_name = VM_SWAPPINESS, +- .procname = "swappiness", +- .data = &vm_swappiness, +- .maxlen = sizeof(vm_swappiness), ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = "mapped", ++ .data = &vm_mapped, ++ .maxlen = sizeof(vm_mapped), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &one_hundred, + }, ++ { ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = "hardmaplimit", ++ .data = &vm_hardmaplimit, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, + #ifdef CONFIG_HUGETLB_PAGE + { + .ctl_name = VM_HUGETLB_PAGES, +@@ -882,6 +947,32 @@ + .extra1 = &zero, + }, + #endif ++#ifdef CONFIG_SWAP_PREFETCH ++ { ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = "swap_prefetch", ++ .data = &swap_prefetch, ++ .maxlen = sizeof(swap_prefetch), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, ++ { ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = "swap_prefetch_delay", ++ .data = &swap_prefetch_delay, ++ .maxlen = sizeof(swap_prefetch_delay), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, ++ { ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = "swap_prefetch_sleep", ++ .data = &swap_prefetch_sleep, ++ .maxlen = sizeof(swap_prefetch_sleep), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, ++#endif + { .ctl_name = 0 } + }; + +Index: linux-2.6.22-ck1/Documentation/sched-design.txt +=================================================================== +--- linux-2.6.22-ck1.orig/Documentation/sched-design.txt 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/Documentation/sched-design.txt 2007-07-10 14:55:02.000000000 +1000 +@@ -1,11 +1,14 @@ +- Goals, Design and Implementation of the +- new ultra-scalable O(1) scheduler ++ Goals, Design and Implementation of the ultra-scalable O(1) scheduler by ++ Ingo Molnar and theStaircase Deadline cpu scheduler policy designed by ++ Con Kolivas. + + +- This is an edited version of an email Ingo Molnar sent to +- lkml on 4 Jan 2002. It describes the goals, design, and +- implementation of Ingo's new ultra-scalable O(1) scheduler. +- Last Updated: 18 April 2002. ++ This was originally an edited version of an email Ingo Molnar sent to ++ lkml on 4 Jan 2002. It describes the goals, design, and implementation ++ of Ingo's ultra-scalable O(1) scheduler. It now contains a description ++ of the Staircase Deadline priority scheduler that was built on this ++ design. ++ Last Updated: Fri, 4 May 2007 + + + Goal +@@ -163,3 +166,222 @@ + code is smaller than the old one. + + Ingo ++ ++ ++Staircase Deadline cpu scheduler policy ++================================================ ++ ++Design summary ++============== ++ ++A novel design which incorporates a foreground-background descending priority ++system (the staircase) via a bandwidth allocation matrix according to nice ++level. ++ ++ ++Features ++======== ++ ++A starvation free, strict fairness O(1) scalable design with interactivity ++as good as the above restrictions can provide. 
There is no interactivity ++estimator, no sleep/run measurements and only simple fixed accounting. ++The design has strict enough accounting that task behaviour ++can be modelled and maximum scheduling latencies can be predicted by ++the virtual deadline mechanism that manages runqueues. The prime concern ++in this design is to maintain fairness, as determined by nice level, at all costs, ++yet to provide as good interactivity as can be allowed within the ++constraints of strict fairness. ++ ++ ++Design description ++================== ++ ++SD works off the principle of providing each task a quota of runtime that it is ++allowed to run at a number of priority levels determined by its static priority ++(i.e. its nice level). If the task uses up its quota, it has its priority ++decremented to the next level determined by a priority matrix. Once the ++runtime quota of every priority level has been consumed, a task is queued on the ++"expired" array. When no other tasks exist with quota, the expired array is ++activated and fresh quotas are handed out. This is all done in O(1). ++ ++Design details ++============== ++ ++Each task keeps a record of its own entitlement of cpu time. Most of the rest of ++these details apply to non-realtime tasks as rt task management is ++straightforward. ++ ++Each runqueue keeps a record of what major epoch it is up to in the ++rq->prio_rotation field, which is incremented on each major epoch. It also ++keeps a record of the current prio_level for each static priority task. ++ ++Each task keeps a record of what major runqueue epoch it was last running ++on in p->rotation. It also keeps a record of what priority levels it has ++already been allocated quota from during this epoch in a bitmap p->bitmap. ++ ++The only tunable that determines all other details is the RR_INTERVAL. This ++is set to 8ms, and is scaled gently upwards with more cpus. This value is ++tunable via a /proc interface. ++ ++All tasks are initially given a quota based on RR_INTERVAL. This is equal to ++RR_INTERVAL between nice values of -6 and 0, half that size above nice 0, and ++progressively larger for nice values from -1 to -20. This is assigned to ++p->quota and only changes with changes in nice level. ++ ++As a task is first queued, it checks in recalc_task_prio to see if it has run at ++this runqueue's current priority rotation. If it has not, it will have its ++p->prio level set according to the first slot in a "priority matrix" and will be ++given a p->time_slice equal to the p->quota, and will have its allocation bitmap bit ++set in p->bitmap for this prio level. It is then queued on the current active ++priority array. ++ ++If a task has already been running during this major epoch, and it has ++p->time_slice left and the rq->prio_quota for the task's p->prio still ++has quota, it will be placed back on the active array, but no more quota ++will be added. ++ ++If a task has been running during this major epoch, but does not have ++p->time_slice left, it will find the next lowest priority in its bitmap that it ++has not been allocated quota from. It then gets a full quota in ++p->time_slice. It is then queued on the current active priority array at the ++newly determined lower priority. ++ ++If a task has been running during this major epoch, and does not have ++any entitlement left in p->bitmap and no time_slice left, it will have its ++bitmap cleared, and be queued at its best prio again, but on the expired ++priority array.
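As a concrete illustration of the quota bookkeeping described above, here is a small standalone userspace sketch. It is not the kernel code: struct task_sim, the flat PRIO_RANGE of 40 slots and the 6ms/8ms figures are illustrative assumptions, and the real scheduler additionally scales the per-level entitlement through the nice-based priority matrix. It simply walks one task down its allowed priority levels, refilling time_slice from quota at each step and expiring it once the bitmap is exhausted.

#include <stdio.h>
#include <stdbool.h>

#define PRIO_RANGE 40			/* illustrative: 40 dynamic slots */

struct task_sim {
	unsigned long long quota;	/* per-level entitlement in ns (p->quota) */
	long long time_slice;		/* remaining entitlement (p->time_slice) */
	bool used[PRIO_RANGE];		/* stands in for p->bitmap */
	int prio;			/* current dynamic priority slot */
	bool expired;			/* queued on the expired array? */
};

/* Find the next priority slot this task has not yet drawn quota from. */
static int next_free_level(const struct task_sim *p, int from)
{
	for (int i = from; i < PRIO_RANGE; i++)
		if (!p->used[i])
			return i;
	return -1;			/* bitmap exhausted */
}

/* Called when the current time_slice has been consumed. */
static void step_down(struct task_sim *p)
{
	int next = next_free_level(p, p->prio + 1);

	if (next < 0) {
		/* Every level used: clear bitmap, requeue at best prio, expired. */
		for (int i = 0; i < PRIO_RANGE; i++)
			p->used[i] = false;
		p->prio = 0;
		p->expired = true;
	} else {
		p->prio = next;
		p->expired = false;
	}
	p->used[p->prio] = true;
	p->time_slice = (long long)p->quota;	/* fresh quota at the new level */
}

int main(void)
{
	struct task_sim t = { .quota = 8000000ULL, .prio = 0 };	/* 8ms, like RR_INTERVAL */

	t.used[0] = true;
	t.time_slice = (long long)t.quota;
	for (int step = 0; step < 10; step++) {
		t.time_slice -= 6000000;	/* pretend 6ms of cpu was used */
		if (t.time_slice <= 0)
			step_down(&t);
		printf("step %2d: prio %2d  slice %8lldns  %s array\n",
		       step, t.prio, t.time_slice,
		       t.expired ? "expired" : "active");
	}
	return 0;
}

Compiled with any C99 compiler, this prints the staircase descent step by step; the kernel performs the equivalent bookkeeping in recalc_task_prio.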
++ ++When a task is queued, it has its relevant bit set in the array->prio_bitmap. ++ ++p->time_slice is stored in nanoseconds and is updated via update_cpu_clock on ++schedule() and scheduler_tick. If p->time_slice falls below zero, ++recalc_task_prio is run again and the task is rescheduled. ++ ++ ++Priority Matrix ++=============== ++ ++In order to minimise the latencies between tasks of different nice levels ++running concurrently, the dynamic priority slots where different nice levels ++are queued are dithered instead of being sequential. What this means is that ++there are 40 priority slots where a task may run during one major rotation, ++and the allocation of slots is dependent on nice level. In the ++following table, a zero represents a slot where the task may run. ++ ++PRIORITY:0..................20.................39 ++nice -20 0000000000000000000000000000000000000000 ++nice -10 1000100010001000100010001000100010010000 ++nice 0 1010101010101010101010101010101010101010 ++nice 5 1011010110110101101101011011010110110110 ++nice 10 1110111011101110111011101110111011101110 ++nice 15 1111111011111110111111101111111011111110 ++nice 19 1111111111111111111111111111111111111110 ++ ++As can be seen, a nice -20 task runs in every priority slot whereas a nice 19 ++task only runs in one slot per major rotation. This dithered table allows for the ++smallest possible maximum latencies between tasks of varying nice levels, thus ++allowing vastly different nice levels to be used. ++ ++SCHED_BATCH tasks are managed slightly differently, receiving only the top ++slots from their priority bitmap, giving them the same cpu share as SCHED_NORMAL but ++slightly higher latencies. ++ ++ ++Modelling deadline behaviour ++============================ ++ ++As the accounting in this design is hard and not modified by sleep average ++calculations or interactivity modifiers, it is possible to accurately ++predict the maximum latency that a task may experience under different ++conditions. This is a virtual deadline mechanism enforced by mandatory ++timeslice expiration rather than by external bandwidth measurement. ++ ++The maximum duration a task can run during one major epoch is determined by its ++nice value. Nice 0 tasks can run at 19 different priority levels for RR_INTERVAL ++duration during each epoch. Nice 10 tasks can run at 9 priority levels for each ++epoch, and so on. The table in the priority matrix above demonstrates how this ++is enforced. ++ ++Therefore the maximum duration a runqueue epoch can take is determined by ++the number of tasks running, and their nice level. After that, the maximum ++duration a task can wait before it gets scheduled is ++determined by the position of its first slot on the matrix. ++ ++The following examples are _worst case scenarios_ that would rarely ++occur, but they can be modelled nonetheless to determine the maximum possible ++latency.
++ ++So for example, if two nice 0 tasks are running, and one has just expired as ++another is activated for the first time receiving a full quota for this ++runqueue rotation, the first task will wait: ++ ++nr_tasks * max_duration + nice_difference * rr_interval ++1 * 19 * RR_INTERVAL + 0 = 152ms ++ ++In the presence of a nice 10 task, a nice 0 task would wait a maximum of ++1 * 10 * RR_INTERVAL + 0 = 80ms ++ ++In the presence of a nice 0 task, a nice 10 task would wait a maximum of ++1 * 19 * RR_INTERVAL + 1 * RR_INTERVAL = 160ms ++ ++More useful than these values, though, are the average latencies, which are ++a matter of determining the average distance between priority slots of ++different nice values and multiplying them by the tasks' quota. For example, ++in the presence of a nice -10 task, a nice 0 task will wait either one or ++two slots. Given that nice -10 tasks have a quota 2.5 times the RR_INTERVAL, ++this means the latencies will alternate between 2.5 and 5 RR_INTERVALs or ++20 and 40ms respectively (on uniprocessor at 1000HZ). ++ ++ ++Achieving interactivity ++======================= ++ ++A requirement of this scheduler design was to achieve good interactivity ++despite being a completely fair deadline based design. The disadvantage of ++designs that try to achieve interactivity is that they usually do so at ++the expense of maintaining fairness. As cpu speeds increase, the requirement ++for some sort of metered unfairness towards interactive tasks becomes a less ++desirable phenomenon, but low latency and fairness remain mandatory for ++good interactive performance. ++ ++This design relies on the fact that interactive tasks, by their nature, ++sleep often. Most fair scheduling designs end up penalising such tasks ++indirectly, giving them less than their fair possible share because of the ++sleep, and have to use a mechanism of boosting their priority to offset ++this based on the duration they sleep. This becomes increasingly inaccurate ++as the number of running tasks rises and more tasks spend time waiting on ++runqueues rather than sleeping, and it is impossible to tell whether the ++task that's waiting on a runqueue only intends to run for a short period and ++then sleep again after that runqueue wait. Furthermore, all such designs rely ++on a period of time to pass to accumulate some form of statistic on the task ++before deciding how much preference to give them. The shorter this period, ++the more rapidly bursts of cpu use ruin interactive task behaviour. The ++longer this period, the longer it takes for interactive tasks to get low ++scheduling latencies and fair cpu. ++ ++This design does not measure sleep time at all. Interactive tasks that sleep ++often will wake up having consumed very little, if any, of their quota for ++the current major priority rotation. The longer they have slept, the less ++likely they are to even be on the current major priority rotation. Once ++woken up, though, they get to use up their full quota for that epoch, ++whether a partial quota remains or a full quota is issued. Overall, however, they ++can still only run as much cpu time for that epoch as any other task of the ++same nice level. This means that two tasks behaving completely differently, ++from fully cpu bound to waking/sleeping extremely frequently, will still ++get the same quota of cpu, but the latter will be using its quota for that ++epoch in bursts rather than continuously. This guarantees that interactive ++tasks get the same amount of cpu as cpu bound ones.
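The worst-case figures in the modelling section above can be checked mechanically. A minimal standalone sketch (illustrative only: it hard-codes the documented 8ms uniprocessor RR_INTERVAL and takes the slot counts directly from the examples) reproduces the 152ms, 80ms and 160ms results:

#include <stdio.h>

#define RR_INTERVAL_MS 8	/* documented uniprocessor default */

/*
 * Worst-case wait per the formula quoted above:
 * nr_tasks * max_duration + nice_difference * rr_interval,
 * with max_duration expressed as run_slots * RR_INTERVAL.
 */
static long worst_case_ms(long nr_tasks, long run_slots, long nice_diff_slots)
{
	return nr_tasks * run_slots * RR_INTERVAL_MS +
	       nice_diff_slots * RR_INTERVAL_MS;
}

int main(void)
{
	/* two nice 0 tasks, one freshly activated with a full quota */
	printf("nice 0 vs nice 0 : %ld ms\n", worst_case_ms(1, 19, 0));	/* 152 */
	/* a nice 0 task waiting behind a nice 10 task */
	printf("nice 0 vs nice 10: %ld ms\n", worst_case_ms(1, 10, 0));	/*  80 */
	/* a nice 10 task waiting behind a nice 0 task */
	printf("nice 10 vs nice 0: %ld ms\n", worst_case_ms(1, 19, 1));	/* 160 */
	return 0;
}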
++ ++The other requirement of interactive tasks is also to obtain low latencies ++for when they are scheduled. Unlike fully cpu bound tasks and the maximum ++latencies possible described in the modelling deadline behaviour section ++above, tasks that sleep will wake up with quota available usually at the ++current runqueue's priority_level or better. This means that the most latency ++they are likely to see is one RR_INTERVAL, and often they will preempt the ++current task if it is not of a sleeping nature. This then guarantees very ++low latency for interactive tasks, and the lowest latencies for the least ++cpu bound tasks. ++ ++ ++Fri, 4 May 2007 ++Con Kolivas +Index: linux-2.6.22-ck1/Documentation/sysctl/kernel.txt +=================================================================== +--- linux-2.6.22-ck1.orig/Documentation/sysctl/kernel.txt 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/Documentation/sysctl/kernel.txt 2007-07-10 14:55:20.000000000 +1000 +@@ -25,6 +25,9 @@ + - domainname + - hostname + - hotplug ++- interactive ++- iso_cpu ++- iso_period + - java-appletviewer [ binfmt_java, obsolete ] + - java-interpreter [ binfmt_java, obsolete ] + - kstack_depth_to_print [ X86 only ] +@@ -43,6 +46,7 @@ + - printk + - real-root-dev ==> Documentation/initrd.txt + - reboot-cmd [ SPARC only ] ++- rr_interval + - rtsig-max + - rtsig-nr + - sem +@@ -164,6 +168,40 @@ + + ============================================================== + ++interactive: ++ ++The staircase-deadline cpu scheduler can be set in either purely ++forward-looking mode for absolutely rigid fairness and cpu distribution ++according to nice level, or it can allow a small per-process history ++to smooth out cpu usage perturbations common in interactive tasks by ++enabling this sysctl. While small fairness issues can arise with this ++enabled, overall fairness is usually still strongly maintained and ++starvation is never possible. Enabling this can significantly smooth ++out 3d graphics and games. ++ ++Default value is 1 (enabled). ++ ++============================================================== ++ ++iso_cpu: ++ ++This sets the percentage cpu that the unprivileged SCHED_ISO tasks can ++run effectively at realtime priority, averaged over a rolling iso_period ++seconds. ++ ++Set to 80 (percent) by default. ++ ++============================================================== ++ ++iso_period: ++ ++This sets the number of seconds over which SCHED_ISO cpu usage is averaged ++to see if it exceeds its allocated cpu bandwidth. ++ ++Set to 5 (seconds) by default. ++ ++============================================================== ++ + l2cr: (PPC only) + + This flag controls the L2 cache of G3 processor boards. If +@@ -288,6 +326,19 @@ + + ============================================================== + ++rr_interval: ++ ++This is the smallest duration that any cpu process scheduling unit ++will run for. Increasing this value can increase throughput of cpu ++bound tasks substantially but at the expense of increased latencies ++overall. This value is in milliseconds and the default value chosen ++depends on the number of cpus available at scheduler initialisation ++with a minimum of 8. ++ ++Valid values are from 1-5000. 
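The scheduler tunables documented in this file (rr_interval, interactive, iso_cpu, iso_period) appear under /proc/sys/kernel/ once the -ck patch is applied. The following is a hedged sketch, not part of the patch, of reading and adjusting rr_interval from userspace; it assumes a running -ck kernel and root privileges, and the value 16 is only an example inside the documented 1-5000 range.

#include <stdio.h>

static int read_tunable(const char *path)
{
	FILE *f = fopen(path, "r");
	int val = -1;

	if (!f) {
		perror(path);
		return -1;
	}
	if (fscanf(f, "%d", &val) != 1)
		val = -1;
	fclose(f);
	return val;
}

static int write_tunable(const char *path, int val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return -1;
	}
	fprintf(f, "%d\n", val);
	return fclose(f);	/* 0 on success, EOF on error */
}

int main(void)
{
	const char *path = "/proc/sys/kernel/rr_interval";
	int cur = read_tunable(path);

	printf("current rr_interval: %d ms\n", cur);
	/* valid range is 1-5000 ms; 16 trades some latency for throughput */
	if (write_tunable(path, 16) == 0)
		printf("rr_interval set to 16 ms\n");
	return 0;
}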
++ ++============================================================== ++ + rtsig-max & rtsig-nr: + + The file rtsig-max can be used to tune the maximum number +Index: linux-2.6.22-ck1/fs/pipe.c +=================================================================== +--- linux-2.6.22-ck1.orig/fs/pipe.c 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/fs/pipe.c 2007-07-10 14:55:02.000000000 +1000 +@@ -41,12 +41,7 @@ + { + DEFINE_WAIT(wait); + +- /* +- * Pipes are system-local resources, so sleeping on them +- * is considered a noninteractive wait: +- */ +- prepare_to_wait(&pipe->wait, &wait, +- TASK_INTERRUPTIBLE | TASK_NONINTERACTIVE); ++ prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE); + if (pipe->inode) + mutex_unlock(&pipe->inode->i_mutex); + schedule(); +Index: linux-2.6.22-ck1/fs/proc/array.c +=================================================================== +--- linux-2.6.22-ck1.orig/fs/proc/array.c 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/fs/proc/array.c 2007-07-10 14:55:02.000000000 +1000 +@@ -165,7 +165,6 @@ + rcu_read_lock(); + buffer += sprintf(buffer, + "State:\t%s\n" +- "SleepAVG:\t%lu%%\n" + "Tgid:\t%d\n" + "Pid:\t%d\n" + "PPid:\t%d\n" +@@ -173,7 +172,6 @@ + "Uid:\t%d\t%d\t%d\t%d\n" + "Gid:\t%d\t%d\t%d\t%d\n", + get_task_state(p), +- (p->sleep_avg/1024)*100/(1020000000/1024), + p->tgid, p->pid, + pid_alive(p) ? rcu_dereference(p->real_parent)->tgid : 0, + pid_alive(p) && p->ptrace ? rcu_dereference(p->parent)->pid : 0, +Index: linux-2.6.22-ck1/include/linux/init_task.h +=================================================================== +--- linux-2.6.22-ck1.orig/include/linux/init_task.h 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/include/linux/init_task.h 2007-07-10 14:55:20.000000000 +1000 +@@ -125,13 +125,15 @@ + .prio = MAX_PRIO-20, \ + .static_prio = MAX_PRIO-20, \ + .normal_prio = MAX_PRIO-20, \ ++ .rotation = 0, \ + .policy = SCHED_NORMAL, \ + .cpus_allowed = CPU_MASK_ALL, \ + .mm = NULL, \ + .active_mm = &init_mm, \ + .run_list = LIST_HEAD_INIT(tsk.run_list), \ + .ioprio = 0, \ +- .time_slice = HZ, \ ++ .time_slice = 1000000000, \ ++ .quota = 1000000000, \ + .tasks = LIST_HEAD_INIT(tsk.tasks), \ + .ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children), \ + .ptrace_list = LIST_HEAD_INIT(tsk.ptrace_list), \ +@@ -158,6 +160,7 @@ + .signal = {{0}}}, \ + .blocked = {{0}}, \ + .alloc_lock = __SPIN_LOCK_UNLOCKED(tsk.alloc_lock), \ ++ .mutexes_held = 0, \ + .journal_info = NULL, \ + .cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \ + .fs_excl = ATOMIC_INIT(0), \ +Index: linux-2.6.22-ck1/kernel/softirq.c +=================================================================== +--- linux-2.6.22-ck1.orig/kernel/softirq.c 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/kernel/softirq.c 2007-07-10 14:55:02.000000000 +1000 +@@ -488,7 +488,7 @@ + + static int ksoftirqd(void * __bind_cpu) + { +- set_user_nice(current, 19); ++ set_user_nice(current, 15); + current->flags |= PF_NOFREEZE; + + set_current_state(TASK_INTERRUPTIBLE); +Index: linux-2.6.22-ck1/kernel/workqueue.c +=================================================================== +--- linux-2.6.22-ck1.orig/kernel/workqueue.c 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/kernel/workqueue.c 2007-07-10 14:55:02.000000000 +1000 +@@ -285,8 +285,6 @@ + if (!cwq->wq->freezeable) + current->flags |= PF_NOFREEZE; + +- set_user_nice(current, -5); +- + for (;;) { + prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE); + if (!freezing(current) && +Index: 
linux-2.6.22-ck1/kernel/kthread.c +=================================================================== +--- linux-2.6.22-ck1.orig/kernel/kthread.c 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/kernel/kthread.c 2007-07-10 14:55:02.000000000 +1000 +@@ -223,7 +223,6 @@ + + ignore_signals(tsk); + +- set_user_nice(tsk, -5); + set_cpus_allowed(tsk, CPU_MASK_ALL); + } + +Index: linux-2.6.22-ck1/kernel/fork.c +=================================================================== +--- linux-2.6.22-ck1.orig/kernel/fork.c 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/kernel/fork.c 2007-07-10 14:55:20.000000000 +1000 +@@ -1063,6 +1063,7 @@ + p->io_context = NULL; + p->io_wait = NULL; + p->audit_context = NULL; ++ p->mutexes_held = 0; + cpuset_fork(p); + #ifdef CONFIG_NUMA + p->mempolicy = mpol_copy(p->mempolicy); +Index: linux-2.6.22-ck1/kernel/mutex.c +=================================================================== +--- linux-2.6.22-ck1.orig/kernel/mutex.c 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/kernel/mutex.c 2007-07-10 14:55:20.000000000 +1000 +@@ -60,6 +60,16 @@ + static void fastcall noinline __sched + __mutex_lock_slowpath(atomic_t *lock_count); + ++static inline void inc_mutex_count(void) ++{ ++ current->mutexes_held++; ++} ++ ++static inline void dec_mutex_count(void) ++{ ++ current->mutexes_held--; ++} ++ + /*** + * mutex_lock - acquire the mutex + * @lock: the mutex to be acquired +@@ -89,6 +99,7 @@ + * 'unlocked' into 'locked' state. + */ + __mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath); ++ inc_mutex_count(); + } + + EXPORT_SYMBOL(mutex_lock); +@@ -114,6 +125,7 @@ + * into 'unlocked' state: + */ + __mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath); ++ dec_mutex_count(); + } + + EXPORT_SYMBOL(mutex_unlock); +@@ -283,9 +295,14 @@ + */ + int fastcall __sched mutex_lock_interruptible(struct mutex *lock) + { ++ int ret; ++ + might_sleep(); +- return __mutex_fastpath_lock_retval ++ ret = __mutex_fastpath_lock_retval + (&lock->count, __mutex_lock_interruptible_slowpath); ++ if (likely(!ret)) ++ inc_mutex_count(); ++ return ret; + } + + EXPORT_SYMBOL(mutex_lock_interruptible); +@@ -340,8 +357,12 @@ + */ + int fastcall __sched mutex_trylock(struct mutex *lock) + { +- return __mutex_fastpath_trylock(&lock->count, ++ int ret = __mutex_fastpath_trylock(&lock->count, + __mutex_trylock_slowpath); ++ ++ if (likely(ret)) ++ inc_mutex_count(); ++ return ret; + } + + EXPORT_SYMBOL(mutex_trylock); +Index: linux-2.6.22-ck1/block/cfq-iosched.c +=================================================================== +--- linux-2.6.22-ck1.orig/block/cfq-iosched.c 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/block/cfq-iosched.c 2007-07-10 14:55:21.000000000 +1000 +@@ -1276,10 +1276,12 @@ + printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class); + case IOPRIO_CLASS_NONE: + /* +- * no prio set, place us in the middle of the BE classes ++ * Select class and ioprio according to policy and nice + */ ++ cfqq->ioprio_class = task_policy_ioprio_class(tsk); + cfqq->ioprio = task_nice_ioprio(tsk); +- cfqq->ioprio_class = IOPRIO_CLASS_BE; ++ if (cfqq->ioprio_class == IOPRIO_CLASS_IDLE) ++ cfq_clear_cfqq_idle_window(cfqq); + break; + case IOPRIO_CLASS_RT: + cfqq->ioprio = task_ioprio(tsk); +Index: linux-2.6.22-ck1/include/linux/ioprio.h +=================================================================== +--- linux-2.6.22-ck1.orig/include/linux/ioprio.h 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/include/linux/ioprio.h 2007-07-10 
14:55:21.000000000 +1000 +@@ -22,7 +22,7 @@ + * class, the default for any process. IDLE is the idle scheduling class, it + * is only served when no one else is using the disk. + */ +-enum { ++enum ioprio_class { + IOPRIO_CLASS_NONE, + IOPRIO_CLASS_RT, + IOPRIO_CLASS_BE, +@@ -51,8 +51,25 @@ + return IOPRIO_PRIO_DATA(task->ioprio); + } + ++static inline enum ioprio_class ++ task_policy_ioprio_class(struct task_struct *task) ++{ ++ if (rt_task(task)) ++ return IOPRIO_CLASS_RT; ++ if (idleprio_task(task)) ++ return IOPRIO_CLASS_IDLE; ++ return IOPRIO_CLASS_BE; ++} ++ + static inline int task_nice_ioprio(struct task_struct *task) + { ++ if (rt_task(task)) ++ return (MAX_RT_PRIO - task->rt_priority) * IOPRIO_BE_NR / ++ (MAX_RT_PRIO + 1); ++ if (iso_task(task)) ++ return 0; ++ if (idleprio_task(task)) ++ return IOPRIO_BE_NR - 1; + return (task_nice(task) + 20) / 5; + } + +Index: linux-2.6.22-ck1/Documentation/sysctl/vm.txt +=================================================================== +--- linux-2.6.22-ck1.orig/Documentation/sysctl/vm.txt 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/Documentation/sysctl/vm.txt 2007-07-10 14:55:23.000000000 +1000 +@@ -22,6 +22,8 @@ + - dirty_background_ratio + - dirty_expire_centisecs + - dirty_writeback_centisecs ++- hardmaplimit ++- mapped + - max_map_count + - min_free_kbytes + - laptop_mode +@@ -31,12 +33,15 @@ + - min_unmapped_ratio + - min_slab_ratio + - panic_on_oom ++- swap_prefetch ++- swap_prefetch_delay ++- swap_prefetch_sleep + + ============================================================== + + dirty_ratio, dirty_background_ratio, dirty_expire_centisecs, + dirty_writeback_centisecs, vfs_cache_pressure, laptop_mode, +-block_dump, swap_token_timeout, drop-caches: ++block_dump, swap_token_timeout, drop-caches, tail_largefiles: + + See Documentation/filesystems/proc.txt + +@@ -86,6 +91,27 @@ + + ============================================================== + ++hardmaplimit: ++ ++This flag makes the vm adhere to the mapped value as closely as possible ++except in the most extreme vm stress where doing so would provoke an out ++of memory condition (see mapped below). ++ ++Enabled by default. ++ ++============================================================== ++ ++mapped: ++ ++This is the percentage ram that is filled with mapped pages (applications) ++before the vm will start reclaiming mapped pages by moving them to swap. ++It is altered by the relative stress of the vm at the time so is not ++strictly adhered to to prevent provoking out of memory kills. ++ ++Set to 66 by default. ++ ++============================================================== ++ + max_map_count: + + This file contains the maximum number of memory map areas a process +@@ -216,3 +242,37 @@ + The default value is 0. + 1 and 2 are for failover of clustering. Please select either + according to your policy of failover. ++ ++============================================================== ++ ++swap_prefetch ++ ++This enables or disables the swap prefetching feature. When the virtual ++memory subsystem has been extremely idle for at least swap_prefetch_sleep ++seconds it will start copying back pages from swap into the swapcache and keep ++a copy in swap. Valid values are 0 - 3. A value of 0 disables swap ++prefetching, 1 enables it unless laptop_mode is enabled, 2 enables it in the ++presence of laptop_mode, and 3 enables it unconditionally, ignoring whether ++the system is idle or not. 
If set to 0, swap prefetch wil not even try to keep ++record of ram swapped out to have the most minimal impact on performance. ++ ++The default value is 1. ++ ++============================================================== ++ ++swap_prefetch_delay ++ ++This is the time in seconds that swap prefetching is delayed upon finding ++the system is not idle (ie the vm is busy or non-niced cpu load is present). ++ ++The default value is 1. ++ ++============================================================== ++ ++swap_prefetch_sleep ++ ++This is the time in seconds that the swap prefetch kernel thread is put to ++sleep for when the ram is found to be full and it is unable to prefetch ++further. ++ ++The default value is 5. +Index: linux-2.6.22-ck1/include/linux/swap.h +=================================================================== +--- linux-2.6.22-ck1.orig/include/linux/swap.h 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/include/linux/swap.h 2007-07-10 14:55:22.000000000 +1000 +@@ -180,6 +180,7 @@ + /* linux/mm/swap.c */ + extern void FASTCALL(lru_cache_add(struct page *)); + extern void FASTCALL(lru_cache_add_active(struct page *)); ++extern void FASTCALL(lru_cache_add_tail(struct page *)); + extern void FASTCALL(activate_page(struct page *)); + extern void FASTCALL(mark_page_accessed(struct page *)); + extern void lru_add_drain(void); +@@ -188,9 +189,11 @@ + extern void swap_setup(void); + + /* linux/mm/vmscan.c */ +-extern unsigned long try_to_free_pages(struct zone **, gfp_t); ++extern unsigned long try_to_free_pages(struct zone **, gfp_t, ++ struct task_struct *p); + extern unsigned long shrink_all_memory(unsigned long nr_pages); +-extern int vm_swappiness; ++extern int vm_mapped; ++extern int vm_hardmaplimit; + extern int remove_mapping(struct address_space *mapping, struct page *page); + extern long vm_total_pages; + +@@ -237,6 +240,7 @@ + extern struct page * lookup_swap_cache(swp_entry_t); + extern struct page * read_swap_cache_async(swp_entry_t, struct vm_area_struct *vma, + unsigned long addr); ++extern int add_to_swap_cache(struct page *page, swp_entry_t entry); + /* linux/mm/swapfile.c */ + extern long total_swap_pages; + extern unsigned int nr_swapfiles; +Index: linux-2.6.22-ck1/init/Kconfig +=================================================================== +--- linux-2.6.22-ck1.orig/init/Kconfig 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/init/Kconfig 2007-07-10 14:55:22.000000000 +1000 +@@ -105,6 +105,28 @@ + used to provide more virtual memory than the actual RAM present + in your computer. If unsure say Y. + ++config SWAP_PREFETCH ++ bool "Support for prefetching swapped memory" ++ depends on SWAP ++ default y ++ ---help--- ++ This option will allow the kernel to prefetch swapped memory pages ++ when idle. The pages will be kept on both swap and in swap_cache ++ thus avoiding the need for further I/O if either ram or swap space ++ is required. ++ ++ What this will do on workstations is slowly bring back applications ++ that have swapped out after memory intensive workloads back into ++ physical ram if you have free ram at a later stage and the machine ++ is relatively idle. This means that when you come back to your ++ computer after leaving it idle for a while, applications will come ++ to life faster. Note that your swap usage will appear to increase ++ but these are cached pages, can be dropped freely by the vm, and it ++ should stabilise around 50% swap usage maximum. 
++ ++ Workstations and multiuser workstation servers will most likely want ++ to say Y. ++ + config SYSVIPC + bool "System V IPC" + ---help--- +Index: linux-2.6.22-ck1/mm/Makefile +=================================================================== +--- linux-2.6.22-ck1.orig/mm/Makefile 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/mm/Makefile 2007-07-10 14:55:22.000000000 +1000 +@@ -17,6 +17,7 @@ + obj-y += bounce.o + endif + obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o ++obj-$(CONFIG_SWAP_PREFETCH) += swap_prefetch.o + obj-$(CONFIG_HUGETLBFS) += hugetlb.o + obj-$(CONFIG_NUMA) += mempolicy.o + obj-$(CONFIG_SPARSEMEM) += sparse.o +Index: linux-2.6.22-ck1/mm/swap.c +=================================================================== +--- linux-2.6.22-ck1.orig/mm/swap.c 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/mm/swap.c 2007-07-10 14:55:23.000000000 +1000 +@@ -17,6 +17,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -176,6 +177,7 @@ + */ + static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, }; + static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, }; ++static DEFINE_PER_CPU(struct pagevec, lru_add_tail_pvecs) = { 0, }; + + void fastcall lru_cache_add(struct page *page) + { +@@ -197,6 +199,31 @@ + put_cpu_var(lru_add_active_pvecs); + } + ++static void __pagevec_lru_add_tail(struct pagevec *pvec) ++{ ++ int i; ++ struct zone *zone = NULL; ++ ++ for (i = 0; i < pagevec_count(pvec); i++) { ++ struct page *page = pvec->pages[i]; ++ struct zone *pagezone = page_zone(page); ++ ++ if (pagezone != zone) { ++ if (zone) ++ spin_unlock_irq(&zone->lru_lock); ++ zone = pagezone; ++ spin_lock_irq(&zone->lru_lock); ++ } ++ BUG_ON(PageLRU(page)); ++ SetPageLRU(page); ++ add_page_to_inactive_list_tail(zone, page); ++ } ++ if (zone) ++ spin_unlock_irq(&zone->lru_lock); ++ release_pages(pvec->pages, pvec->nr, pvec->cold); ++ pagevec_reinit(pvec); ++} ++ + static void __lru_add_drain(int cpu) + { + struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu); +@@ -207,6 +234,9 @@ + pvec = &per_cpu(lru_add_active_pvecs, cpu); + if (pagevec_count(pvec)) + __pagevec_lru_add_active(pvec); ++ pvec = &per_cpu(lru_add_tail_pvecs, cpu); ++ if (pagevec_count(pvec)) ++ __pagevec_lru_add_tail(pvec); + } + + void lru_add_drain(void) +@@ -403,6 +433,20 @@ + } + + /* ++ * Function used uniquely to put pages back to the lru at the end of the ++ * inactive list to preserve the lru order. 
++ */ ++void fastcall lru_cache_add_tail(struct page *page) ++{ ++ struct pagevec *pvec = &get_cpu_var(lru_add_tail_pvecs); ++ ++ page_cache_get(page); ++ if (!pagevec_add(pvec, page)) ++ __pagevec_lru_add_tail(pvec); ++ put_cpu_var(lru_add_pvecs); ++} ++ ++/* + * Try to drop buffers from the pages in a pagevec + */ + void pagevec_strip(struct pagevec *pvec) +@@ -514,6 +558,9 @@ + * Right now other parts of the system means that we + * _really_ don't want to cluster much more + */ ++ ++ prepare_swap_prefetch(); ++ + #ifdef CONFIG_HOTPLUG_CPU + hotcpu_notifier(cpu_swap_callback, 0); + #endif +Index: linux-2.6.22-ck1/mm/swap_prefetch.c +=================================================================== +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ linux-2.6.22-ck1/mm/swap_prefetch.c 2007-07-10 14:55:22.000000000 +1000 +@@ -0,0 +1,542 @@ ++/* ++ * linux/mm/swap_prefetch.c ++ * ++ * Copyright (C) 2005-2007 Con Kolivas ++ * ++ * Written by Con Kolivas ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * sysctls: ++ * swap_prefetch: 0. Disable swap prefetching ++ * 1. Prefetch only when idle and not with laptop_mode ++ * 2. Prefetch when idle and with laptop_mode ++ * 3. Prefetch at all times. ++ * swap_prefetch_delay: Number of seconds to delay prefetching when system ++ * is not idle. ++ * swap_prefetch_sleep: Number of seconds to put kprefetchd to sleep when ++ * unable to prefetch. ++ */ ++int swap_prefetch __read_mostly = 1; ++int swap_prefetch_delay __read_mostly = 1; ++int swap_prefetch_sleep __read_mostly = 5; ++ ++#define PREFETCH_DELAY (HZ * swap_prefetch_delay) ++#define PREFETCH_SLEEP ((HZ * swap_prefetch_sleep) ? : 1) ++ ++struct swapped_root { ++ unsigned long busy; /* vm busy */ ++ spinlock_t lock; /* protects all data */ ++ struct list_head list; /* MRU list of swapped pages */ ++ struct radix_tree_root swap_tree; /* Lookup tree of pages */ ++ unsigned int count; /* Number of entries */ ++ unsigned int maxcount; /* Maximum entries allowed */ ++ struct kmem_cache *cache; /* Of struct swapped_entry */ ++}; ++ ++static struct swapped_root swapped = { ++ .lock = SPIN_LOCK_UNLOCKED, ++ .list = LIST_HEAD_INIT(swapped.list), ++ .swap_tree = RADIX_TREE_INIT(GFP_ATOMIC), ++}; ++ ++static struct task_struct *kprefetchd_task; ++ ++/* ++ * We check to see no part of the vm is busy. If it is this will interrupt ++ * trickle_swap and wait another PREFETCH_DELAY. Purposefully racy. ++ */ ++inline void delay_swap_prefetch(void) ++{ ++ if (!test_bit(0, &swapped.busy)) ++ __set_bit(0, &swapped.busy); ++} ++ ++/* ++ * If laptop_mode is enabled don't prefetch to avoid hard drives ++ * doing unnecessary spin-ups unless swap_prefetch is explicitly ++ * set to a higher value. ++ */ ++static inline int prefetch_enabled(void) ++{ ++ if (swap_prefetch <= laptop_mode) ++ return 0; ++ return 1; ++} ++ ++static int kprefetchd_awake; ++ ++/* ++ * Drop behind accounting which keeps a list of the most recently used swap ++ * entries. Entries are removed lazily by kprefetchd. 
++ */ ++void add_to_swapped_list(struct page *page) ++{ ++ struct swapped_entry *entry; ++ unsigned long index, flags; ++ ++ if (!prefetch_enabled()) ++ goto out; ++ ++ spin_lock_irqsave(&swapped.lock, flags); ++ if (swapped.count >= swapped.maxcount) { ++ /* ++ * Once the number of entries exceeds maxcount we start ++ * removing the least recently used entries. ++ */ ++ entry = list_entry(swapped.list.next, ++ struct swapped_entry, swapped_list); ++ radix_tree_delete(&swapped.swap_tree, entry->swp_entry.val); ++ list_del(&entry->swapped_list); ++ swapped.count--; ++ } else { ++ entry = kmem_cache_alloc(swapped.cache, GFP_ATOMIC); ++ if (unlikely(!entry)) ++ /* bad, can't allocate more mem */ ++ goto out_locked; ++ } ++ ++ index = page_private(page); ++ entry->swp_entry.val = index; ++ /* ++ * On numa we need to store the node id to ensure that we prefetch to ++ * the same node it came from. ++ */ ++ store_swap_entry_node(entry, page); ++ ++ if (likely(!radix_tree_insert(&swapped.swap_tree, index, entry))) { ++ list_add(&entry->swapped_list, &swapped.list); ++ swapped.count++; ++ } else ++ kmem_cache_free(swapped.cache, entry); ++ ++out_locked: ++ spin_unlock_irqrestore(&swapped.lock, flags); ++out: ++ if (!kprefetchd_awake) ++ wake_up_process(kprefetchd_task); ++ return; ++} ++ ++/* ++ * Removes entries from the swapped_list. The radix tree allows us to quickly ++ * look up the entry from the index without having to iterate over the whole ++ * list. ++ */ ++static void remove_from_swapped_list(const unsigned long index) ++{ ++ struct swapped_entry *entry; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&swapped.lock, flags); ++ entry = radix_tree_delete(&swapped.swap_tree, index); ++ if (likely(entry)) { ++ list_del(&entry->swapped_list); ++ swapped.count--; ++ kmem_cache_free(swapped.cache, entry); ++ } ++ spin_unlock_irqrestore(&swapped.lock, flags); ++} ++ ++enum trickle_return { ++ TRICKLE_SUCCESS, ++ TRICKLE_FAILED, ++ TRICKLE_DELAY, ++}; ++ ++struct node_stats { ++ /* Free ram after a cycle of prefetching */ ++ unsigned long last_free; ++ /* Free ram on this cycle of checking prefetch_suitable */ ++ unsigned long current_free; ++ /* The amount of free ram before we start prefetching */ ++ unsigned long highfree[MAX_NR_ZONES]; ++ /* The amount of free ram where we will stop prefetching */ ++ unsigned long lowfree[MAX_NR_ZONES]; ++ /* highfree or lowfree depending on whether we've hit a watermark */ ++ unsigned long *pointfree[MAX_NR_ZONES]; ++}; ++ ++/* ++ * prefetch_stats stores the free ram data of each node and this is used to ++ * determine if a node is suitable for prefetching into. ++ */ ++struct prefetch_stats { ++ /* Which nodes are currently suited to prefetching */ ++ nodemask_t prefetch_nodes; ++ /* Total pages we've prefetched on this wakeup of kprefetchd */ ++ unsigned long prefetched_pages; ++ struct node_stats node[MAX_NUMNODES]; ++}; ++ ++static struct prefetch_stats sp_stat; ++ ++/* ++ * This tries to read a swp_entry_t into swap cache for swap prefetching. ++ * If it returns TRICKLE_DELAY we should delay further prefetching. 
++ */ ++static enum trickle_return trickle_swap_cache_async(const swp_entry_t entry, ++ const int node) ++{ ++ enum trickle_return ret = TRICKLE_FAILED; ++ unsigned long flags; ++ struct page *page; ++ ++ read_lock_irqsave(&swapper_space.tree_lock, flags); ++ /* Entry may already exist */ ++ page = radix_tree_lookup(&swapper_space.page_tree, entry.val); ++ read_unlock_irqrestore(&swapper_space.tree_lock, flags); ++ if (page) ++ goto out; ++ ++ /* ++ * Get a new page to read from swap. We have already checked the ++ * watermarks so __alloc_pages will not call on reclaim. ++ */ ++ page = alloc_pages_node(node, GFP_HIGHUSER & ~__GFP_WAIT, 0); ++ if (unlikely(!page)) { ++ ret = TRICKLE_DELAY; ++ goto out; ++ } ++ ++ if (add_to_swap_cache(page, entry)) { ++ /* Failed to add to swap cache */ ++ goto out_release; ++ } ++ ++ /* Add them to the tail of the inactive list to preserve LRU order */ ++ lru_cache_add_tail(page); ++ if (unlikely(swap_readpage(NULL, page))) ++ goto out_release; ++ ++ sp_stat.prefetched_pages++; ++ sp_stat.node[node].last_free--; ++ ++ ret = TRICKLE_SUCCESS; ++out_release: ++ page_cache_release(page); ++out: ++ /* ++ * All entries are removed here lazily. This avoids the cost of ++ * remove_from_swapped_list during normal swapin. Thus there are ++ * usually many stale entries. ++ */ ++ remove_from_swapped_list(entry.val); ++ return ret; ++} ++ ++static void clear_last_prefetch_free(void) ++{ ++ int node; ++ ++ /* ++ * Reset the nodes suitable for prefetching to all nodes. We could ++ * update the data to take into account memory hotplug if desired.. ++ */ ++ sp_stat.prefetch_nodes = node_online_map; ++ for_each_node_mask(node, sp_stat.prefetch_nodes) { ++ struct node_stats *ns = &sp_stat.node[node]; ++ ++ ns->last_free = 0; ++ } ++} ++ ++static void clear_current_prefetch_free(void) ++{ ++ int node; ++ ++ sp_stat.prefetch_nodes = node_online_map; ++ for_each_node_mask(node, sp_stat.prefetch_nodes) { ++ struct node_stats *ns = &sp_stat.node[node]; ++ ++ ns->current_free = 0; ++ } ++} ++ ++/* ++ * This updates the high and low watermarks of amount of free ram in each ++ * node used to start and stop prefetching. We prefetch from pages_high * 4 ++ * down to pages_high * 3. ++ */ ++static void examine_free_limits(void) ++{ ++ struct zone *z; ++ ++ for_each_zone(z) { ++ struct node_stats *ns; ++ int idx; ++ ++ if (!populated_zone(z)) ++ continue; ++ ++ ns = &sp_stat.node[zone_to_nid(z)]; ++ idx = zone_idx(z); ++ ns->lowfree[idx] = z->pages_high * 3; ++ ns->highfree[idx] = ns->lowfree[idx] + z->pages_high; ++ ++ if (zone_page_state(z, NR_FREE_PAGES) > ns->highfree[idx]) { ++ /* ++ * We've gotten above the high watermark of free pages ++ * so we can start prefetching till we get to the low ++ * watermark. ++ */ ++ ns->pointfree[idx] = &ns->lowfree[idx]; ++ } ++ } ++} ++ ++/* ++ * We want to be absolutely certain it's ok to start prefetching. ++ */ ++static enum trickle_return prefetch_suitable(void) ++{ ++ enum trickle_return ret = TRICKLE_DELAY; ++ struct zone *z; ++ int node; ++ ++ /* ++ * If swap_prefetch is set to a high value we can ignore load ++ * and prefetch whenever we can. Otherwise we test for vm and ++ * cpu activity. ++ */ ++ if (swap_prefetch < 3) { ++ /* Purposefully racy, may return false positive */ ++ if (test_bit(0, &swapped.busy)) { ++ __clear_bit(0, &swapped.busy); ++ goto out; ++ } ++ ++ /* ++ * above_background_load is expensive so we only perform it ++ * every SWAP_CLUSTER_MAX prefetched_pages. 
++ * We test to see if we're above_background_load as disk ++ * activity even at low priority can cause interrupt induced ++ * scheduling latencies. ++ */ ++ if (!(sp_stat.prefetched_pages % SWAP_CLUSTER_MAX) && ++ above_background_load()) ++ goto out; ++ } ++ clear_current_prefetch_free(); ++ ++ /* ++ * Have some hysteresis between where page reclaiming and prefetching ++ * will occur to prevent ping-ponging between them. ++ */ ++ for_each_zone(z) { ++ struct node_stats *ns; ++ unsigned long free; ++ int idx; ++ ++ if (!populated_zone(z)) ++ continue; ++ ++ node = zone_to_nid(z); ++ ns = &sp_stat.node[node]; ++ idx = zone_idx(z); ++ ++ free = zone_page_state(z, NR_FREE_PAGES); ++ if (free < *ns->pointfree[idx]) { ++ /* ++ * Free pages have dropped below the low watermark so ++ * we won't start prefetching again till we hit the ++ * high watermark of free pages. ++ */ ++ ns->pointfree[idx] = &ns->highfree[idx]; ++ node_clear(node, sp_stat.prefetch_nodes); ++ continue; ++ } ++ ns->current_free += free; ++ } ++ ++ /* ++ * We iterate over each node testing to see if it is suitable for ++ * prefetching and clear the nodemask if it is not. ++ */ ++ for_each_node_mask(node, sp_stat.prefetch_nodes) { ++ struct node_stats *ns = &sp_stat.node[node]; ++ ++ /* ++ * We check to see that pages are not being allocated ++ * elsewhere at any significant rate implying any ++ * degree of memory pressure (eg during file reads) ++ */ ++ if (ns->last_free) { ++ if (ns->current_free + SWAP_CLUSTER_MAX < ++ ns->last_free) { ++ ns->last_free = ns->current_free; ++ node_clear(node, ++ sp_stat.prefetch_nodes); ++ continue; ++ } ++ } else ++ ns->last_free = ns->current_free; ++ ++ /* We shouldn't prefetch when we are doing writeback */ ++ if (node_page_state(node, NR_WRITEBACK)) ++ node_clear(node, sp_stat.prefetch_nodes); ++ } ++ ++ /* Nothing suitable, put kprefetchd back to sleep */ ++ if (nodes_empty(sp_stat.prefetch_nodes)) ++ return TRICKLE_FAILED; ++ ++ /* Survived all that? Hooray we can prefetch! */ ++ ret = TRICKLE_SUCCESS; ++out: ++ return ret; ++} ++ ++/* ++ * trickle_swap is the main function that initiates the swap prefetching. It ++ * first checks to see if the busy flag is set, and does not prefetch if it ++ * is, as the flag implied we are low on memory or swapping in currently. ++ * Otherwise it runs until prefetch_suitable fails which occurs when the ++ * vm is busy, we prefetch to the watermark, the list is empty or we have ++ * iterated over all entries once. ++ */ ++static enum trickle_return trickle_swap(void) ++{ ++ enum trickle_return suitable, ret = TRICKLE_DELAY; ++ struct swapped_entry *pos, *n; ++ unsigned long flags; ++ ++ if (!prefetch_enabled()) ++ return ret; ++ ++ examine_free_limits(); ++ suitable = prefetch_suitable(); ++ if (suitable != TRICKLE_SUCCESS) ++ return suitable; ++ if (list_empty(&swapped.list)) { ++ kprefetchd_awake = 0; ++ return TRICKLE_FAILED; ++ } ++ ++ spin_lock_irqsave(&swapped.lock, flags); ++ list_for_each_entry_safe_reverse(pos, n, &swapped.list, swapped_list) { ++ swp_entry_t swp_entry; ++ int node; ++ ++ spin_unlock_irqrestore(&swapped.lock, flags); ++ cond_resched(); ++ suitable = prefetch_suitable(); ++ if (suitable != TRICKLE_SUCCESS) { ++ ret = suitable; ++ goto out_unlocked; ++ } ++ ++ spin_lock_irqsave(&swapped.lock, flags); ++ if (unlikely(!pos)) ++ continue; ++ node = get_swap_entry_node(pos); ++ if (!node_isset(node, sp_stat.prefetch_nodes)) { ++ /* ++ * We found an entry that belongs to a node that is ++ * not suitable for prefetching so skip it. 
++ */ ++ continue; ++ } ++ swp_entry = pos->swp_entry; ++ spin_unlock_irqrestore(&swapped.lock, flags); ++ ++ if (trickle_swap_cache_async(swp_entry, node) == TRICKLE_DELAY) ++ goto out_unlocked; ++ spin_lock_irqsave(&swapped.lock, flags); ++ } ++ spin_unlock_irqrestore(&swapped.lock, flags); ++ ++out_unlocked: ++ if (sp_stat.prefetched_pages) { ++ lru_add_drain(); ++ sp_stat.prefetched_pages = 0; ++ } ++ return ret; ++} ++ ++static int kprefetchd(void *__unused) ++{ ++ struct sched_param param = { .sched_priority = 0 }; ++ ++ sched_setscheduler(current, SCHED_BATCH, ¶m); ++ set_user_nice(current, 19); ++ /* Set ioprio to lowest if supported by i/o scheduler */ ++ sys_ioprio_set(IOPRIO_WHO_PROCESS, IOPRIO_BE_NR - 1, IOPRIO_CLASS_BE); ++ ++ while (!kthread_should_stop()) { ++ try_to_freeze(); ++ ++ if (!kprefetchd_awake) { ++ set_current_state(TASK_INTERRUPTIBLE); ++ schedule(); ++ kprefetchd_awake = 1; ++ } ++ ++ if (trickle_swap() == TRICKLE_FAILED) ++ schedule_timeout_interruptible(PREFETCH_SLEEP); ++ else ++ schedule_timeout_interruptible(PREFETCH_DELAY); ++ clear_last_prefetch_free(); ++ } ++ return 0; ++} ++ ++/* ++ * Create kmem cache for swapped entries ++ */ ++void __init prepare_swap_prefetch(void) ++{ ++ struct zone *zone; ++ ++ swapped.cache = kmem_cache_create("swapped_entry", ++ sizeof(struct swapped_entry), 0, SLAB_PANIC, NULL, NULL); ++ ++ /* ++ * We set the limit to more entries than the physical ram. ++ * We remove entries lazily so we need some headroom. ++ */ ++ swapped.maxcount = nr_free_pagecache_pages() * 2; ++ ++ for_each_zone(zone) { ++ struct node_stats *ns; ++ int idx; ++ ++ if (!populated_zone(zone)) ++ continue; ++ ++ ns = &sp_stat.node[zone_to_nid(zone)]; ++ idx = zone_idx(zone); ++ ns->pointfree[idx] = &ns->highfree[idx]; ++ } ++} ++ ++static int __init kprefetchd_init(void) ++{ ++ kprefetchd_task = kthread_run(kprefetchd, NULL, "kprefetchd"); ++ ++ return 0; ++} ++ ++static void __exit kprefetchd_exit(void) ++{ ++ kthread_stop(kprefetchd_task); ++} ++ ++module_init(kprefetchd_init); ++module_exit(kprefetchd_exit); +Index: linux-2.6.22-ck1/mm/swap_state.c +=================================================================== +--- linux-2.6.22-ck1.orig/mm/swap_state.c 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/mm/swap_state.c 2007-07-10 14:55:22.000000000 +1000 +@@ -10,6 +10,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -95,7 +96,7 @@ + return error; + } + +-static int add_to_swap_cache(struct page *page, swp_entry_t entry) ++int add_to_swap_cache(struct page *page, swp_entry_t entry) + { + int error; + +@@ -148,6 +149,9 @@ + swp_entry_t entry; + int err; + ++ /* Swap prefetching is delayed if we're swapping pages */ ++ delay_swap_prefetch(); ++ + BUG_ON(!PageLocked(page)); + + for (;;) { +@@ -320,6 +324,9 @@ + struct page *found_page, *new_page = NULL; + int err; + ++ /* Swap prefetching is delayed if we're already reading from swap */ ++ delay_swap_prefetch(); ++ + do { + /* + * First check the swap cache. Since this is normally +Index: linux-2.6.22-ck1/mm/vmscan.c +=================================================================== +--- linux-2.6.22-ck1.orig/mm/vmscan.c 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/mm/vmscan.c 2007-07-10 14:55:23.000000000 +1000 +@@ -16,6 +16,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -36,6 +37,7 @@ + #include + #include + #include ++#include + #include + + #include +@@ -63,7 +65,7 @@ + * whole list at once. 
*/ + int swap_cluster_max; + +- int swappiness; ++ int mapped; + + int all_unreclaimable; + }; +@@ -110,9 +112,10 @@ + #endif + + /* +- * From 0 .. 100. Higher means more swappy. ++ * From 0 .. 100. Lower means more swappy. + */ +-int vm_swappiness = 60; ++int vm_mapped __read_mostly = 66; ++int vm_hardmaplimit __read_mostly = 1; + long vm_total_pages; /* The total number of pages which the VM controls */ + + static LIST_HEAD(shrinker_list); +@@ -803,10 +806,14 @@ + * The distress ratio is important - we don't want to start + * going oom. + * +- * A 100% value of vm_swappiness overrides this algorithm +- * altogether. ++ * This distress value is ignored if we apply a hardmaplimit except ++ * in extreme distress. ++ * ++ * A 0% value of vm_mapped overrides this algorithm altogether. + */ +- swap_tendency = mapped_ratio / 2 + distress + sc->swappiness; ++ swap_tendency = mapped_ratio * 100 / (sc->mapped + 1); ++ if (!vm_hardmaplimit || distress == 100) ++ swap_tendency += distress; + + /* + * Now use this metric to decide whether to start moving mapped +@@ -955,6 +962,41 @@ + } + + /* ++ * Helper functions to adjust nice level of kswapd, based on the priority of ++ * the task (p) that called it. If it is already higher priority we do not ++ * demote its nice level since it is still working on behalf of a higher ++ * priority task. With kernel threads we leave it at nice 0. ++ * ++ * We don't ever run kswapd real time, so if a real time task calls kswapd we ++ * set it to highest SCHED_NORMAL priority. ++ */ ++static int effective_sc_prio(struct task_struct *p) ++{ ++ if (likely(p->mm)) { ++ if (rt_task(p)) ++ return -20; ++ if (idleprio_task(p)) ++ return 19; ++ return task_nice(p); ++ } ++ return 0; ++} ++ ++static void set_kswapd_nice(struct task_struct *kswapd, struct task_struct *p, ++ int active) ++{ ++ long nice = effective_sc_prio(p); ++ ++ if (task_nice(kswapd) > nice || !active) ++ set_user_nice(kswapd, nice); ++} ++ ++static int sc_priority(struct task_struct *p) ++{ ++ return (DEF_PRIORITY + (DEF_PRIORITY * effective_sc_prio(p) / 40)); ++} ++ ++/* + * This is the direct reclaim path, for page-allocating processes. We only + * try to reclaim pages from zones which will satisfy the caller's allocation + * request. +@@ -1011,7 +1053,8 @@ + * holds filesystem locks which prevent writeout this might not work, and the + * allocation attempt will fail. 
+ */ +-unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask) ++unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask, ++ struct task_struct *p) + { + int priority; + int ret = 0; +@@ -1019,15 +1062,20 @@ + unsigned long nr_reclaimed = 0; + struct reclaim_state *reclaim_state = current->reclaim_state; + unsigned long lru_pages = 0; +- int i; ++ int i, scan_priority = DEF_PRIORITY; + struct scan_control sc = { + .gfp_mask = gfp_mask, + .may_writepage = !laptop_mode, + .swap_cluster_max = SWAP_CLUSTER_MAX, + .may_swap = 1, +- .swappiness = vm_swappiness, ++ .mapped = vm_mapped, + }; + ++ if (p) ++ scan_priority = sc_priority(p); ++ ++ delay_swap_prefetch(); ++ + count_vm_event(ALLOCSTALL); + + for (i = 0; zones[i] != NULL; i++) { +@@ -1040,7 +1088,7 @@ + + zone_page_state(zone, NR_INACTIVE); + } + +- for (priority = DEF_PRIORITY; priority >= 0; priority--) { ++ for (priority = scan_priority; priority >= 0; priority--) { + sc.nr_scanned = 0; + if (!priority) + disable_swap_token(); +@@ -1070,7 +1118,7 @@ + } + + /* Take a nap, wait for some writeback to complete */ +- if (sc.nr_scanned && priority < DEF_PRIORITY - 2) ++ if (sc.nr_scanned && priority < scan_priority - 2) + congestion_wait(WRITE, HZ/10); + } + /* top priority shrink_caches still had more to do? don't OOM, then */ +@@ -1120,9 +1168,9 @@ + */ + static unsigned long balance_pgdat(pg_data_t *pgdat, int order) + { +- int all_zones_ok; ++ int all_zones_ok = 0; + int priority; +- int i; ++ int i, scan_priority; + unsigned long total_scanned; + unsigned long nr_reclaimed; + struct reclaim_state *reclaim_state = current->reclaim_state; +@@ -1130,7 +1178,7 @@ + .gfp_mask = GFP_KERNEL, + .may_swap = 1, + .swap_cluster_max = SWAP_CLUSTER_MAX, +- .swappiness = vm_swappiness, ++ .mapped = vm_mapped, + }; + /* + * temp_priority is used to remember the scanning priority at which +@@ -1138,6 +1186,8 @@ + */ + int temp_priority[MAX_NR_ZONES]; + ++ scan_priority = sc_priority(pgdat->kswapd); ++ + loop_again: + total_scanned = 0; + nr_reclaimed = 0; +@@ -1145,9 +1195,9 @@ + count_vm_event(PAGEOUTRUN); + + for (i = 0; i < pgdat->nr_zones; i++) +- temp_priority[i] = DEF_PRIORITY; ++ temp_priority[i] = scan_priority; + +- for (priority = DEF_PRIORITY; priority >= 0; priority--) { ++ for (priority = scan_priority; priority >= 0; priority--) { + int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ + unsigned long lru_pages = 0; + +@@ -1163,15 +1213,22 @@ + */ + for (i = pgdat->nr_zones - 1; i >= 0; i--) { + struct zone *zone = pgdat->node_zones + i; ++ unsigned long watermark; + + if (!populated_zone(zone)) + continue; + +- if (zone->all_unreclaimable && priority != DEF_PRIORITY) ++ if (zone->all_unreclaimable && priority != scan_priority) + continue; + +- if (!zone_watermark_ok(zone, order, zone->pages_high, +- 0, 0)) { ++ /* ++ * The watermark is relaxed depending on the ++ * level of "priority" till it drops to ++ * pages_high. 
++ */ ++ watermark = zone->pages_high + (zone->pages_high * ++ priority / scan_priority); ++ if (!zone_watermark_ok(zone, order, watermark, 0, 0)) { + end_zone = i; + break; + } +@@ -1198,14 +1255,18 @@ + for (i = 0; i <= end_zone; i++) { + struct zone *zone = pgdat->node_zones + i; + int nr_slab; ++ unsigned long watermark; + + if (!populated_zone(zone)) + continue; + +- if (zone->all_unreclaimable && priority != DEF_PRIORITY) ++ if (zone->all_unreclaimable && priority != scan_priority) + continue; + +- if (!zone_watermark_ok(zone, order, zone->pages_high, ++ watermark = zone->pages_high + (zone->pages_high * ++ priority / scan_priority); ++ ++ if (!zone_watermark_ok(zone, order, watermark, + end_zone, 0)) + all_zones_ok = 0; + temp_priority[i] = priority; +@@ -1238,7 +1299,7 @@ + * OK, kswapd is getting into trouble. Take a nap, then take + * another pass across the zones. + */ +- if (total_scanned && priority < DEF_PRIORITY - 2) ++ if (total_scanned && priority < scan_priority - 2) + congestion_wait(WRITE, HZ/10); + + /* +@@ -1272,6 +1333,8 @@ + return nr_reclaimed; + } + ++#define WT_EXPIRY (HZ * 5) /* Time to wakeup watermark_timer */ ++ + /* + * The background pageout daemon, started as a kernel thread + * from the init process. +@@ -1319,6 +1382,8 @@ + for ( ; ; ) { + unsigned long new_order; + ++ /* kswapd has been busy so delay watermark_timer */ ++ mod_timer(&pgdat->watermark_timer, jiffies + WT_EXPIRY); + prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); + new_order = pgdat->kswapd_max_order; + pgdat->kswapd_max_order = 0; +@@ -1332,6 +1397,7 @@ + if (!freezing(current)) + schedule(); + ++ set_user_nice(tsk, 0); + order = pgdat->kswapd_max_order; + } + finish_wait(&pgdat->kswapd_wait, &wait); +@@ -1349,9 +1415,10 @@ + /* + * A zone is low on free memory, so wake its kswapd task to service it. 
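
Both zone loops in balance_pgdat() now test against a watermark that is relaxed by the current priority, as the comment above describes. The following small program is an illustration only (not part of the patch) of how the target decays from roughly twice pages_high on the first pass down to plain pages_high by the time priority reaches 0; the pages_high value is invented.

#include <stdio.h>

/*
 * Sketch of the relaxed kswapd watermark from balance_pgdat():
 * watermark = pages_high + pages_high * priority / scan_priority.
 * pages_high = 1024 pages is just an example value.
 */
int main(void)
{
        unsigned long pages_high = 1024;
        int scan_priority = 12;
        int priority;

        for (priority = scan_priority; priority >= 0; priority -= 4) {
                unsigned long watermark = pages_high +
                        pages_high * priority / scan_priority;
                printf("priority %2d -> keep at least %lu pages free\n",
                       priority, watermark);
        }
        return 0;
}
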
+ */ +-void wakeup_kswapd(struct zone *zone, int order) ++void wakeup_kswapd(struct zone *zone, int order, struct task_struct *p) + { + pg_data_t *pgdat; ++ int active; + + if (!populated_zone(zone)) + return; +@@ -1363,7 +1430,9 @@ + pgdat->kswapd_max_order = order; + if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) + return; +- if (!waitqueue_active(&pgdat->kswapd_wait)) ++ active = waitqueue_active(&pgdat->kswapd_wait); ++ set_kswapd_nice(pgdat->kswapd, p, active); ++ if (!active) + return; + wake_up_interruptible(&pgdat->kswapd_wait); + } +@@ -1382,6 +1451,8 @@ + struct zone *zone; + unsigned long nr_to_scan, ret = 0; + ++ delay_swap_prefetch(); ++ + for_each_zone(zone) { + + if (!populated_zone(zone)) +@@ -1441,7 +1512,7 @@ + .may_swap = 0, + .swap_cluster_max = nr_pages, + .may_writepage = 1, +- .swappiness = vm_swappiness, ++ .mapped = vm_mapped, + }; + + current->reclaim_state = &reclaim_state; +@@ -1476,7 +1547,7 @@ + /* Force reclaiming mapped pages in the passes #3 and #4 */ + if (pass > 2) { + sc.may_swap = 1; +- sc.swappiness = 100; ++ sc.mapped = 0; + } + + for (prio = DEF_PRIORITY; prio >= 0; prio--) { +@@ -1540,20 +1611,57 @@ + } + + /* ++ * We wake up kswapd every WT_EXPIRY till free ram is above pages_lots ++ */ ++static void watermark_wakeup(unsigned long data) ++{ ++ pg_data_t *pgdat = (pg_data_t *)data; ++ struct timer_list *wt = &pgdat->watermark_timer; ++ int i; ++ ++ if (!waitqueue_active(&pgdat->kswapd_wait) || above_background_load()) ++ goto out; ++ for (i = pgdat->nr_zones - 1; i >= 0; i--) { ++ struct zone *z = pgdat->node_zones + i; ++ ++ if (!populated_zone(z) || is_highmem(z)) { ++ /* We are better off leaving highmem full */ ++ continue; ++ } ++ if (!zone_watermark_ok(z, 0, z->pages_lots, 0, 0)) { ++ wake_up_interruptible(&pgdat->kswapd_wait); ++ goto out; ++ } ++ } ++out: ++ mod_timer(wt, jiffies + WT_EXPIRY); ++ return; ++} ++ ++/* + * This kswapd start function will be called by init and node-hot-add. + * On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added. 
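
watermark_wakeup() periodically prods an idle kswapd whenever a lowmem zone has dropped below the new pages_lots level, so free memory is topped up in the background ahead of demand; the timer is armed in kswapd_run() just below and re-armed every WT_EXPIRY (5 seconds). The sketch that follows only illustrates where pages_lots sits in the per-zone watermark ladder, using the arithmetic from the mm/page_alloc.c hunk later in this patch; treating tmp as roughly equal to pages_min is an approximation that holds for lowmem zones.

#include <stdio.h>

/*
 * Sketch of the per-zone watermark ladder with the new pages_lots level
 * (ladder arithmetic from the mm/page_alloc.c hunk further down).  For a
 * lowmem zone tmp is roughly pages_min, so pages_lots lands at about
 * twice pages_min.  watermark_wakeup() wakes an idle kswapd whenever
 * free pages fall below pages_lots.
 */
int main(void)
{
        unsigned long pages_min = 1000, tmp = 1000;   /* example values */
        unsigned long pages_low  = pages_min + (tmp >> 2);
        unsigned long pages_high = pages_min + (tmp >> 1);
        unsigned long pages_lots = pages_min + tmp;

        printf("min %lu  low %lu  high %lu  lots %lu\n",
               pages_min, pages_low, pages_high, pages_lots);
        return 0;
}
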
+ */ + int kswapd_run(int nid) + { + pg_data_t *pgdat = NODE_DATA(nid); ++ struct timer_list *wt; + int ret = 0; + + if (pgdat->kswapd) + return 0; + ++ wt = &pgdat->watermark_timer; ++ init_timer(wt); ++ wt->data = (unsigned long)pgdat; ++ wt->function = watermark_wakeup; ++ wt->expires = jiffies + WT_EXPIRY; ++ add_timer(wt); ++ + pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid); + if (IS_ERR(pgdat->kswapd)) { + /* failure at boot is fatal */ ++ del_timer(wt); + BUG_ON(system_state == SYSTEM_BOOTING); + printk("Failed to start kswapd on node %d\n",nid); + ret = -1; +@@ -1624,7 +1732,7 @@ + .swap_cluster_max = max_t(unsigned long, nr_pages, + SWAP_CLUSTER_MAX), + .gfp_mask = gfp_mask, +- .swappiness = vm_swappiness, ++ .mapped = vm_mapped, + }; + unsigned long slab_reclaimable; + +Index: linux-2.6.22-ck1/include/linux/mm_inline.h +=================================================================== +--- linux-2.6.22-ck1.orig/include/linux/mm_inline.h 2007-07-10 14:54:59.000000000 +1000 ++++ linux-2.6.22-ck1/include/linux/mm_inline.h 2007-07-10 14:55:22.000000000 +1000 +@@ -13,6 +13,13 @@ + } + + static inline void ++add_page_to_inactive_list_tail(struct zone *zone, struct page *page) ++{ ++ list_add_tail(&page->lru, &zone->inactive_list); ++ __inc_zone_state(zone, NR_INACTIVE); ++} ++ ++static inline void + del_page_from_active_list(struct zone *zone, struct page *page) + { + list_del(&page->lru); +Index: linux-2.6.22-ck1/include/linux/swap-prefetch.h +=================================================================== +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ linux-2.6.22-ck1/include/linux/swap-prefetch.h 2007-07-10 14:55:22.000000000 +1000 +@@ -0,0 +1,53 @@ ++#ifndef SWAP_PREFETCH_H_INCLUDED ++#define SWAP_PREFETCH_H_INCLUDED ++ ++#ifdef CONFIG_SWAP_PREFETCH ++/* mm/swap_prefetch.c */ ++extern int swap_prefetch; ++extern int swap_prefetch_delay; ++extern int swap_prefetch_sleep; ++ ++struct swapped_entry { ++ swp_entry_t swp_entry; /* The actual swap entry */ ++ struct list_head swapped_list; /* Linked list of entries */ ++#if MAX_NUMNODES > 1 ++ int node; /* Node id */ ++#endif ++} __attribute__((packed)); ++ ++static inline void store_swap_entry_node(struct swapped_entry *entry, ++ struct page *page) ++{ ++#if MAX_NUMNODES > 1 ++ entry->node = page_to_nid(page); ++#endif ++} ++ ++static inline int get_swap_entry_node(struct swapped_entry *entry) ++{ ++#if MAX_NUMNODES > 1 ++ return entry->node; ++#else ++ return 0; ++#endif ++} ++ ++extern void add_to_swapped_list(struct page *page); ++extern void delay_swap_prefetch(void); ++extern void prepare_swap_prefetch(void); ++ ++#else /* CONFIG_SWAP_PREFETCH */ ++static inline void add_to_swapped_list(struct page *__unused) ++{ ++} ++ ++static inline void prepare_swap_prefetch(void) ++{ ++} ++ ++static inline void delay_swap_prefetch(void) ++{ ++} ++#endif /* CONFIG_SWAP_PREFETCH */ ++ ++#endif /* SWAP_PREFETCH_H_INCLUDED */ +Index: linux-2.6.22-ck1/mm/page_io.c +=================================================================== +--- linux-2.6.22-ck1.orig/mm/page_io.c 2007-07-10 14:55:00.000000000 +1000 ++++ linux-2.6.22-ck1/mm/page_io.c 2007-07-10 14:55:22.000000000 +1000 +@@ -17,6 +17,7 @@ + #include + #include + #include ++#include + #include + + static struct bio *get_swap_bio(gfp_t gfp_flags, pgoff_t index, +@@ -118,6 +119,7 @@ + ret = -ENOMEM; + goto out; + } ++ add_to_swapped_list(page); + if (wbc->sync_mode == WB_SYNC_ALL) + rw |= (1 << BIO_RW_SYNC); + count_vm_event(PSWPOUT); +Index: 
linux-2.6.22-ck1/include/linux/sysctl.h +=================================================================== +--- linux-2.6.22-ck1.orig/include/linux/sysctl.h 2007-07-10 14:54:59.000000000 +1000 ++++ linux-2.6.22-ck1/include/linux/sysctl.h 2007-07-10 14:55:22.000000000 +1000 +@@ -190,7 +190,7 @@ + VM_OVERCOMMIT_RATIO=16, /* percent of RAM to allow overcommit in */ + VM_PAGEBUF=17, /* struct: Control pagebuf parameters */ + VM_HUGETLB_PAGES=18, /* int: Number of available Huge Pages */ +- VM_SWAPPINESS=19, /* Tendency to steal mapped memory */ ++ VM_UNUSED19=19, /* was: Tendency to steal mapped memory */ + VM_LOWMEM_RESERVE_RATIO=20,/* reservation ratio for lower memory zones */ + VM_MIN_FREE_KBYTES=21, /* Minimum free kilobytes to maintain */ + VM_MAX_MAP_COUNT=22, /* int: Maximum number of mmaps/address-space */ +Index: linux-2.6.22-ck1/include/linux/mmzone.h +=================================================================== +--- linux-2.6.22-ck1.orig/include/linux/mmzone.h 2007-07-10 14:54:59.000000000 +1000 ++++ linux-2.6.22-ck1/include/linux/mmzone.h 2007-07-10 14:55:23.000000000 +1000 +@@ -13,6 +13,7 @@ + #include + #include + #include ++#include + #include + #include + +@@ -181,7 +182,7 @@ + + struct zone { + /* Fields commonly accessed by the page allocator */ +- unsigned long pages_min, pages_low, pages_high; ++ unsigned long pages_min, pages_low, pages_high, pages_lots; + /* + * We don't know if the memory that we're going to allocate will be freeable + * or/and it will be released eventually, so to avoid totally wasting several +@@ -452,6 +453,7 @@ + wait_queue_head_t kswapd_wait; + struct task_struct *kswapd; + int kswapd_max_order; ++ struct timer_list watermark_timer; + } pg_data_t; + + #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) +@@ -468,7 +470,7 @@ + void get_zone_counts(unsigned long *active, unsigned long *inactive, + unsigned long *free); + void build_all_zonelists(void); +-void wakeup_kswapd(struct zone *zone, int order); ++void wakeup_kswapd(struct zone *zone, int order, struct task_struct *p); + int zone_watermark_ok(struct zone *z, int order, unsigned long mark, + int classzone_idx, int alloc_flags); + enum memmap_context { +Index: linux-2.6.22-ck1/mm/page_alloc.c +=================================================================== +--- linux-2.6.22-ck1.orig/mm/page_alloc.c 2007-07-10 14:54:59.000000000 +1000 ++++ linux-2.6.22-ck1/mm/page_alloc.c 2007-07-10 14:55:22.000000000 +1000 +@@ -1250,7 +1250,7 @@ + goto nopage; + + for (z = zonelist->zones; *z; z++) +- wakeup_kswapd(*z, order); ++ wakeup_kswapd(*z, order, p); + + /* + * OK, we're below the kswapd watermark and have kicked background +@@ -1314,7 +1314,7 @@ + reclaim_state.reclaimed_slab = 0; + p->reclaim_state = &reclaim_state; + +- did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask); ++ did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask, p); + + p->reclaim_state = NULL; + p->flags &= ~PF_MEMALLOC; +@@ -1570,6 +1570,7 @@ + " min:%lukB" + " low:%lukB" + " high:%lukB" ++ " lots:%lukB" + " active:%lukB" + " inactive:%lukB" + " present:%lukB" +@@ -1581,6 +1582,7 @@ + K(zone->pages_min), + K(zone->pages_low), + K(zone->pages_high), ++ K(zone->pages_lots), + K(zone_page_state(zone, NR_ACTIVE)), + K(zone_page_state(zone, NR_INACTIVE)), + K(zone->present_pages), +@@ -3142,6 +3144,7 @@ + + zone->pages_low = zone->pages_min + (tmp >> 2); + zone->pages_high = zone->pages_min + (tmp >> 1); ++ zone->pages_lots = zone->pages_min + tmp; + 
spin_unlock_irqrestore(&zone->lru_lock, flags); + } + +Index: linux-2.6.22-ck1/fs/buffer.c +=================================================================== +--- linux-2.6.22-ck1.orig/fs/buffer.c 2007-07-10 14:54:59.000000000 +1000 ++++ linux-2.6.22-ck1/fs/buffer.c 2007-07-10 14:55:22.000000000 +1000 +@@ -356,7 +356,7 @@ + for_each_online_pgdat(pgdat) { + zones = pgdat->node_zonelists[gfp_zone(GFP_NOFS)].zones; + if (*zones) +- try_to_free_pages(zones, GFP_NOFS); ++ try_to_free_pages(zones, GFP_NOFS, NULL); + } + } + +Index: linux-2.6.22-ck1/mm/filemap.c +=================================================================== +--- linux-2.6.22-ck1.orig/mm/filemap.c 2007-07-10 14:54:59.000000000 +1000 ++++ linux-2.6.22-ck1/mm/filemap.c 2007-07-10 14:55:23.000000000 +1000 +@@ -466,6 +466,16 @@ + return ret; + } + ++int add_to_page_cache_lru_tail(struct page *page, ++ struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask) ++{ ++ int ret = add_to_page_cache(page, mapping, offset, gfp_mask); ++ ++ if (ret == 0) ++ lru_cache_add_tail(page); ++ return ret; ++} ++ + #ifdef CONFIG_NUMA + struct page *__page_cache_alloc(gfp_t gfp) + { +@@ -839,6 +849,34 @@ + ra->ra_pages /= 4; + } + ++/* ++ * Sysctl which determines whether we should read from large files to the ++ * tail of the inactive lru list. ++ */ ++int vm_tail_largefiles __read_mostly = 1; ++ ++static inline int nr_mapped(void) ++{ ++ return global_page_state(NR_FILE_MAPPED) + ++ global_page_state(NR_ANON_PAGES); ++} ++ ++/* ++ * This examines how large in pages a file size is and returns 1 if it is ++ * more than half the unmapped ram. Avoid doing read_page_state which is ++ * expensive unless we already know it is likely to be large enough. ++ */ ++static int large_isize(unsigned long nr_pages) ++{ ++ if (nr_pages * 6 > vm_total_pages) { ++ unsigned long unmapped_ram = vm_total_pages - nr_mapped(); ++ ++ if (nr_pages * 2 > unmapped_ram) ++ return 1; ++ } ++ return 0; ++} ++ + /** + * do_generic_mapping_read - generic file read routine + * @mapping: address_space to be read +@@ -1051,8 +1089,19 @@ + goto out; + } + } +- error = add_to_page_cache_lru(cached_page, mapping, +- index, GFP_KERNEL); ++ ++ /* ++ * If we know the file is large we add the pages read to the ++ * end of the lru as we're unlikely to be able to cache the ++ * whole file in ram so make those pages the first to be ++ * dropped if not referenced soon. ++ */ ++ if (vm_tail_largefiles && large_isize(end_index)) ++ error = add_to_page_cache_lru_tail(cached_page, ++ mapping, index, GFP_KERNEL); ++ else ++ error = add_to_page_cache_lru(cached_page, mapping, ++ index, GFP_KERNEL); + if (error) { + if (error == -EEXIST) + goto find_page; +Index: linux-2.6.22-ck1/Documentation/filesystems/proc.txt +=================================================================== +--- linux-2.6.22-ck1.orig/Documentation/filesystems/proc.txt 2007-07-10 14:54:59.000000000 +1000 ++++ linux-2.6.22-ck1/Documentation/filesystems/proc.txt 2007-07-10 14:55:23.000000000 +1000 +@@ -1333,6 +1333,14 @@ + As this is a non-destructive operation and dirty objects are not freeable, the + user should run `sync' first. + ++tail_largefiles ++--------------- ++ ++When enabled reads from large files to the tail end of the inactive lru list. ++This means that any cache from reading large files is dropped very quickly, ++preventing loss of mapped ram and useful pagecache when large files are read. ++This does, however, make caching less effective when working with large files. 
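
The tail_largefiles logic above only kicks in for files it considers large: bigger than about a sixth of total RAM and more than half of the currently unmapped RAM, in which case their pages are queued at the tail of the inactive list so they are evicted first. Below is a self-contained, illustrative rendering of that test (not part of the patch); the RAM and file sizes are invented example numbers assuming 4 KiB pages.

#include <stdio.h>

/*
 * Standalone sketch of large_isize() from the mm/filemap.c hunk above.
 * All page counts are illustrative; 1 page = 4 KiB is assumed.
 */
static int large_isize(unsigned long file_pages, unsigned long total_pages,
                       unsigned long mapped_pages)
{
        if (file_pages * 6 > total_pages) {
                unsigned long unmapped = total_pages - mapped_pages;

                if (file_pages * 2 > unmapped)
                        return 1;
        }
        return 0;
}

int main(void)
{
        unsigned long total = 262144;          /* 1 GiB of 4 KiB pages */
        unsigned long mapped = 131072;         /* half of it mapped    */

        /* 700 MiB file: bigger than 1/6 of RAM and than half the unmapped RAM */
        printf("700 MiB file -> tail of LRU? %d\n",
               large_isize(179200, total, mapped));   /* 1 */
        /* 100 MiB file: fits comfortably, cache it normally */
        printf("100 MiB file -> tail of LRU? %d\n",
               large_isize(25600, total, mapped));    /* 0 */
        return 0;
}
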
++ + + 2.5 /proc/sys/dev - Device specific parameters + ---------------------------------------------- +Index: linux-2.6.22-ck1/arch/i386/Kconfig +=================================================================== +--- linux-2.6.22-ck1.orig/arch/i386/Kconfig 2007-07-10 14:54:59.000000000 +1000 ++++ linux-2.6.22-ck1/arch/i386/Kconfig 2007-07-10 14:55:23.000000000 +1000 +@@ -550,7 +550,7 @@ + + choice + depends on EXPERIMENTAL +- prompt "Memory split" if EMBEDDED ++ prompt "Memory split" + default VMSPLIT_3G + help + Select the desired split between kernel and user memory. +@@ -569,17 +569,17 @@ + option alone! + + config VMSPLIT_3G +- bool "3G/1G user/kernel split" ++ bool "Default 896MB lowmem (3G/1G user/kernel split)" + config VMSPLIT_3G_OPT + depends on !HIGHMEM +- bool "3G/1G user/kernel split (for full 1G low memory)" ++ bool "1GB lowmem (3G/1G user/kernel split)" + config VMSPLIT_2G +- bool "2G/2G user/kernel split" ++ bool "2GB lowmem (2G/2G user/kernel split)" + config VMSPLIT_2G_OPT + depends on !HIGHMEM +- bool "2G/2G user/kernel split (for full 2G low memory)" ++ bool "2GB lowmem (2G/2G user/kernel split)" + config VMSPLIT_1G +- bool "1G/3G user/kernel split" ++ bool "3GB lowmem (1G/3G user/kernel split)" + endchoice + + config PAGE_OFFSET +Index: linux-2.6.22-ck1/kernel/Kconfig.hz +=================================================================== +--- linux-2.6.22-ck1.orig/kernel/Kconfig.hz 2007-07-10 14:54:59.000000000 +1000 ++++ linux-2.6.22-ck1/kernel/Kconfig.hz 2007-07-10 14:55:24.000000000 +1000 +@@ -4,7 +4,7 @@ + + choice + prompt "Timer frequency" +- default HZ_250 ++ default HZ_1000 + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -13,8 +13,7 @@ + contention and cacheline bounces as a result of timer interrupts. + Note that the timer interrupt occurs on each processor in an SMP + environment leading to NR_CPUS * HZ number of timer interrupts +- per second. +- ++ per second.Laptops may also show improved battery life. + + config HZ_100 + bool "100 HZ" +@@ -23,13 +22,14 @@ + with lots of processors that may show reduced performance if + too many timer interrupts are occurring. + +- config HZ_250 ++ config HZ_250_NODEFAULT + bool "250 HZ" + help +- 250 Hz is a good compromise choice allowing server performance +- while also showing good interactive responsiveness even +- on SMP and NUMA systems. If you are going to be using NTSC video +- or multimedia, selected 300Hz instead. ++ 250 HZ is a lousy compromise choice allowing server interactivity ++ while also showing desktop throughput and no extra power saving on ++ laptops. Good for when you can't make up your mind. ++ ++ Recommend 100 or 1000 instead. + + config HZ_300 + bool "300 HZ" +@@ -45,12 +45,76 @@ + 1000 Hz is the preferred choice for desktop systems and other + systems requiring fast interactive responses to events. + ++ config HZ_1500 ++ bool "1500 HZ" ++ help ++ 1500 Hz is an insane value to use to run broken software that is Hz ++ limited. ++ ++ Being over 1000, driver breakage is likely. ++ ++ config HZ_2000 ++ bool "2000 HZ" ++ help ++ 2000 Hz is an insane value to use to run broken software that is Hz ++ limited. ++ ++ Being over 1000, driver breakage is likely. ++ ++ config HZ_3000 ++ bool "3000 HZ" ++ help ++ 3000 Hz is an insane value to use to run broken software that is Hz ++ limited. ++ ++ Being over 1000, driver breakage is likely. 
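
The Kconfig changes above make 1000 Hz the default and expose much higher rates. As a quick illustration of the trade-off (not part of the patch): the tick period shrinks as HZ rises while the number of timer interrupts grows with NR_CPUS * HZ, exactly as the help text warns; the CPU count below is an arbitrary example.

#include <stdio.h>

/*
 * Illustration of what the HZ choice changes: the timer tick period and
 * the total timer interrupt rate (NR_CPUS * HZ).  4 CPUs is an example.
 */
int main(void)
{
        const int hz_values[] = { 100, 250, 300, 1000 };
        const int ncpus = 4;
        int i;

        for (i = 0; i < 4; i++) {
                int hz = hz_values[i];
                printf("HZ=%4d: tick = %7.3f ms, %6d timer irqs/s on %d cpus\n",
                       hz, 1000.0 / hz, ncpus * hz, ncpus);
        }
        return 0;
}
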
++ ++ config HZ_4000 ++ bool "4000 HZ" ++ help ++ 4000 Hz is an insane value to use to run broken software that is Hz ++ limited. ++ ++ Being over 1000, driver breakage is likely. ++ ++ config HZ_5000 ++ bool "5000 HZ" ++ help ++ 5000 Hz is an obscene value to use to run broken software that is Hz ++ limited. ++ ++ Being over 1000, driver breakage is likely. ++ ++ config HZ_7500 ++ bool "7500 HZ" ++ help ++ 7500 Hz is an obscene value to use to run broken software that is Hz ++ limited. ++ ++ Being over 1000, driver breakage is likely. ++ ++ config HZ_10000 ++ bool "10000 HZ" ++ help ++ 10000 Hz is an obscene value to use to run broken software that is Hz ++ limited. ++ ++ Being over 1000, driver breakage is likely. ++ ++ + endchoice + + config HZ + int + default 100 if HZ_100 +- default 250 if HZ_250 ++ default 250 if HZ_250_NODEFAULT + default 300 if HZ_300 + default 1000 if HZ_1000 ++ default 1500 if HZ_1500 ++ default 2000 if HZ_2000 ++ default 3000 if HZ_3000 ++ default 4000 if HZ_4000 ++ default 5000 if HZ_5000 ++ default 7500 if HZ_7500 ++ default 10000 if HZ_10000 + +Index: linux-2.6.22-ck1/arch/i386/defconfig +=================================================================== +--- linux-2.6.22-ck1.orig/arch/i386/defconfig 2007-07-10 14:54:59.000000000 +1000 ++++ linux-2.6.22-ck1/arch/i386/defconfig 2007-07-10 14:55:23.000000000 +1000 +@@ -226,10 +226,10 @@ + # CONFIG_IRQBALANCE is not set + CONFIG_SECCOMP=y + # CONFIG_HZ_100 is not set +-CONFIG_HZ_250=y ++# CONFIG_HZ_250 is not set + # CONFIG_HZ_300 is not set +-# CONFIG_HZ_1000 is not set +-CONFIG_HZ=250 ++CONFIG_HZ_1000=y ++CONFIG_HZ=1000 + # CONFIG_KEXEC is not set + # CONFIG_CRASH_DUMP is not set + CONFIG_PHYSICAL_START=0x100000 +Index: linux-2.6.22-ck1/arch/x86_64/defconfig +=================================================================== +--- linux-2.6.22-ck1.orig/arch/x86_64/defconfig 2007-07-10 14:54:59.000000000 +1000 ++++ linux-2.6.22-ck1/arch/x86_64/defconfig 2007-07-10 14:55:23.000000000 +1000 +@@ -185,10 +185,10 @@ + CONFIG_SECCOMP=y + # CONFIG_CC_STACKPROTECTOR is not set + # CONFIG_HZ_100 is not set +-CONFIG_HZ_250=y ++# CONFIG_HZ_250 is not set + # CONFIG_HZ_300 is not set +-# CONFIG_HZ_1000 is not set +-CONFIG_HZ=250 ++CONFIG_HZ_1000=y ++CONFIG_HZ=1000 + CONFIG_K8_NB=y + CONFIG_GENERIC_HARDIRQS=y + CONFIG_GENERIC_IRQ_PROBE=y +Index: linux-2.6.22-ck1/include/linux/jiffies.h +=================================================================== +--- linux-2.6.22-ck1.orig/include/linux/jiffies.h 2007-07-10 14:54:59.000000000 +1000 ++++ linux-2.6.22-ck1/include/linux/jiffies.h 2007-07-10 14:55:24.000000000 +1000 +@@ -29,6 +29,12 @@ + # define SHIFT_HZ 9 + #elif HZ >= 768 && HZ < 1536 + # define SHIFT_HZ 10 ++#elif HZ >= 1536 && HZ < 3072 ++# define SHIFT_HZ 11 ++#elif HZ >= 3072 && HZ < 6144 ++# define SHIFT_HZ 12 ++#elif HZ >= 6144 && HZ < 12288 ++# define SHIFT_HZ 13 + #else + # error You lose. + #endif +Index: linux-2.6.22-ck1/include/net/inet_timewait_sock.h +=================================================================== +--- linux-2.6.22-ck1.orig/include/net/inet_timewait_sock.h 2007-07-10 14:54:59.000000000 +1000 ++++ linux-2.6.22-ck1/include/net/inet_timewait_sock.h 2007-07-10 14:55:24.000000000 +1000 +@@ -38,8 +38,8 @@ + * If time > 4sec, it is "slow" path, no recycling is required, + * so that we select tick to get range about 4 seconds. 
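
The jiffies.h hunk above only extends the existing SHIFT_HZ ladder so the new 1500-10000 Hz options still land in a valid bucket; each bucket covers HZ values within roughly ±50% of a power of two. The program below is an illustrative reimplementation of that rounding rule, not kernel code.

#include <stdio.h>

/*
 * Sketch of how the SHIFT_HZ ladder in include/linux/jiffies.h rounds HZ
 * to the nearest power of two: each range [3 << (n-2), 3 << (n-1)) maps
 * to shift n.  The new entries simply extend the ladder to HZ < 12288.
 */
static int shift_hz(int hz)
{
        int shift = 0;

        while ((3 << shift) / 2 <= hz)   /* hz >= 1.5 * 2^shift */
                shift++;
        return shift;
}

int main(void)
{
        int hz_values[] = { 100, 250, 1000, 1500, 3000, 10000 };
        int i;

        for (i = 0; i < 6; i++)
                printf("HZ=%5d -> SHIFT_HZ=%d\n", hz_values[i],
                       shift_hz(hz_values[i]));
        return 0;
}
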
+ */ +-#if HZ <= 16 || HZ > 4096 +-# error Unsupported: HZ <= 16 or HZ > 4096 ++#if HZ <= 16 || HZ > 16384 ++# error Unsupported: HZ <= 16 or HZ > 16384 + #elif HZ <= 32 + # define INET_TWDR_RECYCLE_TICK (5 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) + #elif HZ <= 64 +@@ -54,8 +54,12 @@ + # define INET_TWDR_RECYCLE_TICK (10 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) + #elif HZ <= 2048 + # define INET_TWDR_RECYCLE_TICK (11 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) +-#else ++#elif HZ <= 4096 + # define INET_TWDR_RECYCLE_TICK (12 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) ++#elif HZ <= 8192 ++# define INET_TWDR_RECYCLE_TICK (13 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) ++#else ++# define INET_TWDR_RECYCLE_TICK (14 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG) + #endif + + /* TIME_WAIT reaping mechanism. */ +Index: linux-2.6.22-ck1/init/calibrate.c +=================================================================== +--- linux-2.6.22-ck1.orig/init/calibrate.c 2007-07-10 14:54:59.000000000 +1000 ++++ linux-2.6.22-ck1/init/calibrate.c 2007-07-10 14:55:24.000000000 +1000 +@@ -122,12 +122,12 @@ + printk("Calibrating delay loop (skipped)... " + "%lu.%02lu BogoMIPS preset\n", + loops_per_jiffy/(500000/HZ), +- (loops_per_jiffy/(5000/HZ)) % 100); ++ (loops_per_jiffy * 10/(50000/HZ)) % 100); + } else if ((loops_per_jiffy = calibrate_delay_direct()) != 0) { + printk("Calibrating delay using timer specific routine.. "); + printk("%lu.%02lu BogoMIPS (lpj=%lu)\n", + loops_per_jiffy/(500000/HZ), +- (loops_per_jiffy/(5000/HZ)) % 100, ++ (loops_per_jiffy * 10/(50000/HZ)) % 100, + loops_per_jiffy); + } else { + loops_per_jiffy = (1<<12); +@@ -166,7 +166,7 @@ + /* Round the value and print it */ + printk("%lu.%02lu BogoMIPS (lpj=%lu)\n", + loops_per_jiffy/(500000/HZ), +- (loops_per_jiffy/(5000/HZ)) % 100, ++ (loops_per_jiffy * 10/(50000/HZ)) % 100, + loops_per_jiffy); + } + +Index: linux-2.6.22-ck1/arch/i386/kernel/cpu/proc.c +=================================================================== +--- linux-2.6.22-ck1.orig/arch/i386/kernel/cpu/proc.c 2007-07-10 14:54:59.000000000 +1000 ++++ linux-2.6.22-ck1/arch/i386/kernel/cpu/proc.c 2007-07-10 14:55:24.000000000 +1000 +@@ -157,7 +157,7 @@ + + seq_printf(m, "\nbogomips\t: %lu.%02lu\n", + c->loops_per_jiffy/(500000/HZ), +- (c->loops_per_jiffy/(5000/HZ)) % 100); ++ (c->loops_per_jiffy * 10/(50000/HZ)) % 100); + seq_printf(m, "clflush size\t: %u\n\n", c->x86_clflush_size); + + return 0; +Index: linux-2.6.22-ck1/arch/i386/kernel/smpboot.c +=================================================================== +--- linux-2.6.22-ck1.orig/arch/i386/kernel/smpboot.c 2007-07-10 14:54:59.000000000 +1000 ++++ linux-2.6.22-ck1/arch/i386/kernel/smpboot.c 2007-07-10 14:55:24.000000000 +1000 +@@ -1094,7 +1094,7 @@ + "Total of %d processors activated (%lu.%02lu BogoMIPS).\n", + cpucount+1, + bogosum/(500000/HZ), +- (bogosum/(5000/HZ))%100); ++ (bogosum * 10/(50000/HZ))%100); + + Dprintk("Before bogocount - setting activated=1.\n"); + +Index: linux-2.6.22-ck1/include/linux/nfsd/stats.h +=================================================================== +--- linux-2.6.22-ck1.orig/include/linux/nfsd/stats.h 2007-07-10 14:54:59.000000000 +1000 ++++ linux-2.6.22-ck1/include/linux/nfsd/stats.h 2007-07-10 14:55:24.000000000 +1000 +@@ -35,8 +35,8 @@ + + }; + +-/* thread usage wraps very million seconds (approx one fortnight) */ +-#define NFSD_USAGE_WRAP (HZ*1000000) ++/* thread usage wraps every one hundred thousand seconds (approx one day) */ ++#define NFSD_USAGE_WRAP (HZ*100000) + + #ifdef __KERNEL__ + +Index: 
linux-2.6.22-ck1/arch/x86_64/kernel/setup.c +=================================================================== +--- linux-2.6.22-ck1.orig/arch/x86_64/kernel/setup.c 2007-07-10 14:54:59.000000000 +1000 ++++ linux-2.6.22-ck1/arch/x86_64/kernel/setup.c 2007-07-10 14:55:24.000000000 +1000 +@@ -1047,7 +1047,7 @@ + + seq_printf(m, "\nbogomips\t: %lu.%02lu\n", + c->loops_per_jiffy/(500000/HZ), +- (c->loops_per_jiffy/(5000/HZ)) % 100); ++ (c->loops_per_jiffy * 10/(50000/HZ)) % 100); + + if (c->x86_tlbsize > 0) + seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize); +Index: linux-2.6.22-ck1/Makefile +=================================================================== +--- linux-2.6.22-ck1.orig/Makefile 2007-07-10 14:54:59.000000000 +1000 ++++ linux-2.6.22-ck1/Makefile 2007-07-10 14:55:24.000000000 +1000 +@@ -1,8 +1,9 @@ + VERSION = 2 + PATCHLEVEL = 6 + SUBLEVEL = 22 +-EXTRAVERSION = +-NAME = Holy Dancing Manatees, Batman! ++EXTRAVERSION = -ck1 ++NAME = So long, and thanks for all the fish ++JANAME = さようなら、いままで魚をありがとう + + # *DOCUMENTATION* + # To see a list of typical targets execute "make help"
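
The repeated change to the BogoMIPS printouts in the hunks above (calibrate.c, the cpuinfo files, smpboot.c and setup.c) replaces the fractional divisor 5000/HZ with 50000/HZ and scales loops_per_jiffy by 10. The small illustration below (not part of the patch) shows why: with the new HZ choices the old divisor truncates to 0 above 5000 Hz, which would divide by zero at boot, and it is already badly truncated between 1500 and 5000 Hz.

#include <stdio.h>

/*
 * Compare the old and new fractional divisors used when printing
 * BogoMIPS.  The old 5000/HZ term collapses to 0 for HZ > 5000 and
 * loses most of its precision above 1000 Hz; 50000/HZ (with lpj scaled
 * by 10) keeps a usable divisor for every HZ option in this patch.
 */
int main(void)
{
        int hz_values[] = { 250, 1000, 1500, 3000, 7500, 10000 };
        int i;

        for (i = 0; i < 6; i++) {
                int hz = hz_values[i];
                printf("HZ=%5d: old divisor 5000/HZ = %d, new divisor 50000/HZ = %d\n",
                       hz, 5000 / hz, 50000 / hz);
        }
        return 0;
}
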