From bba0e4670a4e1841a96b561dcc60ebe335049891 Mon Sep 17 00:00:00 2001 From: Nigel Cunningham Date: Wed, 27 Jul 2005 11:43:41 -0700 Subject: [PATCH] Address BUG: using smp_processor_id() in preemptible [00000001] code This patch fixes a warning in the disable_nonboot_cpus call in kernel/power/smp.c. Signed-off-by: Nigel Cunningham Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/smp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/smp.c b/kernel/power/smp.c index bbe23079c62c..911fc62b8225 100644 --- a/kernel/power/smp.c +++ b/kernel/power/smp.c @@ -38,7 +38,7 @@ void disable_nonboot_cpus(void) } printk("Error taking cpu %d down: %d\n", cpu, error); } - BUG_ON(smp_processor_id() != 0); + BUG_ON(raw_smp_processor_id() != 0); if (error) panic("cpus not sleeping"); } -- cgit v1.2.1 From d912d1ff218195c248c770eb677726695e07aa40 Mon Sep 17 00:00:00 2001 From: George Anzinger Date: Wed, 27 Jul 2005 11:43:44 -0700 Subject: [PATCH] itimer fixes Fix the recent off-by-one fix in the itimer code: 1. The repeating timer is figured using the requested time (not +1 as we know where we are in the jiffie). 2. The tests for interval too large are left to the time_val to jiffie code. Signed-off-by: George Anzinger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/itimer.c | 37 ++++++++++++++++--------------------- 1 file changed, 16 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/itimer.c b/kernel/itimer.c index a72cb0e5aa4b..7c1b25e25e47 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -112,28 +112,11 @@ asmlinkage long sys_getitimer(int which, struct itimerval __user *value) return error; } -/* - * Called with P->sighand->siglock held and P->signal->real_timer inactive. - * If interval is nonzero, arm the timer for interval ticks from now. - */ -static inline void it_real_arm(struct task_struct *p, unsigned long interval) -{ - p->signal->it_real_value = interval; /* XXX unnecessary field?? */ - if (interval == 0) - return; - if (interval > (unsigned long) LONG_MAX) - interval = LONG_MAX; - /* the "+ 1" below makes sure that the timer doesn't go off before - * the interval requested. This could happen if - * time requested % (usecs per jiffy) is more than the usecs left - * in the current jiffy */ - p->signal->real_timer.expires = jiffies + interval + 1; - add_timer(&p->signal->real_timer); -} void it_real_fn(unsigned long __data) { struct task_struct * p = (struct task_struct *) __data; + unsigned long inc = p->signal->it_real_incr; send_group_sig_info(SIGALRM, SEND_SIG_PRIV, p); @@ -141,14 +124,23 @@ void it_real_fn(unsigned long __data) * Now restart the timer if necessary. We don't need any locking * here because do_setitimer makes sure we have finished running * before it touches anything. + * Note, we KNOW we are (or should be) at a jiffie edge here so + * we don't need the +1 stuff. Also, we want to use the prior + * expire value so as to not "slip" a jiffie if we are late. + * Deal with requesting a time prior to "now" here rather than + * in add_timer.
*/ - it_real_arm(p, p->signal->it_real_incr); + if (!inc) + return; + while (time_before_eq(p->signal->real_timer.expires, jiffies)) + p->signal->real_timer.expires += inc; + add_timer(&p->signal->real_timer); } int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) { struct task_struct *tsk = current; - unsigned long val, interval; + unsigned long val, interval, expires; cputime_t cval, cinterval, nval, ninterval; switch (which) { @@ -164,7 +156,10 @@ again: } tsk->signal->it_real_incr = timeval_to_jiffies(&value->it_interval); - it_real_arm(tsk, timeval_to_jiffies(&value->it_value)); + expires = timeval_to_jiffies(&value->it_value); + if (expires) + mod_timer(&tsk->signal->real_timer, + jiffies + 1 + expires); spin_unlock_irq(&tsk->sighand->siglock); if (ovalue) { jiffies_to_timeval(val, &ovalue->it_value); -- cgit v1.2.1 From 951f22d5b1f0eaae35dafc669e3774a0c2084d10 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Wed, 27 Jul 2005 11:44:57 -0700 Subject: [PATCH] s390: spin lock retry Split spin lock and r/w lock implementation into a single try which is done inline and an out of line function that repeatedly tries to get the lock before doing the cpu_relax(). Add a system control to set the number of retries before a cpu is yielded. The reason for the spin lock retry is that the diagnose 0x44 that is used to give up the virtual cpu is quite expensive. For spin locks that are held only for a short period of time the costs of the diagnoses outweighs the savings for spin locks that are held for a longer time. The default retry count is 1000. Signed-off-by: Martin Schwidefsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index e60b9c36f1f0..3e0bbee549ea 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -114,6 +114,7 @@ extern int unaligned_enabled; extern int sysctl_ieee_emulation_warnings; #endif extern int sysctl_userprocess_debug; +extern int spin_retry; #endif extern int sysctl_hz_timer; @@ -647,7 +648,16 @@ static ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, - +#if defined(CONFIG_ARCH_S390) + { + .ctl_name = KERN_SPIN_RETRY, + .procname = "spin_retry", + .data = &spin_retry, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif { .ctl_name = 0 } }; -- cgit v1.2.1 From 207a7ba8dc000e1b13acac97f3736810dd86e8e2 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 27 Jul 2005 11:45:10 -0700 Subject: [PATCH] kernel/capability.c: add kerneldoc Add kerneldoc to kernel/capability.c Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/capability.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/capability.c b/kernel/capability.c index 64db1ee820c2..8986a37a67ea 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -31,8 +31,14 @@ static DEFINE_SPINLOCK(task_capability_lock); * uninteresting and/or not to be changed. */ -/* +/** * sys_capget - get the capabilities of a given process. + * @header: pointer to struct that contains capability version and + * target pid data + * @dataptr: pointer to struct that contains the effective, permitted, + * and inheritable capabilities that are returned + * + * Returns 0 on success and < 0 on error.
*/ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr) { @@ -141,8 +147,14 @@ static inline int cap_set_all(kernel_cap_t *effective, return ret; } -/* - * sys_capset - set capabilities for a given process, all processes, or all +/** + * sys_capset - set capabilities for a process or a group of processes + * @header: pointer to struct that contains capability version and + * target pid data + * @data: pointer to struct that contains the effective, permitted, + * and inheritable capabilities + * + * Set capabilities for a given process, all processes, or all * processes in a given process group. * * The restrictions on setting capabilities are specified as: @@ -152,6 +164,8 @@ static inline int cap_set_all(kernel_cap_t *effective, * I: any raised capabilities must be a subset of the (old current) permitted * P: any raised capabilities must be a subset of the (old current) permitted * E: must be set to a subset of (new target) permitted + * + * Returns 0 on success and < 0 on error. */ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) { -- cgit v1.2.1 From d9fd8a6d443b509147280f058d4e59f0b796a323 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 27 Jul 2005 11:45:11 -0700 Subject: [PATCH] kernel/cpuset.c: add kerneldoc, fix typos Add kerneldoc to kernel/cpuset.c Fix cpuset typos in init/Kconfig Signed-off-by: Randy Dunlap Acked-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 984c0bf3807f..805fb9097318 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1440,10 +1440,10 @@ void __init cpuset_init_smp(void) /** * cpuset_fork - attach newly forked task to its parents cpuset. - * @p: pointer to task_struct of forking parent process. + * @tsk: pointer to task_struct of forking parent process. * * Description: By default, on fork, a task inherits its - * parents cpuset. The pointer to the shared cpuset is + * parent's cpuset. The pointer to the shared cpuset is * automatically copied in fork.c by dup_task_struct(). * This cpuset_fork() routine need only increment the usage * counter in that cpuset. @@ -1471,7 +1471,6 @@ void cpuset_fork(struct task_struct *tsk) * by the cpuset_sem semaphore. If you don't hold cpuset_sem, * then a zero cpuset use count is a license to any other task to * nuke the cpuset immediately. - * **/ void cpuset_exit(struct task_struct *tsk) @@ -1521,7 +1520,9 @@ void cpuset_init_current_mems_allowed(void) current->mems_allowed = NODE_MASK_ALL; } -/* +/** + * cpuset_update_current_mems_allowed - update mems parameters to new values + * * If the current tasks cpusets mems_allowed changed behind our backs, * update current->mems_allowed and mems_generation to the new value. * Do not call this routine if in_interrupt(). @@ -1540,13 +1541,20 @@ void cpuset_update_current_mems_allowed(void) } } +/** + * cpuset_restrict_to_mems_allowed - limit nodes to current mems_allowed + * @nodes: pointer to a node bitmap that is and-ed with mems_allowed + */ void cpuset_restrict_to_mems_allowed(unsigned long *nodes) { bitmap_and(nodes, nodes, nodes_addr(current->mems_allowed), MAX_NUMNODES); } -/* +/** + * cpuset_zonelist_valid_mems_allowed - check zonelist vs. curremt mems_allowed + * @zl: the zonelist to be checked + * * Are any of the nodes on zonelist zl allowed in current->mems_allowed? 
*/ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl) @@ -1562,8 +1570,12 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl) return 0; } -/* - * Is 'current' valid, and is zone z allowed in current->mems_allowed? +/** + * cpuset_zone_allowed - is zone z allowed in current->mems_allowed + * @z: zone in question + * + * Is zone z allowed in current->mems_allowed, or is + * the CPU in interrupt context? (zone is always allowed in this case) */ int cpuset_zone_allowed(struct zone *z) { -- cgit v1.2.1 From e77e17161ccb8bd877bf83b3611cd318e451c605 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 27 Jul 2005 11:45:11 -0700 Subject: [PATCH] kernel/crash_dump.c: add kerneldoc Add kerneldoc to kernel/crash_dump.c Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/crash_dump.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c index 459ba49e376a..334c37f5218a 100644 --- a/kernel/crash_dump.c +++ b/kernel/crash_dump.c @@ -18,7 +18,16 @@ /* Stores the physical address of elf header of crash image. */ unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; -/* +/** + * copy_oldmem_page - copy one page from "oldmem" + * @pfn: page frame number to be copied + * @buf: target memory address for the copy; this can be in kernel address + * space or user address space (see @userbuf) + * @csize: number of bytes to copy + * @offset: offset in bytes into the page (based on pfn) to begin the copy + * @userbuf: if set, @buf is in user address space, use copy_to_user(), + * otherwise @buf is in kernel address space, use memcpy(). + * * Copy a page from "oldmem". For this page, there is no pte mapped * in the current kernel. We stitch up a pte, similar to kmap_atomic. */ -- cgit v1.2.1 From 77933d7276ee8fa0e2947641941a6f7a100a327b Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Wed, 27 Jul 2005 11:46:09 -0700 Subject: [PATCH] clean up inline static vs static inline `gcc -W' likes to complain if the static keyword is not at the beginning of the declaration. This patch fixes all remaining occurrences of "inline static" up with "static inline" in the entire kernel tree (140 occurrences in 47 files). While making this change I came across a few lines with trailing whitespace that I also fixed up, I have also added or removed a blank line or two here and there, but there are no functional changes in the patch. Signed-off-by: Jesper Juhl Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time.c b/kernel/time.c index d4335c1c884c..dd5ae1162a8f 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -128,7 +128,7 @@ asmlinkage long sys_gettimeofday(struct timeval __user *tv, struct timezone __us * as real UNIX machines always do it. This avoids all headaches about * daylight saving times and warping kernel clocks. 
*/ -inline static void warp_clock(void) +static inline void warp_clock(void) { write_seqlock_irq(&xtime_lock); wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60; -- cgit v1.2.1 From e4ff4d7f9d85a2bc714307eb9113617182e62845 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 27 Jul 2005 10:41:23 -0700 Subject: [PATCH] Avoid device suspend on reboot My fairly ordinary x86 test box gets stuck during reboot on the wait_for_completion() in ide_do_drive_cmd(): Signed-off-by: Linus Torvalds --- kernel/sys.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index a74039036fb4..8f255259ef9e 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -371,7 +371,6 @@ void kernel_restart(char *cmd) { notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); system_state = SYSTEM_RESTART; - device_suspend(PMSG_FREEZE); device_shutdown(); if (!cmd) { printk(KERN_EMERG "Restarting system.\n"); -- cgit v1.2.1 From ed6b676ca8b50e0b538e61c283d52fd04f007abf Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 28 Jul 2005 21:15:49 -0700 Subject: [PATCH] x86_64: Switch to the interrupt stack when running a softirq in local_bh_enable() This avoids some potential stack overflows with very deep softirq callchains. i386 does this too. TOADD CFI annotation Signed-off-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/softirq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index b4ab6af1dea8..31007d6542cc 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -86,7 +86,7 @@ restart: /* Reset the pending bitmask before enabling irqs */ local_softirq_pending() = 0; - local_irq_enable(); + //local_irq_enable(); h = softirq_vec; @@ -99,7 +99,7 @@ restart: pending >>= 1; } while (pending); - local_irq_disable(); + //local_irq_disable(); pending = local_softirq_pending(); if (pending && --max_restart) -- cgit v1.2.1 From 78fa74a23b16bdb0d944272b696915c4e0bb3ee1 Mon Sep 17 00:00:00 2001 From: George Anzinger Date: Thu, 28 Jul 2005 21:16:16 -0700 Subject: [PATCH] posix timers: fix normalization problem (We found this (after a customer complained) and it is in the kernel.org kernel. Seems that for CLOCK_MONOTONIC absolute timers and clock_nanosleep calls both the request time and wall_to_monotonic are subtracted prior to the normalize resulting in an overflow in the existing normalize test. This causes the result to be shifted ~4 seconds ahead instead of ~2 seconds back in time.) The normalize code in posix-timers.c fails when the tv_nsec member is ~1.2 seconds negative. This can happen on absolute timers (and clock_nanosleeps) requested on CLOCK_MONOTONIC (both the request time and wall_to_monotonic are subtracted resulting in the possibility of a number close to -2 seconds.) This fix uses the set_normalized_timespec() (which does not have an overflow problem) to fix the problem and as a side effect makes the code cleaner. 
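As a side illustration of the overflow described above (not part of the patch), here is a minimal userspace sketch assuming a 32-bit long for tv_nsec, as on i386; the helper and variable names are stand-ins for the kernel's set_normalized_timespec(), not its actual definitions:

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC 1000000000L

/* Stand-in for the kernel helper: compare first, then adjust. */
static void normalize(int64_t *sec, int64_t *nsec)
{
	while (*nsec >= NSEC_PER_SEC) {
		*nsec -= NSEC_PER_SEC;
		(*sec)++;
	}
	while (*nsec < 0) {
		*nsec += NSEC_PER_SEC;
		(*sec)--;
	}
}

int main(void)
{
	/*
	 * After subtracting both the request time and wall_to_monotonic,
	 * tv_nsec can be more than a second negative.
	 */
	int32_t nsec32 = -1200000000;	/* ~1.2 s negative */

	/*
	 * The removed open-coded test was "(tv_nsec - NSEC_PER_SEC) >= 0".
	 * With a 32-bit long that subtraction wraps far below INT32_MIN,
	 * the wrapped value is positive, and the "too large" branch fires,
	 * pushing the result seconds forward instead of backward.
	 */
	int32_t wrapped = (int32_t)(uint32_t)((int64_t)nsec32 - NSEC_PER_SEC);
	printf("wrapped compare value: %d (taken as >= 0: %s)\n",
	       wrapped, wrapped >= 0 ? "yes, wrong" : "no");

	/* Compare-first normalization has no such overflow: 5 s - 1.2 s */
	int64_t sec = 5, nsec = nsec32;
	normalize(&sec, &nsec);
	printf("normalized: %lld s %lld ns\n", (long long)sec, (long long)nsec);
	return 0;
}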
Signed-off-by: George Anzinger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/posix-timers.c | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 5b7b4736d82b..10b2ad749d14 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -896,21 +896,10 @@ static int adjust_abs_time(struct k_clock *clock, struct timespec *tp, jiffies_64_f = get_jiffies_64(); } /* - * Take away now to get delta + * Take away now to get delta and normalize */ - oc.tv_sec -= now.tv_sec; - oc.tv_nsec -= now.tv_nsec; - /* - * Normalize... - */ - while ((oc.tv_nsec - NSEC_PER_SEC) >= 0) { - oc.tv_nsec -= NSEC_PER_SEC; - oc.tv_sec++; - } - while ((oc.tv_nsec) < 0) { - oc.tv_nsec += NSEC_PER_SEC; - oc.tv_sec--; - } + set_normalized_timespec(&oc, oc.tv_sec - now.tv_sec, + oc.tv_nsec - now.tv_nsec); }else{ jiffies_64_f = get_jiffies_64(); } -- cgit v1.2.1 From 1108bae41e2ac596f46bc4cd8876b93063203d2b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 29 Jul 2005 12:50:57 -0600 Subject: [PATCH] reboot: remove device_suspend(PMSG_FREEZE) from kernel_kexec If device_suspend(PMSG_FREEZE) is not ready to be called in kernel_restart it is definitely not ready to be called in the even more fickle kernel_kexec. Signed-off-by: Eric W. Biederman Signed-off-by: Linus Torvalds --- kernel/sys.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 8f255259ef9e..000e81ad2c1d 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -392,7 +392,6 @@ void kernel_kexec(void) } notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL); system_state = SYSTEM_RESTART; - device_suspend(PMSG_FREEZE); device_shutdown(); printk(KERN_EMERG "Starting new kernel\n"); machine_shutdown(); -- cgit v1.2.1 From c70f5d6610c601ea2ae4ae4e49f66c80801e895f Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 30 Jul 2005 10:22:49 -0700 Subject: [PATCH] revert bogus softirq changes This snuck in with an x86_64 change. Thanks to Richard Purdie for spotting it. Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/softirq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 31007d6542cc..b4ab6af1dea8 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -86,7 +86,7 @@ restart: /* Reset the pending bitmask before enabling irqs */ local_softirq_pending() = 0; - //local_irq_enable(); + local_irq_enable(); h = softirq_vec; @@ -99,7 +99,7 @@ restart: pending >>= 1; } while (pending); - //local_irq_disable(); + local_irq_disable(); pending = local_softirq_pending(); if (pending && --max_restart) -- cgit v1.2.1 From 6cb54819d7b1867053e2dfd8c0ca3a8dc65a7eff Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 1 Aug 2005 13:39:13 +0200 Subject: [PATCH] remove sys_set_zone_reclaim() This removes sys_set_zone_reclaim() for now. While i'm sure Martin is trying to solve a real problem, we must not hard-code an incomplete and insufficient approach into a syscall, because syscalls are pretty much for eternity. I am quite strongly convinced that this syscall must not hit v2.6.13 in its current form. Firstly, the syscall lacks basic syscall design: e.g. it allows the global setting of VM policy for unprivileged users. (!) [ Imagine an Oracle installation and a SAP installation on the same NUMA box fighting over the 'optimal' setting for this flag. What will they do? 
Will they try to set the flag to their own preferred value every second or so? ] Secondly, it was added based on a single datapoint from Martin: http://marc.theaimsgroup.com/?l=linux-mm&m=111763597218177&w=2 where Martin characterizes the numbers the following way: ' Run-to-run variability for "make -j" is huge, so these numbers aren't terribly useful except to see that with reclaim the benchmark still finishes in a reasonable amount of time. ' in other words: the fundamental problem has likely not been solved, only a tendential move into the right direction has been observed, and a handful of numbers were picked out of a set of hugely variable results, without showing the variability data. How much variance is there run-to-run? I'd really suggest to first walk the walk and see what's needed to get stable & predictable kernel compilation numbers on that NUMA box, before adding random syscalls to tune a particular aspect of the VM ... which approach might not even matter once the whole picture has been analyzed and understood! The third, most important point is that the syscall exposes VM tuning internals in a completely unstructured way. What sense does it make to have a _GLOBAL_ per-node setting for 'should we go to another node for reclaim'? If then it might make sense to do this per-app, via numalib or so. The change is minimalistic in that it doesn't remove the syscall and the underlying infrastructure changes, only the user-visible changes. We could perhaps add a CAP_SYS_ADMIN-only sysctl for this hack, a.k.a. /proc/sys/vm/swappiness, but even that looks quite counterproductive when the generic approach is that we are trying to reduce the number of external factors in the VM balance picture. Signed-off-by: Ingo Molnar Signed-off-by: Linus Torvalds --- kernel/sys_ni.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 42b40ae5eada..1ab2370e2efa 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -79,7 +79,6 @@ cond_syscall(sys_request_key); cond_syscall(sys_keyctl); cond_syscall(compat_sys_keyctl); cond_syscall(compat_sys_socketcall); -cond_syscall(sys_set_zone_reclaim); cond_syscall(sys_inotify_init); cond_syscall(sys_inotify_add_watch); cond_syscall(sys_inotify_rm_watch); -- cgit v1.2.1 From 842bbaaa7394820c8f1fe0629cd15478653caf86 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 1 Aug 2005 21:11:47 -0700 Subject: [PATCH] Module per-cpu alignment cannot always be met The module code assumes no one will ever ask for a per-cpu area more than SMP_CACHE_BYTES aligned. However, as these cases show, gcc sometimes asks for 32-byte alignment for the per-cpu section on a module, and if CONFIG_X86_L1_CACHE_SHIFT is 4, we hit that BUG_ON(). This is obviously an unusual combination, as there have been few reports, but better to warn than die.
See: http://www.ussg.iu.edu/hypermail/linux/kernel/0409.0/0768.html And more recently: http://bugs.gentoo.org/show_bug.cgi?id=97006 Signed-off-by: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/module.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 068e271ab3a5..c32995fbd8fd 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -250,13 +250,18 @@ static inline unsigned int block_size(int val) /* Created by linker magic */ extern char __per_cpu_start[], __per_cpu_end[]; -static void *percpu_modalloc(unsigned long size, unsigned long align) +static void *percpu_modalloc(unsigned long size, unsigned long align, + const char *name) { unsigned long extra; unsigned int i; void *ptr; - BUG_ON(align > SMP_CACHE_BYTES); + if (align > SMP_CACHE_BYTES) { + printk(KERN_WARNING "%s: per-cpu alignment %li > %i\n", + name, align, SMP_CACHE_BYTES); + align = SMP_CACHE_BYTES; + } ptr = __per_cpu_start; for (i = 0; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) { @@ -348,7 +353,8 @@ static int percpu_modinit(void) } __initcall(percpu_modinit); #else /* ... !CONFIG_SMP */ -static inline void *percpu_modalloc(unsigned long size, unsigned long align) +static inline void *percpu_modalloc(unsigned long size, unsigned long align, + const char *name) { return NULL; } @@ -1644,7 +1650,8 @@ static struct module *load_module(void __user *umod, if (pcpuindex) { /* We have a special allocation for this section. */ percpu = percpu_modalloc(sechdrs[pcpuindex].sh_size, - sechdrs[pcpuindex].sh_addralign); + sechdrs[pcpuindex].sh_addralign, + mod->name); if (!percpu) { err = -ENOMEM; goto free_mod; -- cgit v1.2.1 From c36f19e02a96488f550fdb678c92500afca3109b Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 4 Aug 2005 11:36:26 +0200 Subject: [PATCH] Remove suspend() calls from shutdown path This removes the calls to device_suspend() from the shutdown path that were added sometime during 2.6.13-rc*. They aren't working properly on a number of configs (I got reports from both ppc powerbook users and x86 users) causing the system to not shutdown anymore. I think it isn't the right approach at the moment anyway. We have already a shutdown() callback for the drivers that actually care about shutdown and the suspend() code isn't yet in a good enough shape to be so much generalized. Also, the semantics of suspend and shutdown are slightly different on a number of setups and the way this was patched in provides little way for drivers to cleanly differentiate. It should have been at least a different message. For 2.6.13, I think we should revert to 2.6.12 behaviour and have a working suspend back.
Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Linus Torvalds --- kernel/sys.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 000e81ad2c1d..0bcaed6560ac 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -404,7 +404,6 @@ void kernel_halt(void) { notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL); system_state = SYSTEM_HALT; - device_suspend(PMSG_SUSPEND); device_shutdown(); printk(KERN_EMERG "System halted.\n"); machine_halt(); @@ -415,7 +414,6 @@ void kernel_power_off(void) { notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL); system_state = SYSTEM_POWER_OFF; - device_suspend(PMSG_SUSPEND); device_shutdown(); printk(KERN_EMERG "Power down.\n"); machine_power_off(); -- cgit v1.2.1 From c306895167c8384b88bc02945a0d226a04218fa5 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 4 Aug 2005 16:49:32 -0700 Subject: [PATCH] revert "timer exit cleanup" Revert this June 17 patch: it broke persistence of timers across execve(). Cc: Roland McGrath Cc: george anzinger Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 4 +++- kernel/posix-timers.c | 1 - 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 9d1b10ed0135..5b0fb9f09f21 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -829,8 +829,10 @@ fastcall NORET_TYPE void do_exit(long code) acct_update_integrals(tsk); update_mem_hiwater(tsk); group_dead = atomic_dec_and_test(&tsk->signal->live); - if (group_dead) + if (group_dead) { + del_timer_sync(&tsk->signal->real_timer); acct_process(code); + } exit_mm(tsk); exit_sem(tsk); diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 10b2ad749d14..38798a2ff994 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -1166,7 +1166,6 @@ void exit_itimers(struct signal_struct *sig) tmr = list_entry(sig->posix_timers.next, struct k_itimer, list); itimer_delete(tmr); } - del_timer_sync(&sig->real_timer); } /* -- cgit v1.2.1
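For background on the "persistence of timers across execve()" that the last revert restores, a minimal userspace sketch (not part of any patch above; the /bin/sleep path and the timings are only illustrative): ITIMER_REAL armed before an exec is expected to keep running in the new process image, so the exec'd sleep below should be terminated by SIGALRM after roughly two seconds rather than running for thirty.

#include <stdio.h>
#include <sys/time.h>
#include <unistd.h>

int main(void)
{
	struct itimerval it = {
		.it_interval = { 0, 0 },	/* one-shot */
		.it_value    = { 2, 0 },	/* fire in 2 seconds */
	};

	if (setitimer(ITIMER_REAL, &it, NULL) < 0) {
		perror("setitimer");
		return 1;
	}

	/*
	 * The interval timer is preserved across execve(); SIGALRM's
	 * disposition resets to the default (terminate), so the exec'd
	 * program should be killed when the timer expires.
	 */
	execl("/bin/sleep", "sleep", "30", (char *)NULL);
	perror("execl");
	return 1;
}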