From 4ce072f1faf29d24df4600f53db8cdd62d400a8f Mon Sep 17 00:00:00 2001
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Date: Fri, 29 Sep 2006 01:58:42 -0700
Subject: [PATCH] mm: fix a race condition under SMC + COW

Failing context is a multi threaded process context and the failing
sequence is as follows.

One thread T0 doing self modifying code on page X on processor P0 and
another thread T1 doing COW (breaking the COW setup as part of just
happened fork() in another thread T2) on the same page X on processor P1.
T0 doing SMC can endup modifying the new page Y (allocated by the T1 doing
COW on P1) but because of different I/D TLB's, P0 ITLB will not see the new
mapping till the flush TLB IPI from P1 is received.  During this interval,
if T0 executes the code created by SMC it can result in an app error (as
ITLB still points to old page X and endup executing the content in page X
rather than using the content in page Y).

Fix this issue by first clearing the PTE and flushing it, before updating
it with new entry.

Hugh sayeth:

  I was a bit sceptical, in the habit of thinking that Self Modifying Code
  must look such issues itself: but I guess there's nothing it can do to avoid
  this one.

  Fair enough, what you're changing it to is pretty much what powerpc and
  s390 were already doing, and is a more robust way of proceeding, consistent
  with how ptes are set everywhere else.

  The ptep_clear_flush is a bit heavy-handed (it's anxious to return the pte
  that was atomically cleared), but we'd have to wander through lots of arches
  to get the right minimal behaviour.  It'd also be nice to eliminate
  ptep_establish completely, now only used to define other macros/inlines: it
  always seemed obfuscation to me, what you've got there now is clearer.
  Let's put those cleanups on a TODO list.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Acked-by: "David S. Miller" <davem@davemloft.net>
Acked-by: Hugh Dickins <hugh@veritas.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/memory.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/memory.c b/mm/memory.c
index 601159a46ab6..160f5b503ead 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1577,7 +1577,14 @@ gotten:
 		entry = mk_pte(new_page, vma->vm_page_prot);
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 		lazy_mmu_prot_update(entry);
-		ptep_establish(vma, address, page_table, entry);
+		/*
+		 * Clear the pte entry and flush it first, before updating the
+		 * pte with the new entry. This will avoid a race condition
+		 * seen in the presence of one thread doing SMC and another
+		 * thread doing COW.
+		 */
+		ptep_clear_flush(vma, address, page_table);
+		set_pte_at(mm, address, page_table, entry);
 		update_mmu_cache(vma, address, entry);
 		lru_cache_add_active(new_page);
 		page_add_new_anon_rmap(new_page, vma, address);
-- 
cgit v1.2.3


From 79f5acf5d784492afe80723496624093079aed9c Mon Sep 17 00:00:00 2001
From: Adam Litke <agl@us.ibm.com>
Date: Fri, 29 Sep 2006 01:58:43 -0700
Subject: [PATCH] mm: make filemap_nopage use NOPAGE_SIGBUS

Don't open-code NOPAGE_SIGBUS.

Signed-off-by: Adam Litke <agl@us.ibm.com>
Acked-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/filemap.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'mm')

diff --git a/mm/filemap.c b/mm/filemap.c
index afcdc72b5e90..3277f3b23524 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1471,7 +1471,7 @@ outside_data_content:
 	 * accessible..
 	 */
 	if (area->vm_mm == current->mm)
-		return NULL;
+		return NOPAGE_SIGBUS;
 	/* Fall through to the non-read-ahead case */
 no_cached_page:
 	/*
@@ -1496,7 +1496,7 @@ no_cached_page:
 	 */
 	if (error == -ENOMEM)
 		return NOPAGE_OOM;
-	return NULL;
+	return NOPAGE_SIGBUS;
 
 page_not_uptodate:
 	if (!did_readaround) {
@@ -1565,7 +1565,7 @@ page_not_uptodate:
 	 */
 	shrink_readahead_size_eio(file, ra);
 	page_cache_release(page);
-	return NULL;
+	return NOPAGE_SIGBUS;
 }
 EXPORT_SYMBOL(filemap_nopage);
 
-- 
cgit v1.2.3


From aa83aa40ed2ae113d9ee5529cdd9e8c0e5fabe61 Mon Sep 17 00:00:00 2001
From: Dave Jones <davej@redhat.com>
Date: Fri, 29 Sep 2006 01:59:51 -0700
Subject: [PATCH] single bit flip detector

In cases where we detect a single bit has been flipped, we spew the usual
slab corruption message, which users instantly think is a kernel bug.  In a
lot of cases, single bit errors are down to bad memory, or other hardware
failure.

This patch adds an extra line to the slab debug messages in those cases, in
the hope that users will try memtest before they report a bug.

000: 6b 6b 6b 6b 6a 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b
Single bit error detected. Possibly bad RAM. Run memtest86.

[akpm@osdl.org: cleanups]
Signed-off-by: Dave Jones <davej@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/slab.c | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index 792bfe320a8b..3dbd6f4e7477 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1683,10 +1683,32 @@ static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val)
 static void dump_line(char *data, int offset, int limit)
 {
 	int i;
+	unsigned char error = 0;
+	int bad_count = 0;
+
 	printk(KERN_ERR "%03x:", offset);
-	for (i = 0; i < limit; i++)
+	for (i = 0; i < limit; i++) {
+		if (data[offset + i] != POISON_FREE) {
+			error = data[offset + i];
+			bad_count++;
+		}
 		printk(" %02x", (unsigned char)data[offset + i]);
+	}
 	printk("\n");
+
+	if (bad_count == 1) {
+		error ^= POISON_FREE;
+		if (!(error & (error - 1))) {
+			printk(KERN_ERR "Single bit error detected. Probably "
+					"bad RAM.\n");
+#ifdef CONFIG_X86
+			printk(KERN_ERR "Run memtest86+ or a similar memory "
+					"test tool.\n");
+#else
+			printk(KERN_ERR "Run a memory test tool.\n");
+#endif
+		}
+	}
 }
 #endif
 
-- 
cgit v1.2.3


From f400e198b2ed26ce55b22a1412ded0896e7516ac Mon Sep 17 00:00:00 2001
From: Sukadev Bhattiprolu <sukadev@us.ibm.com>
Date: Fri, 29 Sep 2006 02:00:07 -0700
Subject: [PATCH] pidspace: is_init()

This is an updated version of Eric Biederman's is_init() patch.
(http://lkml.org/lkml/2006/2/6/280).  It applies cleanly to 2.6.18-rc3 and
replaces a few more instances of ->pid == 1 with is_init().

Further, is_init() checks pid and thus removes dependency on Eric's other
patches for now.

Eric's original description:

	There are a lot of places in the kernel where we test for init
	because we give it special properties.  Most  significantly init
	must not die.  This results in code all over the kernel test
	->pid == 1.

	Introduce is_init to capture this case.

	With multiple pid spaces for all of the cases affected we are
	looking for only the first process on the system, not some other
	process that has pid == 1.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Sukadev Bhattiprolu <sukadev@us.ibm.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Serge Hallyn <serue@us.ibm.com>
Cc: Cedric Le Goater <clg@fr.ibm.com>
Cc: <lxc-devel@lists.sourceforge.net>
Acked-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/alpha/mm/fault.c                |  2 +-
 arch/arm/mm/fault.c                  |  2 +-
 arch/arm26/mm/fault.c                |  2 +-
 arch/i386/lib/usercopy.c             |  2 +-
 arch/i386/mm/fault.c                 |  2 +-
 arch/ia64/mm/fault.c                 |  2 +-
 arch/m32r/mm/fault.c                 |  2 +-
 arch/m68k/mm/fault.c                 |  2 +-
 arch/mips/mm/fault.c                 |  2 +-
 arch/powerpc/mm/fault.c              |  2 +-
 arch/powerpc/platforms/pseries/ras.c |  2 +-
 arch/ppc/kernel/traps.c              |  2 +-
 arch/ppc/mm/fault.c                  |  2 +-
 arch/s390/mm/fault.c                 |  2 +-
 arch/sh/mm/fault.c                   |  2 +-
 arch/sh64/mm/fault.c                 |  6 +++---
 arch/um/kernel/trap.c                |  2 +-
 arch/x86_64/mm/fault.c               |  4 ++--
 arch/xtensa/mm/fault.c               |  2 +-
 drivers/char/sysrq.c                 |  2 +-
 include/linux/sched.h                | 10 ++++++++++
 kernel/capability.c                  |  2 +-
 kernel/cpuset.c                      |  2 +-
 kernel/exit.c                        |  2 +-
 kernel/kexec.c                       |  2 +-
 kernel/ptrace.c                      |  1 +
 kernel/sysctl.c                      |  2 +-
 mm/oom_kill.c                        |  2 +-
 security/commoncap.c                 |  2 +-
 29 files changed, 41 insertions(+), 30 deletions(-)

(limited to 'mm')

diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c
index 622dabd84680..8871529a34e2 100644
--- a/arch/alpha/mm/fault.c
+++ b/arch/alpha/mm/fault.c
@@ -193,7 +193,7 @@ do_page_fault(unsigned long address, unsigned long mmcsr,
 	/* We ran out of memory, or some other thing happened to us that
 	   made us unable to handle the page fault gracefully.  */
  out_of_memory:
-	if (current->pid == 1) {
+	if (is_init(current)) {
 		yield();
 		down_read(&mm->mmap_sem);
 		goto survive;
diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
index a5b33ff3924e..5e658a874498 100644
--- a/arch/arm/mm/fault.c
+++ b/arch/arm/mm/fault.c
@@ -198,7 +198,7 @@ survive:
 		return fault;
 	}
 
-	if (tsk->pid != 1)
+	if (!is_init(tsk))
 		goto out;
 
 	/*
diff --git a/arch/arm26/mm/fault.c b/arch/arm26/mm/fault.c
index a7c4cc922095..a1f6d8a9cc32 100644
--- a/arch/arm26/mm/fault.c
+++ b/arch/arm26/mm/fault.c
@@ -185,7 +185,7 @@ survive:
 	}
 
 	fault = -3; /* out of memory */
-	if (tsk->pid != 1)
+	if (!is_init(tsk))
 		goto out;
 
 	/*
diff --git a/arch/i386/lib/usercopy.c b/arch/i386/lib/usercopy.c
index efc7e7d5f4d0..08502fc6d0cb 100644
--- a/arch/i386/lib/usercopy.c
+++ b/arch/i386/lib/usercopy.c
@@ -739,7 +739,7 @@ survive:
 			retval = get_user_pages(current, current->mm,
 					(unsigned long )to, 1, 1, 0, &pg, NULL);
 
-			if (retval == -ENOMEM && current->pid == 1) {
+			if (retval == -ENOMEM && is_init(current)) {
 				up_read(&current->mm->mmap_sem);
 				blk_congestion_wait(WRITE, HZ/50);
 				goto survive;
diff --git a/arch/i386/mm/fault.c b/arch/i386/mm/fault.c
index 50d8617391dd..2581575786c1 100644
--- a/arch/i386/mm/fault.c
+++ b/arch/i386/mm/fault.c
@@ -589,7 +589,7 @@ no_context:
  */
 out_of_memory:
 	up_read(&mm->mmap_sem);
-	if (tsk->pid == 1) {
+	if (is_init(tsk)) {
 		yield();
 		down_read(&mm->mmap_sem);
 		goto survive;
diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c
index d8b1b4ac7f26..59f3ab937615 100644
--- a/arch/ia64/mm/fault.c
+++ b/arch/ia64/mm/fault.c
@@ -280,7 +280,7 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re
 
   out_of_memory:
 	up_read(&mm->mmap_sem);
-	if (current->pid == 1) {
+	if (is_init(current)) {
 		yield();
 		down_read(&mm->mmap_sem);
 		goto survive;
diff --git a/arch/m32r/mm/fault.c b/arch/m32r/mm/fault.c
index dc18a33eefef..8d5f551b5754 100644
--- a/arch/m32r/mm/fault.c
+++ b/arch/m32r/mm/fault.c
@@ -299,7 +299,7 @@ no_context:
  */
 out_of_memory:
 	up_read(&mm->mmap_sem);
-	if (tsk->pid == 1) {
+	if (is_init(tsk)) {
 		yield();
 		down_read(&mm->mmap_sem);
 		goto survive;
diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c
index 5e2d87c10c87..911f2ce3f53e 100644
--- a/arch/m68k/mm/fault.c
+++ b/arch/m68k/mm/fault.c
@@ -181,7 +181,7 @@ good_area:
  */
 out_of_memory:
 	up_read(&mm->mmap_sem);
-	if (current->pid == 1) {
+	if (is_init(current)) {
 		yield();
 		down_read(&mm->mmap_sem);
 		goto survive;
diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c
index a4f8c45c4e8e..8423d8590779 100644
--- a/arch/mips/mm/fault.c
+++ b/arch/mips/mm/fault.c
@@ -171,7 +171,7 @@ no_context:
  */
 out_of_memory:
 	up_read(&mm->mmap_sem);
-	if (tsk->pid == 1) {
+	if (is_init(tsk)) {
 		yield();
 		down_read(&mm->mmap_sem);
 		goto survive;
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 77953f41d754..e8fa50624b70 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -386,7 +386,7 @@ bad_area_nosemaphore:
  */
 out_of_memory:
 	up_read(&mm->mmap_sem);
-	if (current->pid == 1) {
+	if (is_init(current)) {
 		yield();
 		down_read(&mm->mmap_sem);
 		goto survive;
diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c
index 903115d67fdc..311ed1993fc0 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -337,7 +337,7 @@ static int recover_mce(struct pt_regs *regs, struct rtas_error_log * err)
 		   err->disposition == RTAS_DISP_NOT_RECOVERED &&
 		   err->target == RTAS_TARGET_MEMORY &&
 		   err->type == RTAS_TYPE_ECC_UNCORR &&
-		   !(current->pid == 0 || current->pid == 1)) {
+		   !(current->pid == 0 || is_init(current))) {
 		/* Kill off a user process with an ECC error */
 		printk(KERN_ERR "MCE: uncorrectable ecc error for pid %d\n",
 		       current->pid);
diff --git a/arch/ppc/kernel/traps.c b/arch/ppc/kernel/traps.c
index d7a433049b48..aafc8e8893d1 100644
--- a/arch/ppc/kernel/traps.c
+++ b/arch/ppc/kernel/traps.c
@@ -119,7 +119,7 @@ void _exception(int signr, struct pt_regs *regs, int code, unsigned long addr)
 	 * generate the same exception over and over again and we get
 	 * nowhere.  Better to kill it and let the kernel panic.
 	 */
-	if (current->pid == 1) {
+	if (is_init(current)) {
 		__sighandler_t handler;
 
 		spin_lock_irq(&current->sighand->siglock);
diff --git a/arch/ppc/mm/fault.c b/arch/ppc/mm/fault.c
index bc776beb3136..465f451f3bc3 100644
--- a/arch/ppc/mm/fault.c
+++ b/arch/ppc/mm/fault.c
@@ -291,7 +291,7 @@ bad_area:
  */
 out_of_memory:
 	up_read(&mm->mmap_sem);
-	if (current->pid == 1) {
+	if (is_init(current)) {
 		yield();
 		down_read(&mm->mmap_sem);
 		goto survive;
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index f2b9a84dc2bf..9c3c19fe62fc 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -353,7 +353,7 @@ no_context:
 */
 out_of_memory:
 	up_read(&mm->mmap_sem);
-	if (tsk->pid == 1) {
+	if (is_init(tsk)) {
 		yield();
 		down_read(&mm->mmap_sem);
 		goto survive;
diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c
index 507f28914706..68663b8f99ae 100644
--- a/arch/sh/mm/fault.c
+++ b/arch/sh/mm/fault.c
@@ -149,7 +149,7 @@ no_context:
  */
 out_of_memory:
 	up_read(&mm->mmap_sem);
-	if (current->pid == 1) {
+	if (is_init(current)) {
 		yield();
 		down_read(&mm->mmap_sem);
 		goto survive;
diff --git a/arch/sh64/mm/fault.c b/arch/sh64/mm/fault.c
index f08d0eaf6497..8e2f6c28b739 100644
--- a/arch/sh64/mm/fault.c
+++ b/arch/sh64/mm/fault.c
@@ -277,7 +277,7 @@ bad_area:
 			show_regs(regs);
 #endif
 		}
-		if (tsk->pid == 1) {
+		if (is_init(tsk)) {
 			panic("INIT had user mode bad_area\n");
 		}
 		tsk->thread.address = address;
@@ -319,14 +319,14 @@ no_context:
  * us unable to handle the page fault gracefully.
  */
 out_of_memory:
-	if (current->pid == 1) {
+	if (is_init(current)) {
 		panic("INIT out of memory\n");
 		yield();
 		goto survive;
 	}
 	printk("fault:Out of memory\n");
 	up_read(&mm->mmap_sem);
-	if (current->pid == 1) {
+	if (is_init(current)) {
 		yield();
 		down_read(&mm->mmap_sem);
 		goto survive;
diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c
index 61a23fff4395..c7b195c7e51f 100644
--- a/arch/um/kernel/trap.c
+++ b/arch/um/kernel/trap.c
@@ -120,7 +120,7 @@ out_nosemaphore:
  * us unable to handle the page fault gracefully.
  */
 out_of_memory:
-	if (current->pid == 1) {
+	if (is_init(current)) {
 		up_read(&mm->mmap_sem);
 		yield();
 		down_read(&mm->mmap_sem);
diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c
index 9ba54cc2b5f6..3751b4788e28 100644
--- a/arch/x86_64/mm/fault.c
+++ b/arch/x86_64/mm/fault.c
@@ -244,7 +244,7 @@ static int is_errata93(struct pt_regs *regs, unsigned long address)
 
 int unhandled_signal(struct task_struct *tsk, int sig)
 {
-	if (tsk->pid == 1)
+	if (is_init(tsk))
 		return 1;
 	if (tsk->ptrace & PT_PTRACED)
 		return 0;
@@ -580,7 +580,7 @@ no_context:
  */
 out_of_memory:
 	up_read(&mm->mmap_sem);
-	if (current->pid == 1) { 
+	if (is_init(current)) {
 		yield();
 		goto again;
 	}
diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c
index a945a33e85a1..dd0dbec2e57e 100644
--- a/arch/xtensa/mm/fault.c
+++ b/arch/xtensa/mm/fault.c
@@ -144,7 +144,7 @@ bad_area:
 	 */
 out_of_memory:
 	up_read(&mm->mmap_sem);
-	if (current->pid == 1) {
+	if (is_init(current)) {
 		yield();
 		down_read(&mm->mmap_sem);
 		goto survive;
diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c
index ee3ca8f1768e..0ad6cb081db4 100644
--- a/drivers/char/sysrq.c
+++ b/drivers/char/sysrq.c
@@ -208,7 +208,7 @@ static void send_sig_all(int sig)
 	struct task_struct *p;
 
 	for_each_process(p) {
-		if (p->mm && p->pid != 1)
+		if (p->mm && !is_init(p))
 			/* Not swapper, init nor kernel thread */
 			force_sig(sig, p);
 	}
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3696f2f7126d..ed2af8671589 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1033,6 +1033,16 @@ static inline int pid_alive(struct task_struct *p)
 	return p->pids[PIDTYPE_PID].pid != NULL;
 }
 
+/**
+ * is_init - check if a task structure is the first user space
+ *	     task the kernel created.
+ * @p: Task structure to be checked.
+ */
+static inline int is_init(struct task_struct *tsk)
+{
+	return tsk->pid == 1;
+}
+
 extern void free_task(struct task_struct *tsk);
 #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
 
diff --git a/kernel/capability.c b/kernel/capability.c
index c7685ad00a97..edb845a6e84a 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -133,7 +133,7 @@ static inline int cap_set_all(kernel_cap_t *effective,
      int found = 0;
 
      do_each_thread(g, target) {
-             if (target == current || target->pid == 1)
+             if (target == current || is_init(target))
                      continue;
              found = 1;
 	     if (security_capset_check(target, effective, inheritable,
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 1b32c2c04c15..584bb4e6c042 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -240,7 +240,7 @@ static struct super_block *cpuset_sb;
  * A cpuset can only be deleted if both its 'count' of using tasks
  * is zero, and its list of 'children' cpusets is empty.  Since all
  * tasks in the system use _some_ cpuset, and since there is always at
- * least one task in the system (init, pid == 1), therefore, top_cpuset
+ * least one task in the system (init), therefore, top_cpuset
  * always has either children cpusets and/or using tasks.  So we don't
  * need a special hack to ensure that top_cpuset cannot be deleted.
  *
diff --git a/kernel/exit.c b/kernel/exit.c
index 4b6fb054b25d..9961192d6055 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -219,7 +219,7 @@ static int will_become_orphaned_pgrp(int pgrp, struct task_struct *ignored_task)
 	do_each_task_pid(pgrp, PIDTYPE_PGID, p) {
 		if (p == ignored_task
 				|| p->exit_state
-				|| p->real_parent->pid == 1)
+				|| is_init(p->real_parent))
 			continue;
 		if (process_group(p->real_parent) != pgrp
 			    && p->real_parent->signal->session == p->signal->session) {
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 50087ecf337e..37cad75cf494 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -40,7 +40,7 @@ struct resource crashk_res = {
 
 int kexec_should_crash(struct task_struct *p)
 {
-	if (in_interrupt() || !p->pid || p->pid == 1 || panic_on_oops)
+	if (in_interrupt() || !p->pid || is_init(p) || panic_on_oops)
 		return 1;
 	return 0;
 }
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 8aad0331d82e..4d50e06fd745 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -440,6 +440,7 @@ struct task_struct *ptrace_get_task_struct(pid_t pid)
 	child = find_task_by_pid(pid);
 	if (child)
 		get_task_struct(child);
+
 	read_unlock(&tasklist_lock);
 	if (!child)
 		return ERR_PTR(-ESRCH);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8bfa7d117c54..9535a3839930 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1915,7 +1915,7 @@ int proc_dointvec_bset(ctl_table *table, int write, struct file *filp,
 		return -EPERM;
 	}
 
-	op = (current->pid == 1) ? OP_SET : OP_AND;
+	op = is_init(current) ? OP_SET : OP_AND;
 	return do_proc_dointvec(table,write,filp,buffer,lenp,ppos,
 				do_proc_dointvec_bset_conv,&op);
 }
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index bada3d03119f..f3dd79c1c367 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -255,7 +255,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints)
  */
 static void __oom_kill_task(struct task_struct *p, const char *message)
 {
-	if (p->pid == 1) {
+	if (is_init(p)) {
 		WARN_ON(1);
 		printk(KERN_WARNING "tried to kill init!\n");
 		return;
diff --git a/security/commoncap.c b/security/commoncap.c
index f50fc298cf80..5a5ef5ca7ea9 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -169,7 +169,7 @@ void cap_bprm_apply_creds (struct linux_binprm *bprm, int unsafe)
 	/* For init, we want to retain the capabilities set
 	 * in the init_task struct. Thus we skip the usual
 	 * capability rules */
-	if (current->pid != 1) {
+	if (!is_init(current)) {
 		current->cap_permitted = new_permitted;
 		current->cap_effective =
 		    cap_intersect (new_permitted, bprm->cap_effective);
-- 
cgit v1.2.3


From 55a101f8f71a3d3dbda7b5c77083ffe47552f831 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 29 Sep 2006 02:01:10 -0700
Subject: [PATCH] kill PF_DEAD flag

After the previous change (->flags & PF_DEAD) <=> (->state == EXIT_DEAD), we
don't need PF_DEAD any longer.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h |  1 -
 kernel/exit.c         |  6 ++----
 kernel/sched.c        | 16 ++++++++--------
 mm/oom_kill.c         |  4 ++--
 4 files changed, 12 insertions(+), 15 deletions(-)

(limited to 'mm')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index fbc69cc3923d..9763de334f09 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1061,7 +1061,6 @@ static inline void put_task_struct(struct task_struct *t)
 					/* Not implemented yet, only for 486*/
 #define PF_STARTING	0x00000002	/* being created */
 #define PF_EXITING	0x00000004	/* getting shut down */
-#define PF_DEAD		0x00000008	/* Dead */
 #define PF_FORKNOEXEC	0x00000040	/* forked but didn't exec */
 #define PF_SUPERPRIV	0x00000100	/* used super-user privileges */
 #define PF_DUMPCORE	0x00000200	/* dumped core */
diff --git a/kernel/exit.c b/kernel/exit.c
index 3d759c98fb11..9dd5f1336da2 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -953,10 +953,8 @@ fastcall NORET_TYPE void do_exit(long code)
 	if (tsk->splice_pipe)
 		__free_pipe_info(tsk->splice_pipe);
 
-	/* PF_DEAD causes final put_task_struct after we schedule. */
 	preempt_disable();
-	BUG_ON(tsk->flags & PF_DEAD);
-	tsk->flags |= PF_DEAD;
+	/* causes final put_task_struct in finish_task_switch(). */
 	tsk->state = EXIT_DEAD;
 
 	schedule();
@@ -972,7 +970,7 @@ NORET_TYPE void complete_and_exit(struct completion *comp, long code)
 {
 	if (comp)
 		complete(comp);
-	
+
 	do_exit(code);
 }
 
diff --git a/kernel/sched.c b/kernel/sched.c
index e1646b044b69..a9405d7cc6ab 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1755,27 +1755,27 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	__releases(rq->lock)
 {
 	struct mm_struct *mm = rq->prev_mm;
-	unsigned long prev_task_flags;
+	long prev_state;
 
 	rq->prev_mm = NULL;
 
 	/*
 	 * A task struct has one reference for the use as "current".
-	 * If a task dies, then it sets EXIT_ZOMBIE in tsk->exit_state and
-	 * calls schedule one last time. The schedule call will never return,
-	 * and the scheduled task must drop that reference.
-	 * The test for EXIT_ZOMBIE must occur while the runqueue locks are
+	 * If a task dies, then it sets EXIT_DEAD in tsk->state and calls
+	 * schedule one last time. The schedule call will never return, and
+	 * the scheduled task must drop that reference.
+	 * The test for EXIT_DEAD must occur while the runqueue locks are
 	 * still held, otherwise prev could be scheduled on another cpu, die
 	 * there before we look at prev->state, and then the reference would
 	 * be dropped twice.
 	 *		Manfred Spraul <manfred@colorfullife.com>
 	 */
-	prev_task_flags = prev->flags;
+	prev_state = prev->state;
 	finish_arch_switch(prev);
 	finish_lock_switch(rq, prev);
 	if (mm)
 		mmdrop(mm);
-	if (unlikely(prev_task_flags & PF_DEAD)) {
+	if (unlikely(prev_state == EXIT_DEAD)) {
 		/*
 		 * Remove function-return probe instances associated with this
 		 * task and put them back on the free list.
@@ -5153,7 +5153,7 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
 	BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD);
 
 	/* Cannot have done final schedule yet: would have vanished. */
-	BUG_ON(p->flags & PF_DEAD);
+	BUG_ON(p->state == EXIT_DEAD);
 
 	get_task_struct(p);
 
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index f3dd79c1c367..202f186a753a 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -226,8 +226,8 @@ static struct task_struct *select_bad_process(unsigned long *ppoints)
 		releasing = test_tsk_thread_flag(p, TIF_MEMDIE) ||
 						p->flags & PF_EXITING;
 		if (releasing) {
-			/* PF_DEAD tasks have already released their mm */
-			if (p->flags & PF_DEAD)
+			/* TASK_DEAD tasks have already released their mm */
+			if (p->state == EXIT_DEAD)
 				continue;
 			if (p->flags & PF_EXITING && p == current) {
 				chosen = p;
-- 
cgit v1.2.3


From c394cc9fbb367f87faa2228ec2eabacd2d4701c6 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 29 Sep 2006 02:01:11 -0700
Subject: [PATCH] introduce TASK_DEAD state

I am not sure about this patch, I am asking Ingo to take a decision.

task_struct->state == EXIT_DEAD is a very special case, to avoid a confusion
it makes sense to introduce a new state, TASK_DEAD, while EXIT_DEAD should
live only in ->exit_state as documented in sched.h.

Note that this state is not visible to user-space, get_task_state() masks off
unsuitable states.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h | 1 +
 kernel/exit.c         | 2 +-
 kernel/sched.c        | 8 ++++----
 mm/oom_kill.c         | 2 +-
 4 files changed, 7 insertions(+), 6 deletions(-)

(limited to 'mm')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9763de334f09..a06fc89cf6e5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -148,6 +148,7 @@ extern unsigned long weighted_cpuload(const int cpu);
 #define EXIT_DEAD		32
 /* in tsk->state again */
 #define TASK_NONINTERACTIVE	64
+#define TASK_DEAD		128
 
 #define __set_task_state(tsk, state_value)		\
 	do { (tsk)->state = (state_value); } while (0)
diff --git a/kernel/exit.c b/kernel/exit.c
index 9dd5f1336da2..2e4c13cba95a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -955,7 +955,7 @@ fastcall NORET_TYPE void do_exit(long code)
 
 	preempt_disable();
 	/* causes final put_task_struct in finish_task_switch(). */
-	tsk->state = EXIT_DEAD;
+	tsk->state = TASK_DEAD;
 
 	schedule();
 	BUG();
diff --git a/kernel/sched.c b/kernel/sched.c
index a9405d7cc6ab..74f169ac0773 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1761,10 +1761,10 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
 
 	/*
 	 * A task struct has one reference for the use as "current".
-	 * If a task dies, then it sets EXIT_DEAD in tsk->state and calls
+	 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
 	 * schedule one last time. The schedule call will never return, and
 	 * the scheduled task must drop that reference.
-	 * The test for EXIT_DEAD must occur while the runqueue locks are
+	 * The test for TASK_DEAD must occur while the runqueue locks are
 	 * still held, otherwise prev could be scheduled on another cpu, die
 	 * there before we look at prev->state, and then the reference would
 	 * be dropped twice.
@@ -1775,7 +1775,7 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	finish_lock_switch(rq, prev);
 	if (mm)
 		mmdrop(mm);
-	if (unlikely(prev_state == EXIT_DEAD)) {
+	if (unlikely(prev_state == TASK_DEAD)) {
 		/*
 		 * Remove function-return probe instances associated with this
 		 * task and put them back on the free list.
@@ -5153,7 +5153,7 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
 	BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD);
 
 	/* Cannot have done final schedule yet: would have vanished. */
-	BUG_ON(p->state == EXIT_DEAD);
+	BUG_ON(p->state == TASK_DEAD);
 
 	get_task_struct(p);
 
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 202f186a753a..21f0a7e8e514 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -227,7 +227,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints)
 						p->flags & PF_EXITING;
 		if (releasing) {
 			/* TASK_DEAD tasks have already released their mm */
-			if (p->state == EXIT_DEAD)
+			if (p->state == TASK_DEAD)
 				continue;
 			if (p->flags & PF_EXITING && p == current) {
 				chosen = p;
-- 
cgit v1.2.3


From 28324d1df646521256e83389244adcce98e89ff2 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 29 Sep 2006 02:01:12 -0700
Subject: [PATCH] select_bad_process(): kill a bogus PF_DEAD/TASK_DEAD check

The only one usage of TASK_DEAD outside of last schedule path,

select_bad_process:

	for_each_task(p) {

		if (!p->mm)
			continue;
		...
			if (p->state == TASK_DEAD)
				continue;
		...

TASK_DEAD state is set at the end of do_exit(), this means that p->mm
was already set == NULL by exit_mm(), so this task was already rejected
by 'if (!p->mm)' above.

Note also that the caller holds tasklist_lock, this means that p can't
pass exit_notify() and then set TASK_DEAD when p->mm != NULL.

Also, remove open-coded is_init().

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/oom_kill.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'mm')

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 21f0a7e8e514..423dcae323a5 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -206,11 +206,14 @@ static struct task_struct *select_bad_process(unsigned long *ppoints)
 		unsigned long points;
 		int releasing;
 
-		/* skip kernel threads */
+		/*
+		 * skip kernel threads and tasks which have already released
+		 * their mm.
+		 */
 		if (!p->mm)
 			continue;
-		/* skip the init task with pid == 1 */
-		if (p->pid == 1)
+		/* skip the init task */
+		if (is_init(p))
 			continue;
 
 		/*
@@ -226,9 +229,6 @@ static struct task_struct *select_bad_process(unsigned long *ppoints)
 		releasing = test_tsk_thread_flag(p, TIF_MEMDIE) ||
 						p->flags & PF_EXITING;
 		if (releasing) {
-			/* TASK_DEAD tasks have already released their mm */
-			if (p->state == TASK_DEAD)
-				continue;
 			if (p->flags & PF_EXITING && p == current) {
 				chosen = p;
 				*ppoints = ULONG_MAX;
-- 
cgit v1.2.3


From 972c4ea59c9dbf82647ee9665d9e945241911a51 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 29 Sep 2006 02:01:12 -0700
Subject: [PATCH] select_bad_process(): cleanup 'releasing' check

No logic changes, but imho easier to read.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/oom_kill.c | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

(limited to 'mm')

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 423dcae323a5..991bf0cf4778 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -204,7 +204,6 @@ static struct task_struct *select_bad_process(unsigned long *ppoints)
 	do_posix_clock_monotonic_gettime(&uptime);
 	do_each_thread(g, p) {
 		unsigned long points;
-		int releasing;
 
 		/*
 		 * skip kernel threads and tasks which have already released
@@ -226,16 +225,15 @@ static struct task_struct *select_bad_process(unsigned long *ppoints)
 		 * the process of exiting and releasing its resources.
 		 * Otherwise we could get an OOM deadlock.
 		 */
-		releasing = test_tsk_thread_flag(p, TIF_MEMDIE) ||
-						p->flags & PF_EXITING;
-		if (releasing) {
-			if (p->flags & PF_EXITING && p == current) {
-				chosen = p;
-				*ppoints = ULONG_MAX;
-				break;
-			}
-			return ERR_PTR(-1UL);
+		if ((p->flags & PF_EXITING) && p == current) {
+			chosen = p;
+			*ppoints = ULONG_MAX;
+			break;
 		}
+		if ((p->flags & PF_EXITING) ||
+				test_tsk_thread_flag(p, TIF_MEMDIE))
+			return ERR_PTR(-1UL);
+
 		if (p->oomkilladj == OOM_DISABLE)
 			continue;
 
@@ -245,6 +243,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints)
 			*ppoints = points;
 		}
 	} while_each_thread(g, p);
+
 	return chosen;
 }
 
-- 
cgit v1.2.3


From 01017a227044d64face2588fab9427a1da1bdb9f Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 29 Sep 2006 02:01:13 -0700
Subject: [PATCH] oom_kill_task(): cleanup ->mm checks

- It is not possible to have task->mm == &init_mm.

- task_lock() buys nothing for 'if (!p->mm)' check.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/oom_kill.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'mm')

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 991bf0cf4778..a5493a3b4851 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -260,14 +260,11 @@ static void __oom_kill_task(struct task_struct *p, const char *message)
 		return;
 	}
 
-	task_lock(p);
-	if (!p->mm || p->mm == &init_mm) {
+	if (!p->mm) {
 		WARN_ON(1);
 		printk(KERN_WARNING "tried to kill an mm-less task!\n");
-		task_unlock(p);
 		return;
 	}
-	task_unlock(p);
 
 	if (message) {
 		printk(KERN_ERR "%s: Killed process %d (%s).\n",
@@ -301,7 +298,7 @@ static int oom_kill_task(struct task_struct *p, const char *message)
 	 * However, this is of no concern to us.
 	 */
 
-	if (mm == NULL || mm == &init_mm)
+	if (mm == NULL)
 		return 1;
 
 	__oom_kill_task(p, message);
-- 
cgit v1.2.3


From b78483a4ba60d5d90930262a533a784e1d9df660 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Fri, 29 Sep 2006 02:01:14 -0700
Subject: [PATCH] oom: don't kill current when another OOM in progress

A previous patch to allow an exiting task to OOM kill itself (and thereby
avoid a little deadlock) introduced a problem.  We don't want the
PF_EXITING task, even if it is 'current', to access mem reserves if there
is already a TIF_MEMDIE process in the system sucking up reserves.

Also make the commenting a little bit clearer, and note that our current
scheme of effectively single threading the OOM killer is not itself
perfect.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/oom_kill.c | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

(limited to 'mm')

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index a5493a3b4851..20f41b082e16 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -215,6 +215,18 @@ static struct task_struct *select_bad_process(unsigned long *ppoints)
 		if (is_init(p))
 			continue;
 
+		/*
+		 * This task already has access to memory reserves and is
+		 * being killed. Don't allow any other task access to the
+		 * memory reserve.
+		 *
+		 * Note: this may have a chance of deadlock if it gets
+		 * blocked waiting for another task which itself is waiting
+		 * for memory. Is there a better alternative?
+		 */
+		if (test_tsk_thread_flag(p, TIF_MEMDIE))
+			return ERR_PTR(-1UL);
+
 		/*
 		 * This is in the process of releasing memory so wait for it
 		 * to finish before killing some other task by mistake.
@@ -223,16 +235,15 @@ static struct task_struct *select_bad_process(unsigned long *ppoints)
 		 * go ahead if it is exiting: this will simply set TIF_MEMDIE,
 		 * which will allow it to gain access to memory reserves in
 		 * the process of exiting and releasing its resources.
-		 * Otherwise we could get an OOM deadlock.
+		 * Otherwise we could get an easy OOM deadlock.
 		 */
-		if ((p->flags & PF_EXITING) && p == current) {
+		if (p->flags & PF_EXITING) {
+			if (p != current)
+				return ERR_PTR(-1UL);
+
 			chosen = p;
 			*ppoints = ULONG_MAX;
-			break;
 		}
-		if ((p->flags & PF_EXITING) ||
-				test_tsk_thread_flag(p, TIF_MEMDIE))
-			return ERR_PTR(-1UL);
 
 		if (p->oomkilladj == OOM_DISABLE)
 			continue;
-- 
cgit v1.2.3


From 38837fc75acb7fa9b0e111b0241fe4fe76c5d4b3 Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Fri, 29 Sep 2006 02:01:16 -0700
Subject: [PATCH] cpuset: top_cpuset tracks hotplug changes to node_online_map

Change the list of memory nodes allowed to tasks in the top (root) nodeset
to dynamically track what cpus are online, using a call to a cpuset hook
from the memory hotplug code.  Make this top cpus file read-only.

On systems that have cpusets configured in their kernel, but that aren't
actively using cpusets (for some distros, this covers the majority of
systems) all tasks end up in the top cpuset.

If that system does support memory hotplug, then these tasks cannot make
use of memory nodes that are added after system boot, because the memory
nodes are not allowed in the top cpuset.  This is a surprising regression
over earlier kernels that didn't have cpusets enabled.

One key motivation for this change is to remain consistent with the
behaviour for the top_cpuset's 'cpus', which is also read-only, and which
automatically tracks the cpu_online_map.

This change also has the minor benefit that it fixes a long standing,
little noticed, minor bug in cpusets.  The cpuset performance tweak to
short circuit the cpuset_zone_allowed() check on systems with just a single
cpuset (see 'number_of_cpusets', in linux/cpuset.h) meant that simply
changing the 'mems' of the top_cpuset had no affect, even though the change
(the write system call) appeared to succeed.  With the following change,
that write to the 'mems' file fails -EACCES, and the 'mems' file stubbornly
refuses to be changed via user space writes.  Thus no one should be mislead
into thinking they've changed the top_cpusets's 'mems' when in affect they
haven't.

In order to keep the behaviour of cpusets consistent between systems
actively making use of them and systems not using them, this patch changes
the behaviour of the 'mems' file in the top (root) cpuset, making it read
only, and making it automatically track the value of node_online_map.  Thus
tasks in the top cpuset will have automatic use of hot plugged memory nodes
allowed by their cpuset.

[akpm@osdl.org: build fix]
[bunk@stusta.de: build fix]
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/cpusets.txt | 10 +++++-----
 include/linux/cpuset.h    |  4 ++++
 kernel/cpuset.c           | 28 +++++++++++++++++++++++++---
 mm/memory_hotplug.c       |  3 +++
 4 files changed, 37 insertions(+), 8 deletions(-)

(limited to 'mm')

diff --git a/Documentation/cpusets.txt b/Documentation/cpusets.txt
index 76b44290c154..842f0d1ab216 100644
--- a/Documentation/cpusets.txt
+++ b/Documentation/cpusets.txt
@@ -217,11 +217,11 @@ exclusive cpuset.  Also, the use of a Linux virtual file system (vfs)
 to represent the cpuset hierarchy provides for a familiar permission
 and name space for cpusets, with a minimum of additional kernel code.
 
-The cpus file in the root (top_cpuset) cpuset is read-only.
-It automatically tracks the value of cpu_online_map, using a CPU
-hotplug notifier.  If and when memory nodes can be hotplugged,
-we expect to make the mems file in the root cpuset read-only
-as well, and have it track the value of node_online_map.
+The cpus and mems files in the root (top_cpuset) cpuset are
+read-only.  The cpus file automatically tracks the value of
+cpu_online_map using a CPU hotplug notifier, and the mems file
+automatically tracks the value of node_online_map using the
+cpuset_track_online_nodes() hook.
 
 
 1.4 What are exclusive cpusets ?
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 9354722a9217..4d8adf663681 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -63,6 +63,8 @@ static inline int cpuset_do_slab_mem_spread(void)
 	return current->flags & PF_SPREAD_SLAB;
 }
 
+extern void cpuset_track_online_nodes(void);
+
 #else /* !CONFIG_CPUSETS */
 
 static inline int cpuset_init_early(void) { return 0; }
@@ -126,6 +128,8 @@ static inline int cpuset_do_slab_mem_spread(void)
 	return 0;
 }
 
+static inline void cpuset_track_online_nodes(void) {}
+
 #endif /* !CONFIG_CPUSETS */
 
 #endif /* _LINUX_CPUSET_H */
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 584bb4e6c042..794af5024c2f 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -912,6 +912,10 @@ static int update_nodemask(struct cpuset *cs, char *buf)
 	int fudge;
 	int retval;
 
+	/* top_cpuset.mems_allowed tracks node_online_map; it's read-only */
+	if (cs == &top_cpuset)
+		return -EACCES;
+
 	trialcs = *cs;
 	retval = nodelist_parse(buf, trialcs.mems_allowed);
 	if (retval < 0)
@@ -2042,9 +2046,8 @@ out:
  * (of no affect) on systems that are actively using CPU hotplug
  * but making no active use of cpusets.
  *
- * This handles CPU hotplug (cpuhp) events.  If someday Memory
- * Nodes can be hotplugged (dynamically changing node_online_map)
- * then we should handle that too, perhaps in a similar way.
+ * This routine ensures that top_cpuset.cpus_allowed tracks
+ * cpu_online_map on each CPU hotplug (cpuhp) event.
  */
 
 #ifdef CONFIG_HOTPLUG_CPU
@@ -2063,6 +2066,25 @@ static int cpuset_handle_cpuhp(struct notifier_block *nb,
 }
 #endif
 
+/*
+ * Keep top_cpuset.mems_allowed tracking node_online_map.
+ * Call this routine anytime after you change node_online_map.
+ * See also the previous routine cpuset_handle_cpuhp().
+ */
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+void cpuset_track_online_nodes()
+{
+	mutex_lock(&manage_mutex);
+	mutex_lock(&callback_mutex);
+
+	top_cpuset.mems_allowed = node_online_map;
+
+	mutex_unlock(&callback_mutex);
+	mutex_unlock(&manage_mutex);
+}
+#endif
+
 /**
  * cpuset_init_smp - initialize cpus_allowed
  *
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index c37319542b70..9576ed920c0a 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -21,6 +21,7 @@
 #include <linux/highmem.h>
 #include <linux/vmalloc.h>
 #include <linux/ioport.h>
+#include <linux/cpuset.h>
 
 #include <asm/tlbflush.h>
 
@@ -283,6 +284,8 @@ int add_memory(int nid, u64 start, u64 size)
 	/* we online node here. we can't roll back from here. */
 	node_set_online(nid);
 
+	cpuset_track_online_nodes();
+
 	if (new_pgdat) {
 		ret = register_one_node(nid);
 		/*
-- 
cgit v1.2.3


From 40c99aae23529f3d069ae08836ae46fadb3fd2bd Mon Sep 17 00:00:00 2001
From: Chandra Seetharaman <sekharan@us.ibm.com>
Date: Fri, 29 Sep 2006 02:01:24 -0700
Subject: [PATCH] remove static variable mm/page-writeback.c:total_pages

page-writeback.c has a static local variable "total_pages", which is the
total number of pages in the system.

There is a global variable "vm_total_pages", which is the total number of
pages the VM controls.

Both are assigned from the return value of nr_free_pagecache_pages().

This patch removes the local variable and uses the global variable in that
place.

One more issue with the local static variable "total_pages" is that it is
not updated when new pages are hot-added.  Since vm_total_pages is updated
when new pages are hot-added, this patch fixes that problem too.

Signed-off-by: Chandra Seetharaman <sekharan@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/page-writeback.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

(limited to 'mm')

diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 555752907dc3..efd2705e4986 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -46,7 +46,6 @@
  */
 static long ratelimit_pages = 32;
 
-static long total_pages;	/* The total number of pages in the machine. */
 static int dirty_exceeded __cacheline_aligned_in_smp;	/* Dirty mem may be over limit */
 
 /*
@@ -126,7 +125,7 @@ get_dirty_limits(long *pbackground, long *pdirty,
 	int unmapped_ratio;
 	long background;
 	long dirty;
-	unsigned long available_memory = total_pages;
+	unsigned long available_memory = vm_total_pages;
 	struct task_struct *tsk;
 
 #ifdef CONFIG_HIGHMEM
@@ -141,7 +140,7 @@ get_dirty_limits(long *pbackground, long *pdirty,
 
 	unmapped_ratio = 100 - ((global_page_state(NR_FILE_MAPPED) +
 				global_page_state(NR_ANON_PAGES)) * 100) /
-					total_pages;
+					vm_total_pages;
 
 	dirty_ratio = vm_dirty_ratio;
 	if (dirty_ratio > unmapped_ratio / 2)
@@ -504,7 +503,7 @@ void laptop_sync_completion(void)
 
 static void set_ratelimit(void)
 {
-	ratelimit_pages = total_pages / (num_online_cpus() * 32);
+	ratelimit_pages = vm_total_pages / (num_online_cpus() * 32);
 	if (ratelimit_pages < 16)
 		ratelimit_pages = 16;
 	if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024)
@@ -533,9 +532,7 @@ void __init page_writeback_init(void)
 	long buffer_pages = nr_free_buffer_pages();
 	long correction;
 
-	total_pages = nr_free_pagecache_pages();
-
-	correction = (100 * 4 * buffer_pages) / total_pages;
+	correction = (100 * 4 * buffer_pages) / vm_total_pages;
 
 	if (correction < 100) {
 		dirty_background_ratio *= correction;
-- 
cgit v1.2.3


From 2d1d43f6a43b703587e759145f69467e7c6553a7 Mon Sep 17 00:00:00 2001
From: Chandra Seetharaman <sekharan@us.ibm.com>
Date: Fri, 29 Sep 2006 02:01:25 -0700
Subject: [PATCH] call mm/page-writeback.c:set_ratelimit() when new pages are
 hot-added

ratelimit_pages in page-writeback.c is recalculated (in set_ratelimit())
every time a CPU is hot-added/removed.  But this value is not recalculated
when new pages are hot-added.

This patch fixes that problem by calling set_ratelimit() when new pages
are hot-added.

[akpm@osdl.org: cleanups]
Signed-off-by: Chandra Seetharaman <sekharan@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/writeback.h | 1 +
 mm/memory_hotplug.c       | 2 ++
 mm/page-writeback.c       | 6 +++---
 3 files changed, 6 insertions(+), 3 deletions(-)

(limited to 'mm')

diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 56a23a0e7f2e..9d4074ecd0cd 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -117,6 +117,7 @@ int sync_page_range(struct inode *inode, struct address_space *mapping,
 int sync_page_range_nolock(struct inode *inode, struct address_space *mapping,
 			   loff_t pos, loff_t count);
 void set_page_dirty_balance(struct page *page);
+void writeback_set_ratelimit(void);
 
 /* pdflush.c */
 extern int nr_pdflush_threads;	/* Global so it can be exported to sysctl
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 9576ed920c0a..2053bb165a21 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -13,6 +13,7 @@
 #include <linux/compiler.h>
 #include <linux/module.h>
 #include <linux/pagevec.h>
+#include <linux/writeback.h>
 #include <linux/slab.h>
 #include <linux/sysctl.h>
 #include <linux/cpu.h>
@@ -192,6 +193,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
 	if (need_zonelists_rebuild)
 		build_all_zonelists();
 	vm_total_pages = nr_free_pagecache_pages();
+	writeback_set_ratelimit();
 	return 0;
 }
 
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index efd2705e4986..488b7088557c 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -501,7 +501,7 @@ void laptop_sync_completion(void)
  * will write six megabyte chunks, max.
  */
 
-static void set_ratelimit(void)
+void writeback_set_ratelimit(void)
 {
 	ratelimit_pages = vm_total_pages / (num_online_cpus() * 32);
 	if (ratelimit_pages < 16)
@@ -513,7 +513,7 @@ static void set_ratelimit(void)
 static int __cpuinit
 ratelimit_handler(struct notifier_block *self, unsigned long u, void *v)
 {
-	set_ratelimit();
+	writeback_set_ratelimit();
 	return 0;
 }
 
@@ -546,7 +546,7 @@ void __init page_writeback_init(void)
 			vm_dirty_ratio = 1;
 	}
 	mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
-	set_ratelimit();
+	writeback_set_ratelimit();
 	register_cpu_notifier(&ratelimit_nb);
 }
 
-- 
cgit v1.2.3


From 3f9e7949f86dfe2bd9a1ad0604f78e7683c059de Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Fri, 29 Sep 2006 02:01:26 -0700
Subject: [PATCH] valid_swaphandles() fix

akpm draws my attention to the fact that sysctl(VM_PAGE_CLUSTER) might
conceivably change page_cluster to 0 while valid_swaphandles() is in the
middle of using it, leading to an embarrassingly long loop: take a local
snapshot of page_cluster and work with that.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/swapfile.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'mm')

diff --git a/mm/swapfile.c b/mm/swapfile.c
index f1f5ec783781..a15def63f28f 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1723,13 +1723,14 @@ get_swap_info_struct(unsigned type)
  */
 int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
 {
-	int ret = 0, i = 1 << page_cluster;
+	int our_page_cluster = page_cluster;
+	int ret = 0, i = 1 << our_page_cluster;
 	unsigned long toff;
 	struct swap_info_struct *swapdev = swp_type(entry) + swap_info;
 
-	if (!page_cluster)	/* no readahead */
+	if (!our_page_cluster)	/* no readahead */
 		return 0;
-	toff = (swp_offset(entry) >> page_cluster) << page_cluster;
+	toff = (swp_offset(entry) >> our_page_cluster) << our_page_cluster;
 	if (!toff)		/* first page is swap header */
 		toff++, i--;
 	*offset = toff;
-- 
cgit v1.2.3


From 39f0247d3823e4e0bf8f6838a10362864b1e1053 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruen@suse.de>
Date: Fri, 29 Sep 2006 02:01:35 -0700
Subject: [PATCH] Access Control Lists for tmpfs

Add access control lists for tmpfs.

Signed-off-by: Andreas Gruenbacher <agruen@suse.de>
Cc: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/Kconfig               |  13 ++++
 include/linux/shmem_fs.h |  24 ++++++
 mm/Makefile              |   1 +
 mm/shmem.c               |  99 +++++++++++++++++++++++-
 mm/shmem_acl.c           | 197 +++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 332 insertions(+), 2 deletions(-)
 create mode 100644 mm/shmem_acl.c

(limited to 'mm')

diff --git a/fs/Kconfig b/fs/Kconfig
index deb9eec9f6ee..4fd9efac29ab 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -881,6 +881,19 @@ config TMPFS
 
 	  See <file:Documentation/filesystems/tmpfs.txt> for details.
 
+config TMPFS_POSIX_ACL
+	bool "Tmpfs POSIX Access Control Lists"
+	depends on TMPFS
+	select GENERIC_ACL
+	help
+	  POSIX Access Control Lists (ACLs) support permissions for users and
+	  groups beyond the owner/group/world scheme.
+
+	  To learn more about Access Control Lists, visit the POSIX ACLs for
+	  Linux website <http://acl.bestbits.at/>.
+
+	  If you don't know what Access Control Lists are, say N.
+
 config HUGETLBFS
 	bool "HugeTLB file system support"
 	depends X86 || IA64 || PPC64 || SPARC64 || SUPERH || BROKEN
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index c057f0b32318..f3c51899117f 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -19,6 +19,10 @@ struct shmem_inode_info {
 	swp_entry_t		i_direct[SHMEM_NR_DIRECT]; /* first blocks */
 	struct list_head	swaplist;	/* chain of maybes on swap */
 	struct inode		vfs_inode;
+#ifdef CONFIG_TMPFS_POSIX_ACL
+	struct posix_acl	*i_acl;
+	struct posix_acl	*i_default_acl;
+#endif
 };
 
 struct shmem_sb_info {
@@ -36,4 +40,24 @@ static inline struct shmem_inode_info *SHMEM_I(struct inode *inode)
 	return container_of(inode, struct shmem_inode_info, vfs_inode);
 }
 
+#ifdef CONFIG_TMPFS_POSIX_ACL
+int shmem_permission(struct inode *, int, struct nameidata *);
+int shmem_acl_init(struct inode *, struct inode *);
+void shmem_acl_destroy_inode(struct inode *);
+
+extern struct xattr_handler shmem_xattr_acl_access_handler;
+extern struct xattr_handler shmem_xattr_acl_default_handler;
+
+extern struct generic_acl_operations shmem_acl_ops;
+
+#else
+static inline int shmem_acl_init(struct inode *inode, struct inode *dir)
+{
+	return 0;
+}
+static inline void shmem_acl_destroy_inode(struct inode *inode)
+{
+}
+#endif  /* CONFIG_TMPFS_POSIX_ACL */
+
 #endif
diff --git a/mm/Makefile b/mm/Makefile
index 60c56c0b5e10..6200c6d6afd2 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -17,6 +17,7 @@ obj-$(CONFIG_HUGETLBFS)	+= hugetlb.o
 obj-$(CONFIG_NUMA) 	+= mempolicy.o
 obj-$(CONFIG_SPARSEMEM)	+= sparse.o
 obj-$(CONFIG_SHMEM) += shmem.o
+obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o
 obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
 obj-$(CONFIG_SLOB) += slob.o
 obj-$(CONFIG_SLAB) += slab.o
diff --git a/mm/shmem.c b/mm/shmem.c
index eda907c3a86a..b96de69f236b 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -26,6 +26,8 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/fs.h>
+#include <linux/xattr.h>
+#include <linux/generic_acl.h>
 #include <linux/mm.h>
 #include <linux/mman.h>
 #include <linux/file.h>
@@ -177,6 +179,7 @@ static const struct address_space_operations shmem_aops;
 static struct file_operations shmem_file_operations;
 static struct inode_operations shmem_inode_operations;
 static struct inode_operations shmem_dir_inode_operations;
+static struct inode_operations shmem_special_inode_operations;
 static struct vm_operations_struct shmem_vm_ops;
 
 static struct backing_dev_info shmem_backing_dev_info  __read_mostly = {
@@ -637,7 +640,7 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
 	struct page *page = NULL;
 	int error;
 
-	if (attr->ia_valid & ATTR_SIZE) {
+	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
 		if (attr->ia_size < inode->i_size) {
 			/*
 			 * If truncating down to a partial page, then
@@ -670,6 +673,10 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
 	error = inode_change_ok(inode, attr);
 	if (!error)
 		error = inode_setattr(inode, attr);
+#ifdef CONFIG_TMPFS_POSIX_ACL
+	if (!error && (attr->ia_valid & ATTR_MODE))
+		error = generic_acl_chmod(inode, &shmem_acl_ops);
+#endif
 	if (page)
 		page_cache_release(page);
 	return error;
@@ -1362,6 +1369,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
 
 		switch (mode & S_IFMT) {
 		default:
+			inode->i_op = &shmem_special_inode_operations;
 			init_special_inode(inode, mode, dev);
 			break;
 		case S_IFREG:
@@ -1682,7 +1690,11 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
 				iput(inode);
 				return error;
 			}
-			error = 0;
+		}
+		error = shmem_acl_init(inode, dir);
+		if (error) {
+			iput(inode);
+			return error;
 		}
 		if (dir->i_mode & S_ISGID) {
 			inode->i_gid = dir->i_gid;
@@ -1897,6 +1909,53 @@ static struct inode_operations shmem_symlink_inode_operations = {
 	.put_link	= shmem_put_link,
 };
 
+#ifdef CONFIG_TMPFS_POSIX_ACL
+/**
+ * Superblocks without xattr inode operations will get security.* xattr
+ * support from the VFS "for free". As soon as we have any other xattrs
+ * like ACLs, we also need to implement the security.* handlers at
+ * filesystem level, though.
+ */
+
+static size_t shmem_xattr_security_list(struct inode *inode, char *list,
+					size_t list_len, const char *name,
+					size_t name_len)
+{
+	return security_inode_listsecurity(inode, list, list_len);
+}
+
+static int shmem_xattr_security_get(struct inode *inode, const char *name,
+				    void *buffer, size_t size)
+{
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+	return security_inode_getsecurity(inode, name, buffer, size,
+					  -EOPNOTSUPP);
+}
+
+static int shmem_xattr_security_set(struct inode *inode, const char *name,
+				    const void *value, size_t size, int flags)
+{
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+	return security_inode_setsecurity(inode, name, value, size, flags);
+}
+
+struct xattr_handler shmem_xattr_security_handler = {
+	.prefix = XATTR_SECURITY_PREFIX,
+	.list   = shmem_xattr_security_list,
+	.get    = shmem_xattr_security_get,
+	.set    = shmem_xattr_security_set,
+};
+
+static struct xattr_handler *shmem_xattr_handlers[] = {
+	&shmem_xattr_acl_access_handler,
+	&shmem_xattr_acl_default_handler,
+	&shmem_xattr_security_handler,
+	NULL
+};
+#endif
+
 static int shmem_parse_options(char *options, int *mode, uid_t *uid,
 	gid_t *gid, unsigned long *blocks, unsigned long *inodes,
 	int *policy, nodemask_t *policy_nodes)
@@ -2094,6 +2153,10 @@ static int shmem_fill_super(struct super_block *sb,
 	sb->s_magic = TMPFS_MAGIC;
 	sb->s_op = &shmem_ops;
 	sb->s_time_gran = 1;
+#ifdef CONFIG_TMPFS_POSIX_ACL
+	sb->s_xattr = shmem_xattr_handlers;
+	sb->s_flags |= MS_POSIXACL;
+#endif
 
 	inode = shmem_get_inode(sb, S_IFDIR | mode, 0);
 	if (!inode)
@@ -2130,6 +2193,7 @@ static void shmem_destroy_inode(struct inode *inode)
 		/* only struct inode is valid if it's an inline symlink */
 		mpol_free_shared_policy(&SHMEM_I(inode)->policy);
 	}
+	shmem_acl_destroy_inode(inode);
 	kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
 }
 
@@ -2141,6 +2205,10 @@ static void init_once(void *foo, struct kmem_cache *cachep,
 	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
 	    SLAB_CTOR_CONSTRUCTOR) {
 		inode_init_once(&p->vfs_inode);
+#ifdef CONFIG_TMPFS_POSIX_ACL
+		p->i_acl = NULL;
+		p->i_default_acl = NULL;
+#endif
 	}
 }
 
@@ -2184,6 +2252,14 @@ static struct inode_operations shmem_inode_operations = {
 	.truncate	= shmem_truncate,
 	.setattr	= shmem_notify_change,
 	.truncate_range	= shmem_truncate_range,
+#ifdef CONFIG_TMPFS_POSIX_ACL
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
+	.listxattr	= generic_listxattr,
+	.removexattr	= generic_removexattr,
+	.permission	= shmem_permission,
+#endif
+
 };
 
 static struct inode_operations shmem_dir_inode_operations = {
@@ -2198,6 +2274,25 @@ static struct inode_operations shmem_dir_inode_operations = {
 	.mknod		= shmem_mknod,
 	.rename		= shmem_rename,
 #endif
+#ifdef CONFIG_TMPFS_POSIX_ACL
+	.setattr	= shmem_notify_change,
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
+	.listxattr	= generic_listxattr,
+	.removexattr	= generic_removexattr,
+	.permission	= shmem_permission,
+#endif
+};
+
+static struct inode_operations shmem_special_inode_operations = {
+#ifdef CONFIG_TMPFS_POSIX_ACL
+	.setattr	= shmem_notify_change,
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
+	.listxattr	= generic_listxattr,
+	.removexattr	= generic_removexattr,
+	.permission	= shmem_permission,
+#endif
 };
 
 static struct super_operations shmem_ops = {
diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c
new file mode 100644
index 000000000000..c946bf468718
--- /dev/null
+++ b/mm/shmem_acl.c
@@ -0,0 +1,197 @@
+/*
+ * mm/shmem_acl.c
+ *
+ * (C) 2005 Andreas Gruenbacher <agruen@suse.de>
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/fs.h>
+#include <linux/shmem_fs.h>
+#include <linux/xattr.h>
+#include <linux/generic_acl.h>
+
+/**
+ * shmem_get_acl  -   generic_acl_operations->getacl() operation
+ */
+static struct posix_acl *
+shmem_get_acl(struct inode *inode, int type)
+{
+	struct posix_acl *acl = NULL;
+
+	spin_lock(&inode->i_lock);
+	switch(type) {
+		case ACL_TYPE_ACCESS:
+			acl = posix_acl_dup(SHMEM_I(inode)->i_acl);
+			break;
+
+		case ACL_TYPE_DEFAULT:
+			acl = posix_acl_dup(SHMEM_I(inode)->i_default_acl);
+			break;
+	}
+	spin_unlock(&inode->i_lock);
+
+	return acl;
+}
+
+/**
+ * shmem_get_acl  -   generic_acl_operations->setacl() operation
+ */
+static void
+shmem_set_acl(struct inode *inode, int type, struct posix_acl *acl)
+{
+	struct posix_acl *free = NULL;
+
+	spin_lock(&inode->i_lock);
+	switch(type) {
+		case ACL_TYPE_ACCESS:
+			free = SHMEM_I(inode)->i_acl;
+			SHMEM_I(inode)->i_acl = posix_acl_dup(acl);
+			break;
+
+		case ACL_TYPE_DEFAULT:
+			free = SHMEM_I(inode)->i_default_acl;
+			SHMEM_I(inode)->i_default_acl = posix_acl_dup(acl);
+			break;
+	}
+	spin_unlock(&inode->i_lock);
+	posix_acl_release(free);
+}
+
+struct generic_acl_operations shmem_acl_ops = {
+	.getacl = shmem_get_acl,
+	.setacl = shmem_set_acl,
+};
+
+/**
+ * shmem_list_acl_access, shmem_get_acl_access, shmem_set_acl_access,
+ * shmem_xattr_acl_access_handler  -  plumbing code to implement the
+ * system.posix_acl_access xattr using the generic acl functions.
+ */
+
+static size_t
+shmem_list_acl_access(struct inode *inode, char *list, size_t list_size,
+		      const char *name, size_t name_len)
+{
+	return generic_acl_list(inode, &shmem_acl_ops, ACL_TYPE_ACCESS,
+				list, list_size);
+}
+
+static int
+shmem_get_acl_access(struct inode *inode, const char *name, void *buffer,
+		     size_t size)
+{
+	if (strcmp(name, "") != 0)
+		return -EINVAL;
+	return generic_acl_get(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, buffer,
+			       size);
+}
+
+static int
+shmem_set_acl_access(struct inode *inode, const char *name, const void *value,
+		     size_t size, int flags)
+{
+	if (strcmp(name, "") != 0)
+		return -EINVAL;
+	return generic_acl_set(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, value,
+			       size);
+}
+
+struct xattr_handler shmem_xattr_acl_access_handler = {
+	.prefix = POSIX_ACL_XATTR_ACCESS,
+	.list	= shmem_list_acl_access,
+	.get	= shmem_get_acl_access,
+	.set	= shmem_set_acl_access,
+};
+
+/**
+ * shmem_list_acl_default, shmem_get_acl_default, shmem_set_acl_default,
+ * shmem_xattr_acl_default_handler  -  plumbing code to implement the
+ * system.posix_acl_default xattr using the generic acl functions.
+ */
+
+static size_t
+shmem_list_acl_default(struct inode *inode, char *list, size_t list_size,
+		       const char *name, size_t name_len)
+{
+	return generic_acl_list(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT,
+				list, list_size);
+}
+
+static int
+shmem_get_acl_default(struct inode *inode, const char *name, void *buffer,
+		      size_t size)
+{
+	if (strcmp(name, "") != 0)
+		return -EINVAL;
+	return generic_acl_get(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, buffer,
+			       size);
+}
+
+static int
+shmem_set_acl_default(struct inode *inode, const char *name, const void *value,
+		      size_t size, int flags)
+{
+	if (strcmp(name, "") != 0)
+		return -EINVAL;
+	return generic_acl_set(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, value,
+			       size);
+}
+
+struct xattr_handler shmem_xattr_acl_default_handler = {
+	.prefix = POSIX_ACL_XATTR_DEFAULT,
+	.list	= shmem_list_acl_default,
+	.get	= shmem_get_acl_default,
+	.set	= shmem_set_acl_default,
+};
+
+/**
+ * shmem_acl_init  -  Inizialize the acl(s) of a new inode
+ */
+int
+shmem_acl_init(struct inode *inode, struct inode *dir)
+{
+	return generic_acl_init(inode, dir, &shmem_acl_ops);
+}
+
+/**
+ * shmem_acl_destroy_inode  -  destroy acls hanging off the in-memory inode
+ *
+ * This is done before destroying the actual inode.
+ */
+
+void
+shmem_acl_destroy_inode(struct inode *inode)
+{
+	if (SHMEM_I(inode)->i_acl)
+		posix_acl_release(SHMEM_I(inode)->i_acl);
+	SHMEM_I(inode)->i_acl = NULL;
+	if (SHMEM_I(inode)->i_default_acl)
+		posix_acl_release(SHMEM_I(inode)->i_default_acl);
+	SHMEM_I(inode)->i_default_acl = NULL;
+}
+
+/**
+ * shmem_check_acl  -  check_acl() callback for generic_permission()
+ */
+static int
+shmem_check_acl(struct inode *inode, int mask)
+{
+	struct posix_acl *acl = shmem_get_acl(inode, ACL_TYPE_ACCESS);
+
+	if (acl) {
+		int error = posix_acl_permission(inode, acl, mask);
+		posix_acl_release(acl);
+		return error;
+	}
+	return -EAGAIN;
+}
+
+/**
+ * shmem_permission  -  permission() inode operation
+ */
+int
+shmem_permission(struct inode *inode, int mask, struct nameidata *nd)
+{
+	return generic_permission(inode, mask, shmem_check_acl);
+}
-- 
cgit v1.2.3


From cf9a2ae8d49948f861b56e5333530e491a9da190 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 29 Aug 2006 19:05:54 +0100
Subject: [PATCH] BLOCK: Move functions out of buffer code [try #6]

Move some functions out of the buffering code that aren't strictly buffering
specific.  This is a precursor to being able to disable the block layer.

 (*) Moved some stuff out of fs/buffer.c:

     (*) The file sync and general sync stuff moved to fs/sync.c.

     (*) The superblock sync stuff moved to fs/super.c.

     (*) do_invalidatepage() moved to mm/truncate.c.

     (*) try_to_release_page() moved to mm/filemap.c.

 (*) Moved some related declarations between header files:

     (*) declarations for do_invalidatepage() and try_to_release_page() moved
     	 to linux/mm.h.

     (*) __set_page_dirty_buffers() moved to linux/buffer_head.h.

Signed-Off-By: David Howells <dhowells@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/buffer.c                 | 174 --------------------------------------------
 fs/super.c                  |  31 ++++++++
 fs/sync.c                   | 113 ++++++++++++++++++++++++++++
 include/linux/buffer_head.h |   3 +-
 include/linux/fs.h          |   1 +
 include/linux/mm.h          |   4 +-
 mm/filemap.c                |  30 ++++++++
 mm/page-writeback.c         |   1 +
 mm/truncate.c               |  24 ++++++
 9 files changed, 204 insertions(+), 177 deletions(-)

(limited to 'mm')

diff --git a/fs/buffer.c b/fs/buffer.c
index 3b6d701073e7..16cfbcd254f1 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -159,31 +159,6 @@ int sync_blockdev(struct block_device *bdev)
 }
 EXPORT_SYMBOL(sync_blockdev);
 
-static void __fsync_super(struct super_block *sb)
-{
-	sync_inodes_sb(sb, 0);
-	DQUOT_SYNC(sb);
-	lock_super(sb);
-	if (sb->s_dirt && sb->s_op->write_super)
-		sb->s_op->write_super(sb);
-	unlock_super(sb);
-	if (sb->s_op->sync_fs)
-		sb->s_op->sync_fs(sb, 1);
-	sync_blockdev(sb->s_bdev);
-	sync_inodes_sb(sb, 1);
-}
-
-/*
- * Write out and wait upon all dirty data associated with this
- * superblock.  Filesystem data as well as the underlying block
- * device.  Takes the superblock lock.
- */
-int fsync_super(struct super_block *sb)
-{
-	__fsync_super(sb);
-	return sync_blockdev(sb->s_bdev);
-}
-
 /*
  * Write out and wait upon all dirty data associated with this
  * device.   Filesystem data as well as the underlying block
@@ -259,118 +234,6 @@ void thaw_bdev(struct block_device *bdev, struct super_block *sb)
 }
 EXPORT_SYMBOL(thaw_bdev);
 
-/*
- * sync everything.  Start out by waking pdflush, because that writes back
- * all queues in parallel.
- */
-static void do_sync(unsigned long wait)
-{
-	wakeup_pdflush(0);
-	sync_inodes(0);		/* All mappings, inodes and their blockdevs */
-	DQUOT_SYNC(NULL);
-	sync_supers();		/* Write the superblocks */
-	sync_filesystems(0);	/* Start syncing the filesystems */
-	sync_filesystems(wait);	/* Waitingly sync the filesystems */
-	sync_inodes(wait);	/* Mappings, inodes and blockdevs, again. */
-	if (!wait)
-		printk("Emergency Sync complete\n");
-	if (unlikely(laptop_mode))
-		laptop_sync_completion();
-}
-
-asmlinkage long sys_sync(void)
-{
-	do_sync(1);
-	return 0;
-}
-
-void emergency_sync(void)
-{
-	pdflush_operation(do_sync, 0);
-}
-
-/*
- * Generic function to fsync a file.
- *
- * filp may be NULL if called via the msync of a vma.
- */
- 
-int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
-{
-	struct inode * inode = dentry->d_inode;
-	struct super_block * sb;
-	int ret, err;
-
-	/* sync the inode to buffers */
-	ret = write_inode_now(inode, 0);
-
-	/* sync the superblock to buffers */
-	sb = inode->i_sb;
-	lock_super(sb);
-	if (sb->s_op->write_super)
-		sb->s_op->write_super(sb);
-	unlock_super(sb);
-
-	/* .. finally sync the buffers to disk */
-	err = sync_blockdev(sb->s_bdev);
-	if (!ret)
-		ret = err;
-	return ret;
-}
-
-long do_fsync(struct file *file, int datasync)
-{
-	int ret;
-	int err;
-	struct address_space *mapping = file->f_mapping;
-
-	if (!file->f_op || !file->f_op->fsync) {
-		/* Why?  We can still call filemap_fdatawrite */
-		ret = -EINVAL;
-		goto out;
-	}
-
-	ret = filemap_fdatawrite(mapping);
-
-	/*
-	 * We need to protect against concurrent writers, which could cause
-	 * livelocks in fsync_buffers_list().
-	 */
-	mutex_lock(&mapping->host->i_mutex);
-	err = file->f_op->fsync(file, file->f_dentry, datasync);
-	if (!ret)
-		ret = err;
-	mutex_unlock(&mapping->host->i_mutex);
-	err = filemap_fdatawait(mapping);
-	if (!ret)
-		ret = err;
-out:
-	return ret;
-}
-
-static long __do_fsync(unsigned int fd, int datasync)
-{
-	struct file *file;
-	int ret = -EBADF;
-
-	file = fget(fd);
-	if (file) {
-		ret = do_fsync(file, datasync);
-		fput(file);
-	}
-	return ret;
-}
-
-asmlinkage long sys_fsync(unsigned int fd)
-{
-	return __do_fsync(fd, 0);
-}
-
-asmlinkage long sys_fdatasync(unsigned int fd)
-{
-	return __do_fsync(fd, 1);
-}
-
 /*
  * Various filesystems appear to want __find_get_block to be non-blocking.
  * But it's the page lock which protects the buffers.  To get around this,
@@ -1550,35 +1413,6 @@ static void discard_buffer(struct buffer_head * bh)
 	unlock_buffer(bh);
 }
 
-/**
- * try_to_release_page() - release old fs-specific metadata on a page
- *
- * @page: the page which the kernel is trying to free
- * @gfp_mask: memory allocation flags (and I/O mode)
- *
- * The address_space is to try to release any data against the page
- * (presumably at page->private).  If the release was successful, return `1'.
- * Otherwise return zero.
- *
- * The @gfp_mask argument specifies whether I/O may be performed to release
- * this page (__GFP_IO), and whether the call may block (__GFP_WAIT).
- *
- * NOTE: @gfp_mask may go away, and this function may become non-blocking.
- */
-int try_to_release_page(struct page *page, gfp_t gfp_mask)
-{
-	struct address_space * const mapping = page->mapping;
-
-	BUG_ON(!PageLocked(page));
-	if (PageWriteback(page))
-		return 0;
-	
-	if (mapping && mapping->a_ops->releasepage)
-		return mapping->a_ops->releasepage(page, gfp_mask);
-	return try_to_free_buffers(page);
-}
-EXPORT_SYMBOL(try_to_release_page);
-
 /**
  * block_invalidatepage - invalidate part of all of a buffer-backed page
  *
@@ -1630,14 +1464,6 @@ out:
 }
 EXPORT_SYMBOL(block_invalidatepage);
 
-void do_invalidatepage(struct page *page, unsigned long offset)
-{
-	void (*invalidatepage)(struct page *, unsigned long);
-	invalidatepage = page->mapping->a_ops->invalidatepage ? :
-		block_invalidatepage;
-	(*invalidatepage)(page, offset);
-}
-
 /*
  * We attach and possibly dirty the buffers atomically wrt
  * __set_page_dirty_buffers() via private_lock.  try_to_free_buffers
diff --git a/fs/super.c b/fs/super.c
index 6987824d0dce..15671cd048b1 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -220,6 +220,37 @@ static int grab_super(struct super_block *s) __releases(sb_lock)
 	return 0;
 }
 
+/*
+ * Write out and wait upon all dirty data associated with this
+ * superblock.  Filesystem data as well as the underlying block
+ * device.  Takes the superblock lock.  Requires a second blkdev
+ * flush by the caller to complete the operation.
+ */
+void __fsync_super(struct super_block *sb)
+{
+	sync_inodes_sb(sb, 0);
+	DQUOT_SYNC(sb);
+	lock_super(sb);
+	if (sb->s_dirt && sb->s_op->write_super)
+		sb->s_op->write_super(sb);
+	unlock_super(sb);
+	if (sb->s_op->sync_fs)
+		sb->s_op->sync_fs(sb, 1);
+	sync_blockdev(sb->s_bdev);
+	sync_inodes_sb(sb, 1);
+}
+
+/*
+ * Write out and wait upon all dirty data associated with this
+ * superblock.  Filesystem data as well as the underlying block
+ * device.  Takes the superblock lock.
+ */
+int fsync_super(struct super_block *sb)
+{
+	__fsync_super(sb);
+	return sync_blockdev(sb->s_bdev);
+}
+
 /**
  *	generic_shutdown_super	-	common helper for ->kill_sb()
  *	@sb: superblock to kill
diff --git a/fs/sync.c b/fs/sync.c
index 955aef04da28..1de747b5ddb9 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -10,10 +10,123 @@
 #include <linux/syscalls.h>
 #include <linux/linkage.h>
 #include <linux/pagemap.h>
+#include <linux/quotaops.h>
+#include <linux/buffer_head.h>
 
 #define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \
 			SYNC_FILE_RANGE_WAIT_AFTER)
 
+/*
+ * sync everything.  Start out by waking pdflush, because that writes back
+ * all queues in parallel.
+ */
+static void do_sync(unsigned long wait)
+{
+	wakeup_pdflush(0);
+	sync_inodes(0);		/* All mappings, inodes and their blockdevs */
+	DQUOT_SYNC(NULL);
+	sync_supers();		/* Write the superblocks */
+	sync_filesystems(0);	/* Start syncing the filesystems */
+	sync_filesystems(wait);	/* Waitingly sync the filesystems */
+	sync_inodes(wait);	/* Mappings, inodes and blockdevs, again. */
+	if (!wait)
+		printk("Emergency Sync complete\n");
+	if (unlikely(laptop_mode))
+		laptop_sync_completion();
+}
+
+asmlinkage long sys_sync(void)
+{
+	do_sync(1);
+	return 0;
+}
+
+void emergency_sync(void)
+{
+	pdflush_operation(do_sync, 0);
+}
+
+/*
+ * Generic function to fsync a file.
+ *
+ * filp may be NULL if called via the msync of a vma.
+ */
+int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
+{
+	struct inode * inode = dentry->d_inode;
+	struct super_block * sb;
+	int ret, err;
+
+	/* sync the inode to buffers */
+	ret = write_inode_now(inode, 0);
+
+	/* sync the superblock to buffers */
+	sb = inode->i_sb;
+	lock_super(sb);
+	if (sb->s_op->write_super)
+		sb->s_op->write_super(sb);
+	unlock_super(sb);
+
+	/* .. finally sync the buffers to disk */
+	err = sync_blockdev(sb->s_bdev);
+	if (!ret)
+		ret = err;
+	return ret;
+}
+
+long do_fsync(struct file *file, int datasync)
+{
+	int ret;
+	int err;
+	struct address_space *mapping = file->f_mapping;
+
+	if (!file->f_op || !file->f_op->fsync) {
+		/* Why?  We can still call filemap_fdatawrite */
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = filemap_fdatawrite(mapping);
+
+	/*
+	 * We need to protect against concurrent writers, which could cause
+	 * livelocks in fsync_buffers_list().
+	 */
+	mutex_lock(&mapping->host->i_mutex);
+	err = file->f_op->fsync(file, file->f_dentry, datasync);
+	if (!ret)
+		ret = err;
+	mutex_unlock(&mapping->host->i_mutex);
+	err = filemap_fdatawait(mapping);
+	if (!ret)
+		ret = err;
+out:
+	return ret;
+}
+
+static long __do_fsync(unsigned int fd, int datasync)
+{
+	struct file *file;
+	int ret = -EBADF;
+
+	file = fget(fd);
+	if (file) {
+		ret = do_fsync(file, datasync);
+		fput(file);
+	}
+	return ret;
+}
+
+asmlinkage long sys_fsync(unsigned int fd)
+{
+	return __do_fsync(fd, 0);
+}
+
+asmlinkage long sys_fdatasync(unsigned int fd)
+{
+	return __do_fsync(fd, 1);
+}
+
 /*
  * sys_sync_file_range() permits finely controlled syncing over a segment of
  * a file in the range offset .. (offset+nbytes-1) inclusive.  If nbytes is
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 737e407d0cd1..64b508e35d2a 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -190,9 +190,7 @@ extern int buffer_heads_over_limit;
  * Generic address_space_operations implementations for buffer_head-backed
  * address_spaces.
  */
-int try_to_release_page(struct page * page, gfp_t gfp_mask);
 void block_invalidatepage(struct page *page, unsigned long offset);
-void do_invalidatepage(struct page *page, unsigned long offset);
 int block_write_full_page(struct page *page, get_block_t *get_block,
 				struct writeback_control *wbc);
 int block_read_full_page(struct page*, get_block_t*);
@@ -302,4 +300,5 @@ static inline void lock_buffer(struct buffer_head *bh)
 		__lock_buffer(bh);
 }
 
+extern int __set_page_dirty_buffers(struct page *page);
 #endif /* _LINUX_BUFFER_HEAD_H */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index d68c37af4dfb..1728142ec4b6 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1546,6 +1546,7 @@ extern int __filemap_fdatawrite_range(struct address_space *mapping,
 extern long do_fsync(struct file *file, int datasync);
 extern void sync_supers(void);
 extern void sync_filesystems(int wait);
+extern void __fsync_super(struct super_block *sb);
 extern void emergency_sync(void);
 extern void emergency_remount(void);
 extern int do_remount_sb(struct super_block *sb, int flags,
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7b703b6d4358..4edf1934e5ca 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -743,7 +743,9 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long
 		int len, int write, int force, struct page **pages, struct vm_area_struct **vmas);
 void print_bad_pte(struct vm_area_struct *, pte_t, unsigned long);
 
-int __set_page_dirty_buffers(struct page *page);
+extern int try_to_release_page(struct page * page, gfp_t gfp_mask);
+extern void do_invalidatepage(struct page *page, unsigned long offset);
+
 int __set_page_dirty_nobuffers(struct page *page);
 int redirty_page_for_writepage(struct writeback_control *wbc,
 				struct page *page);
diff --git a/mm/filemap.c b/mm/filemap.c
index 3277f3b23524..d6846de08887 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2491,3 +2491,33 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 	}
 	return retval;
 }
+
+/**
+ * try_to_release_page() - release old fs-specific metadata on a page
+ *
+ * @page: the page which the kernel is trying to free
+ * @gfp_mask: memory allocation flags (and I/O mode)
+ *
+ * The address_space is to try to release any data against the page
+ * (presumably at page->private).  If the release was successful, return `1'.
+ * Otherwise return zero.
+ *
+ * The @gfp_mask argument specifies whether I/O may be performed to release
+ * this page (__GFP_IO), and whether the call may block (__GFP_WAIT).
+ *
+ * NOTE: @gfp_mask may go away, and this function may become non-blocking.
+ */
+int try_to_release_page(struct page *page, gfp_t gfp_mask)
+{
+	struct address_space * const mapping = page->mapping;
+
+	BUG_ON(!PageLocked(page));
+	if (PageWriteback(page))
+		return 0;
+
+	if (mapping && mapping->a_ops->releasepage)
+		return mapping->a_ops->releasepage(page, gfp_mask);
+	return try_to_free_buffers(page);
+}
+
+EXPORT_SYMBOL(try_to_release_page);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 488b7088557c..9fdcc7903956 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -30,6 +30,7 @@
 #include <linux/sysctl.h>
 #include <linux/cpu.h>
 #include <linux/syscalls.h>
+#include <linux/buffer_head.h>
 
 /*
  * The maximum number of pages to writeout in a single bdflush/kupdate
diff --git a/mm/truncate.c b/mm/truncate.c
index a654928323dc..cd3e34b816db 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -17,6 +17,30 @@
 				   do_invalidatepage */
 
 
+/**
+ * do_invalidatepage - invalidate part of all of a page
+ * @page: the page which is affected
+ * @offset: the index of the truncation point
+ *
+ * do_invalidatepage() is called when all or part of the page has become
+ * invalidated by a truncate operation.
+ *
+ * do_invalidatepage() does not have to release all buffers, but it must
+ * ensure that no dirty buffer is left outside @offset and that no I/O
+ * is underway against any of the blocks which are outside the truncation
+ * point.  Because the caller is about to free (and possibly reuse) those
+ * blocks on-disk.
+ */
+void do_invalidatepage(struct page *page, unsigned long offset)
+{
+	void (*invalidatepage)(struct page *, unsigned long);
+	invalidatepage = page->mapping->a_ops->invalidatepage;
+	if (!invalidatepage)
+		invalidatepage = block_invalidatepage;
+	if (invalidatepage)
+		(*invalidatepage)(page, offset);
+}
+
 static inline void truncate_partial_page(struct page *page, unsigned partial)
 {
 	memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
-- 
cgit v1.2.3


From b398f6bff93a247d2a7099e92905374966e4558f Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 29 Aug 2006 19:05:58 +0100
Subject: [PATCH] BLOCK: Stop fallback_migrate_page() from using
 page_has_buffers() [try #6]

Stop fallback_migrate_page() from using page_has_buffers() since that might not
be available.  Use PagePrivate() instead since that's more general.

Signed-Off-By: David Howells <dhowells@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 mm/migrate.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/migrate.c b/mm/migrate.c
index 20a8c2687b1e..7f50e3ff54cd 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -525,7 +525,7 @@ static int fallback_migrate_page(struct address_space *mapping,
 	 * Buffers may be managed in a filesystem specific way.
 	 * We must have no buffers or drop them.
 	 */
-	if (page_has_buffers(page) &&
+	if (PagePrivate(page) &&
 	    !try_to_release_page(page, GFP_KERNEL))
 		return -EAGAIN;
 
-- 
cgit v1.2.3


From 831058dec3735665fe91bd0d37b6a8cf56b91abd Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 29 Aug 2006 19:06:00 +0100
Subject: [PATCH] BLOCK: Separate the bounce buffering code from the highmem
 code [try #6]

Move the bounce buffer code from mm/highmem.c to mm/bounce.c so that it can be
more easily disabled when the block layer is disabled.

!!!NOTE!!! There may be a bug in this code: Should init_emergency_pool() be
	   contingent on CONFIG_HIGHMEM?

Signed-Off-By: David Howells <dhowells@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 mm/Makefile  |   3 +
 mm/bounce.c  | 302 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 mm/highmem.c | 281 ------------------------------------------------------
 3 files changed, 305 insertions(+), 281 deletions(-)
 create mode 100644 mm/bounce.c

(limited to 'mm')

diff --git a/mm/Makefile b/mm/Makefile
index 6200c6d6afd2..4f2166a833b9 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -12,6 +12,9 @@ obj-y			:= bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
 			   readahead.o swap.o truncate.o vmscan.o \
 			   prio_tree.o util.o mmzone.o vmstat.o $(mmu-y)
 
+ifeq ($(CONFIG_MMU),y)
+obj-y			+= bounce.o
+endif
 obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o thrash.o
 obj-$(CONFIG_HUGETLBFS)	+= hugetlb.o
 obj-$(CONFIG_NUMA) 	+= mempolicy.o
diff --git a/mm/bounce.c b/mm/bounce.c
new file mode 100644
index 000000000000..e4b62d2a4024
--- /dev/null
+++ b/mm/bounce.c
@@ -0,0 +1,302 @@
+/* bounce buffer handling for block devices
+ *
+ * - Split from highmem.c
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/swap.h>
+#include <linux/bio.h>
+#include <linux/pagemap.h>
+#include <linux/mempool.h>
+#include <linux/blkdev.h>
+#include <linux/init.h>
+#include <linux/hash.h>
+#include <linux/highmem.h>
+#include <linux/blktrace_api.h>
+#include <asm/tlbflush.h>
+
+#define POOL_SIZE	64
+#define ISA_POOL_SIZE	16
+
+static mempool_t *page_pool, *isa_page_pool;
+
+#ifdef CONFIG_HIGHMEM
+static __init int init_emergency_pool(void)
+{
+	struct sysinfo i;
+	si_meminfo(&i);
+	si_swapinfo(&i);
+
+	if (!i.totalhigh)
+		return 0;
+
+	page_pool = mempool_create_page_pool(POOL_SIZE, 0);
+	BUG_ON(!page_pool);
+	printk("highmem bounce pool size: %d pages\n", POOL_SIZE);
+
+	return 0;
+}
+
+__initcall(init_emergency_pool);
+
+/*
+ * highmem version, map in to vec
+ */
+static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom)
+{
+	unsigned long flags;
+	unsigned char *vto;
+
+	local_irq_save(flags);
+	vto = kmap_atomic(to->bv_page, KM_BOUNCE_READ);
+	memcpy(vto + to->bv_offset, vfrom, to->bv_len);
+	kunmap_atomic(vto, KM_BOUNCE_READ);
+	local_irq_restore(flags);
+}
+
+#else /* CONFIG_HIGHMEM */
+
+#define bounce_copy_vec(to, vfrom)	\
+	memcpy(page_address((to)->bv_page) + (to)->bv_offset, vfrom, (to)->bv_len)
+
+#endif /* CONFIG_HIGHMEM */
+
+/*
+ * allocate pages in the DMA region for the ISA pool
+ */
+static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data)
+{
+	return mempool_alloc_pages(gfp_mask | GFP_DMA, data);
+}
+
+/*
+ * gets called "every" time someone init's a queue with BLK_BOUNCE_ISA
+ * as the max address, so check if the pool has already been created.
+ */
+int init_emergency_isa_pool(void)
+{
+	if (isa_page_pool)
+		return 0;
+
+	isa_page_pool = mempool_create(ISA_POOL_SIZE, mempool_alloc_pages_isa,
+				       mempool_free_pages, (void *) 0);
+	BUG_ON(!isa_page_pool);
+
+	printk("isa bounce pool size: %d pages\n", ISA_POOL_SIZE);
+	return 0;
+}
+
+/*
+ * Simple bounce buffer support for highmem pages. Depending on the
+ * queue gfp mask set, *to may or may not be a highmem page. kmap it
+ * always, it will do the Right Thing
+ */
+static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
+{
+	unsigned char *vfrom;
+	struct bio_vec *tovec, *fromvec;
+	int i;
+
+	__bio_for_each_segment(tovec, to, i, 0) {
+		fromvec = from->bi_io_vec + i;
+
+		/*
+		 * not bounced
+		 */
+		if (tovec->bv_page == fromvec->bv_page)
+			continue;
+
+		/*
+		 * fromvec->bv_offset and fromvec->bv_len might have been
+		 * modified by the block layer, so use the original copy,
+		 * bounce_copy_vec already uses tovec->bv_len
+		 */
+		vfrom = page_address(fromvec->bv_page) + tovec->bv_offset;
+
+		flush_dcache_page(tovec->bv_page);
+		bounce_copy_vec(tovec, vfrom);
+	}
+}
+
+static void bounce_end_io(struct bio *bio, mempool_t *pool, int err)
+{
+	struct bio *bio_orig = bio->bi_private;
+	struct bio_vec *bvec, *org_vec;
+	int i;
+
+	if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags))
+		set_bit(BIO_EOPNOTSUPP, &bio_orig->bi_flags);
+
+	/*
+	 * free up bounce indirect pages used
+	 */
+	__bio_for_each_segment(bvec, bio, i, 0) {
+		org_vec = bio_orig->bi_io_vec + i;
+		if (bvec->bv_page == org_vec->bv_page)
+			continue;
+
+		dec_zone_page_state(bvec->bv_page, NR_BOUNCE);
+		mempool_free(bvec->bv_page, pool);
+	}
+
+	bio_endio(bio_orig, bio_orig->bi_size, err);
+	bio_put(bio);
+}
+
+static int bounce_end_io_write(struct bio *bio, unsigned int bytes_done, int err)
+{
+	if (bio->bi_size)
+		return 1;
+
+	bounce_end_io(bio, page_pool, err);
+	return 0;
+}
+
+static int bounce_end_io_write_isa(struct bio *bio, unsigned int bytes_done, int err)
+{
+	if (bio->bi_size)
+		return 1;
+
+	bounce_end_io(bio, isa_page_pool, err);
+	return 0;
+}
+
+static void __bounce_end_io_read(struct bio *bio, mempool_t *pool, int err)
+{
+	struct bio *bio_orig = bio->bi_private;
+
+	if (test_bit(BIO_UPTODATE, &bio->bi_flags))
+		copy_to_high_bio_irq(bio_orig, bio);
+
+	bounce_end_io(bio, pool, err);
+}
+
+static int bounce_end_io_read(struct bio *bio, unsigned int bytes_done, int err)
+{
+	if (bio->bi_size)
+		return 1;
+
+	__bounce_end_io_read(bio, page_pool, err);
+	return 0;
+}
+
+static int bounce_end_io_read_isa(struct bio *bio, unsigned int bytes_done, int err)
+{
+	if (bio->bi_size)
+		return 1;
+
+	__bounce_end_io_read(bio, isa_page_pool, err);
+	return 0;
+}
+
+static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig,
+			       mempool_t *pool)
+{
+	struct page *page;
+	struct bio *bio = NULL;
+	int i, rw = bio_data_dir(*bio_orig);
+	struct bio_vec *to, *from;
+
+	bio_for_each_segment(from, *bio_orig, i) {
+		page = from->bv_page;
+
+		/*
+		 * is destination page below bounce pfn?
+		 */
+		if (page_to_pfn(page) < q->bounce_pfn)
+			continue;
+
+		/*
+		 * irk, bounce it
+		 */
+		if (!bio)
+			bio = bio_alloc(GFP_NOIO, (*bio_orig)->bi_vcnt);
+
+		to = bio->bi_io_vec + i;
+
+		to->bv_page = mempool_alloc(pool, q->bounce_gfp);
+		to->bv_len = from->bv_len;
+		to->bv_offset = from->bv_offset;
+		inc_zone_page_state(to->bv_page, NR_BOUNCE);
+
+		if (rw == WRITE) {
+			char *vto, *vfrom;
+
+			flush_dcache_page(from->bv_page);
+			vto = page_address(to->bv_page) + to->bv_offset;
+			vfrom = kmap(from->bv_page) + from->bv_offset;
+			memcpy(vto, vfrom, to->bv_len);
+			kunmap(from->bv_page);
+		}
+	}
+
+	/*
+	 * no pages bounced
+	 */
+	if (!bio)
+		return;
+
+	/*
+	 * at least one page was bounced, fill in possible non-highmem
+	 * pages
+	 */
+	__bio_for_each_segment(from, *bio_orig, i, 0) {
+		to = bio_iovec_idx(bio, i);
+		if (!to->bv_page) {
+			to->bv_page = from->bv_page;
+			to->bv_len = from->bv_len;
+			to->bv_offset = from->bv_offset;
+		}
+	}
+
+	bio->bi_bdev = (*bio_orig)->bi_bdev;
+	bio->bi_flags |= (1 << BIO_BOUNCED);
+	bio->bi_sector = (*bio_orig)->bi_sector;
+	bio->bi_rw = (*bio_orig)->bi_rw;
+
+	bio->bi_vcnt = (*bio_orig)->bi_vcnt;
+	bio->bi_idx = (*bio_orig)->bi_idx;
+	bio->bi_size = (*bio_orig)->bi_size;
+
+	if (pool == page_pool) {
+		bio->bi_end_io = bounce_end_io_write;
+		if (rw == READ)
+			bio->bi_end_io = bounce_end_io_read;
+	} else {
+		bio->bi_end_io = bounce_end_io_write_isa;
+		if (rw == READ)
+			bio->bi_end_io = bounce_end_io_read_isa;
+	}
+
+	bio->bi_private = *bio_orig;
+	*bio_orig = bio;
+}
+
+void blk_queue_bounce(request_queue_t *q, struct bio **bio_orig)
+{
+	mempool_t *pool;
+
+	/*
+	 * for non-isa bounce case, just check if the bounce pfn is equal
+	 * to or bigger than the highest pfn in the system -- in that case,
+	 * don't waste time iterating over bio segments
+	 */
+	if (!(q->bounce_gfp & GFP_DMA)) {
+		if (q->bounce_pfn >= blk_max_pfn)
+			return;
+		pool = page_pool;
+	} else {
+		BUG_ON(!isa_page_pool);
+		pool = isa_page_pool;
+	}
+
+	blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE);
+
+	/*
+	 * slow path
+	 */
+	__blk_queue_bounce(q, bio_orig, pool);
+}
+
+EXPORT_SYMBOL(blk_queue_bounce);
diff --git a/mm/highmem.c b/mm/highmem.c
index ee5519b176ee..0206e7e5018c 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -29,13 +29,6 @@
 #include <linux/blktrace_api.h>
 #include <asm/tlbflush.h>
 
-static mempool_t *page_pool, *isa_page_pool;
-
-static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data)
-{
-	return mempool_alloc_pages(gfp_mask | GFP_DMA, data);
-}
-
 /*
  * Virtual_count is not a pure "count".
  *  0 means that it is not mapped, and has not been mapped
@@ -217,282 +210,8 @@ void fastcall kunmap_high(struct page *page)
 }
 
 EXPORT_SYMBOL(kunmap_high);
-
-#define POOL_SIZE	64
-
-static __init int init_emergency_pool(void)
-{
-	struct sysinfo i;
-	si_meminfo(&i);
-	si_swapinfo(&i);
-        
-	if (!i.totalhigh)
-		return 0;
-
-	page_pool = mempool_create_page_pool(POOL_SIZE, 0);
-	BUG_ON(!page_pool);
-	printk("highmem bounce pool size: %d pages\n", POOL_SIZE);
-
-	return 0;
-}
-
-__initcall(init_emergency_pool);
-
-/*
- * highmem version, map in to vec
- */
-static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom)
-{
-	unsigned long flags;
-	unsigned char *vto;
-
-	local_irq_save(flags);
-	vto = kmap_atomic(to->bv_page, KM_BOUNCE_READ);
-	memcpy(vto + to->bv_offset, vfrom, to->bv_len);
-	kunmap_atomic(vto, KM_BOUNCE_READ);
-	local_irq_restore(flags);
-}
-
-#else /* CONFIG_HIGHMEM */
-
-#define bounce_copy_vec(to, vfrom)	\
-	memcpy(page_address((to)->bv_page) + (to)->bv_offset, vfrom, (to)->bv_len)
-
 #endif
 
-#define ISA_POOL_SIZE	16
-
-/*
- * gets called "every" time someone init's a queue with BLK_BOUNCE_ISA
- * as the max address, so check if the pool has already been created.
- */
-int init_emergency_isa_pool(void)
-{
-	if (isa_page_pool)
-		return 0;
-
-	isa_page_pool = mempool_create(ISA_POOL_SIZE, mempool_alloc_pages_isa,
-				       mempool_free_pages, (void *) 0);
-	BUG_ON(!isa_page_pool);
-
-	printk("isa bounce pool size: %d pages\n", ISA_POOL_SIZE);
-	return 0;
-}
-
-/*
- * Simple bounce buffer support for highmem pages. Depending on the
- * queue gfp mask set, *to may or may not be a highmem page. kmap it
- * always, it will do the Right Thing
- */
-static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
-{
-	unsigned char *vfrom;
-	struct bio_vec *tovec, *fromvec;
-	int i;
-
-	__bio_for_each_segment(tovec, to, i, 0) {
-		fromvec = from->bi_io_vec + i;
-
-		/*
-		 * not bounced
-		 */
-		if (tovec->bv_page == fromvec->bv_page)
-			continue;
-
-		/*
-		 * fromvec->bv_offset and fromvec->bv_len might have been
-		 * modified by the block layer, so use the original copy,
-		 * bounce_copy_vec already uses tovec->bv_len
-		 */
-		vfrom = page_address(fromvec->bv_page) + tovec->bv_offset;
-
-		flush_dcache_page(tovec->bv_page);
-		bounce_copy_vec(tovec, vfrom);
-	}
-}
-
-static void bounce_end_io(struct bio *bio, mempool_t *pool, int err)
-{
-	struct bio *bio_orig = bio->bi_private;
-	struct bio_vec *bvec, *org_vec;
-	int i;
-
-	if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags))
-		set_bit(BIO_EOPNOTSUPP, &bio_orig->bi_flags);
-
-	/*
-	 * free up bounce indirect pages used
-	 */
-	__bio_for_each_segment(bvec, bio, i, 0) {
-		org_vec = bio_orig->bi_io_vec + i;
-		if (bvec->bv_page == org_vec->bv_page)
-			continue;
-
-		dec_zone_page_state(bvec->bv_page, NR_BOUNCE);
-		mempool_free(bvec->bv_page, pool);
-	}
-
-	bio_endio(bio_orig, bio_orig->bi_size, err);
-	bio_put(bio);
-}
-
-static int bounce_end_io_write(struct bio *bio, unsigned int bytes_done, int err)
-{
-	if (bio->bi_size)
-		return 1;
-
-	bounce_end_io(bio, page_pool, err);
-	return 0;
-}
-
-static int bounce_end_io_write_isa(struct bio *bio, unsigned int bytes_done, int err)
-{
-	if (bio->bi_size)
-		return 1;
-
-	bounce_end_io(bio, isa_page_pool, err);
-	return 0;
-}
-
-static void __bounce_end_io_read(struct bio *bio, mempool_t *pool, int err)
-{
-	struct bio *bio_orig = bio->bi_private;
-
-	if (test_bit(BIO_UPTODATE, &bio->bi_flags))
-		copy_to_high_bio_irq(bio_orig, bio);
-
-	bounce_end_io(bio, pool, err);
-}
-
-static int bounce_end_io_read(struct bio *bio, unsigned int bytes_done, int err)
-{
-	if (bio->bi_size)
-		return 1;
-
-	__bounce_end_io_read(bio, page_pool, err);
-	return 0;
-}
-
-static int bounce_end_io_read_isa(struct bio *bio, unsigned int bytes_done, int err)
-{
-	if (bio->bi_size)
-		return 1;
-
-	__bounce_end_io_read(bio, isa_page_pool, err);
-	return 0;
-}
-
-static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig,
-			       mempool_t *pool)
-{
-	struct page *page;
-	struct bio *bio = NULL;
-	int i, rw = bio_data_dir(*bio_orig);
-	struct bio_vec *to, *from;
-
-	bio_for_each_segment(from, *bio_orig, i) {
-		page = from->bv_page;
-
-		/*
-		 * is destination page below bounce pfn?
-		 */
-		if (page_to_pfn(page) < q->bounce_pfn)
-			continue;
-
-		/*
-		 * irk, bounce it
-		 */
-		if (!bio)
-			bio = bio_alloc(GFP_NOIO, (*bio_orig)->bi_vcnt);
-
-		to = bio->bi_io_vec + i;
-
-		to->bv_page = mempool_alloc(pool, q->bounce_gfp);
-		to->bv_len = from->bv_len;
-		to->bv_offset = from->bv_offset;
-		inc_zone_page_state(to->bv_page, NR_BOUNCE);
-
-		if (rw == WRITE) {
-			char *vto, *vfrom;
-
-			flush_dcache_page(from->bv_page);
-			vto = page_address(to->bv_page) + to->bv_offset;
-			vfrom = kmap(from->bv_page) + from->bv_offset;
-			memcpy(vto, vfrom, to->bv_len);
-			kunmap(from->bv_page);
-		}
-	}
-
-	/*
-	 * no pages bounced
-	 */
-	if (!bio)
-		return;
-
-	/*
-	 * at least one page was bounced, fill in possible non-highmem
-	 * pages
-	 */
-	__bio_for_each_segment(from, *bio_orig, i, 0) {
-		to = bio_iovec_idx(bio, i);
-		if (!to->bv_page) {
-			to->bv_page = from->bv_page;
-			to->bv_len = from->bv_len;
-			to->bv_offset = from->bv_offset;
-		}
-	}
-
-	bio->bi_bdev = (*bio_orig)->bi_bdev;
-	bio->bi_flags |= (1 << BIO_BOUNCED);
-	bio->bi_sector = (*bio_orig)->bi_sector;
-	bio->bi_rw = (*bio_orig)->bi_rw;
-
-	bio->bi_vcnt = (*bio_orig)->bi_vcnt;
-	bio->bi_idx = (*bio_orig)->bi_idx;
-	bio->bi_size = (*bio_orig)->bi_size;
-
-	if (pool == page_pool) {
-		bio->bi_end_io = bounce_end_io_write;
-		if (rw == READ)
-			bio->bi_end_io = bounce_end_io_read;
-	} else {
-		bio->bi_end_io = bounce_end_io_write_isa;
-		if (rw == READ)
-			bio->bi_end_io = bounce_end_io_read_isa;
-	}
-
-	bio->bi_private = *bio_orig;
-	*bio_orig = bio;
-}
-
-void blk_queue_bounce(request_queue_t *q, struct bio **bio_orig)
-{
-	mempool_t *pool;
-
-	/*
-	 * for non-isa bounce case, just check if the bounce pfn is equal
-	 * to or bigger than the highest pfn in the system -- in that case,
-	 * don't waste time iterating over bio segments
-	 */
-	if (!(q->bounce_gfp & GFP_DMA)) {
-		if (q->bounce_pfn >= blk_max_pfn)
-			return;
-		pool = page_pool;
-	} else {
-		BUG_ON(!isa_page_pool);
-		pool = isa_page_pool;
-	}
-
-	blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE);
-
-	/*
-	 * slow path
-	 */
-	__blk_queue_bounce(q, bio_orig, pool);
-}
-
-EXPORT_SYMBOL(blk_queue_bounce);
-
 #if defined(HASHED_PAGE_VIRTUAL)
 
 #define PA_HASH_ORDER	7
-- 
cgit v1.2.3


From 811d736f9e8013966e1a5a930c0db09508bdbb15 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 29 Aug 2006 19:06:09 +0100
Subject: [PATCH] BLOCK: Dissociate generic_writepages() from mpage stuff [try
 #6]

Dissociate the generic_writepages() function from the mpage stuff, moving its
declaration to linux/mm.h and actually emitting a full implementation into
mm/page-writeback.c.

The implementation is a partial duplicate of mpage_writepages() with all BIO
references removed.

It is used by NFS to do writeback.

Signed-Off-By: David Howells <dhowells@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/block_dev.c            |   1 +
 fs/mpage.c                |   2 +
 include/linux/mpage.h     |   6 ---
 include/linux/writeback.h |   2 +
 mm/page-writeback.c       | 134 ++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 139 insertions(+), 6 deletions(-)

(limited to 'mm')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 20f7333ba372..335c38bb86eb 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -17,6 +17,7 @@
 #include <linux/module.h>
 #include <linux/blkpg.h>
 #include <linux/buffer_head.h>
+#include <linux/writeback.h>
 #include <linux/mpage.h>
 #include <linux/mount.h>
 #include <linux/uio.h>
diff --git a/fs/mpage.c b/fs/mpage.c
index 1e4598247d0b..692a3e578fc8 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -693,6 +693,8 @@ out:
  * the call was made get new I/O started against them.  If wbc->sync_mode is
  * WB_SYNC_ALL then we were called for data integrity and we must wait for
  * existing IO to complete.
+ *
+ * If you fix this you should check generic_writepages() also!
  */
 int
 mpage_writepages(struct address_space *mapping,
diff --git a/include/linux/mpage.h b/include/linux/mpage.h
index 3ca880463c47..517c098fde20 100644
--- a/include/linux/mpage.h
+++ b/include/linux/mpage.h
@@ -20,9 +20,3 @@ int mpage_writepages(struct address_space *mapping,
 		struct writeback_control *wbc, get_block_t get_block);
 int mpage_writepage(struct page *page, get_block_t *get_block,
 		struct writeback_control *wbc);
-
-static inline int
-generic_writepages(struct address_space *mapping, struct writeback_control *wbc)
-{
-	return mpage_writepages(mapping, wbc, NULL);
-}
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 9d4074ecd0cd..4f4d98addb44 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -111,6 +111,8 @@ balance_dirty_pages_ratelimited(struct address_space *mapping)
 }
 
 int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0);
+extern int generic_writepages(struct address_space *mapping,
+			      struct writeback_control *wbc);
 int do_writepages(struct address_space *mapping, struct writeback_control *wbc);
 int sync_page_range(struct inode *inode, struct address_space *mapping,
 			loff_t pos, loff_t count);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 9fdcc7903956..ecf27839c203 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -31,6 +31,7 @@
 #include <linux/cpu.h>
 #include <linux/syscalls.h>
 #include <linux/buffer_head.h>
+#include <linux/pagevec.h>
 
 /*
  * The maximum number of pages to writeout in a single bdflush/kupdate
@@ -551,6 +552,139 @@ void __init page_writeback_init(void)
 	register_cpu_notifier(&ratelimit_nb);
 }
 
+/**
+ * generic_writepages - walk the list of dirty pages of the given
+ *                      address space and writepage() all of them.
+ *
+ * @mapping: address space structure to write
+ * @wbc: subtract the number of written pages from *@wbc->nr_to_write
+ *
+ * This is a library function, which implements the writepages()
+ * address_space_operation.
+ *
+ * If a page is already under I/O, generic_writepages() skips it, even
+ * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
+ * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
+ * and msync() need to guarantee that all the data which was dirty at the time
+ * the call was made get new I/O started against them.  If wbc->sync_mode is
+ * WB_SYNC_ALL then we were called for data integrity and we must wait for
+ * existing IO to complete.
+ *
+ * Derived from mpage_writepages() - if you fix this you should check that
+ * also!
+ */
+int generic_writepages(struct address_space *mapping,
+		       struct writeback_control *wbc)
+{
+	struct backing_dev_info *bdi = mapping->backing_dev_info;
+	int ret = 0;
+	int done = 0;
+	int (*writepage)(struct page *page, struct writeback_control *wbc);
+	struct pagevec pvec;
+	int nr_pages;
+	pgoff_t index;
+	pgoff_t end;		/* Inclusive */
+	int scanned = 0;
+	int range_whole = 0;
+
+	if (wbc->nonblocking && bdi_write_congested(bdi)) {
+		wbc->encountered_congestion = 1;
+		return 0;
+	}
+
+	writepage = mapping->a_ops->writepage;
+
+	/* deal with chardevs and other special file */
+	if (!writepage)
+		return 0;
+
+	pagevec_init(&pvec, 0);
+	if (wbc->range_cyclic) {
+		index = mapping->writeback_index; /* Start from prev offset */
+		end = -1;
+	} else {
+		index = wbc->range_start >> PAGE_CACHE_SHIFT;
+		end = wbc->range_end >> PAGE_CACHE_SHIFT;
+		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+			range_whole = 1;
+		scanned = 1;
+	}
+retry:
+	while (!done && (index <= end) &&
+	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+					      PAGECACHE_TAG_DIRTY,
+					      min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
+		unsigned i;
+
+		scanned = 1;
+		for (i = 0; i < nr_pages; i++) {
+			struct page *page = pvec.pages[i];
+
+			/*
+			 * At this point we hold neither mapping->tree_lock nor
+			 * lock on the page itself: the page may be truncated or
+			 * invalidated (changing page->mapping to NULL), or even
+			 * swizzled back from swapper_space to tmpfs file
+			 * mapping
+			 */
+			lock_page(page);
+
+			if (unlikely(page->mapping != mapping)) {
+				unlock_page(page);
+				continue;
+			}
+
+			if (!wbc->range_cyclic && page->index > end) {
+				done = 1;
+				unlock_page(page);
+				continue;
+			}
+
+			if (wbc->sync_mode != WB_SYNC_NONE)
+				wait_on_page_writeback(page);
+
+			if (PageWriteback(page) ||
+			    !clear_page_dirty_for_io(page)) {
+				unlock_page(page);
+				continue;
+			}
+
+			ret = (*writepage)(page, wbc);
+			if (ret) {
+				if (ret == -ENOSPC)
+					set_bit(AS_ENOSPC, &mapping->flags);
+				else
+					set_bit(AS_EIO, &mapping->flags);
+			}
+
+			if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE))
+				unlock_page(page);
+			if (ret || (--(wbc->nr_to_write) <= 0))
+				done = 1;
+			if (wbc->nonblocking && bdi_write_congested(bdi)) {
+				wbc->encountered_congestion = 1;
+				done = 1;
+			}
+		}
+		pagevec_release(&pvec);
+		cond_resched();
+	}
+	if (!scanned && !done) {
+		/*
+		 * We hit the last page and there is more work to be done: wrap
+		 * back to the start of the file
+		 */
+		scanned = 1;
+		index = 0;
+		goto retry;
+	}
+	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
+		mapping->writeback_index = index;
+	return ret;
+}
+
+EXPORT_SYMBOL(generic_writepages);
+
 int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
 {
 	int ret;
-- 
cgit v1.2.3


From 9361401eb7619c033e2394e4f9f6d410d6719ac7 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Sat, 30 Sep 2006 20:45:40 +0200
Subject: [PATCH] BLOCK: Make it possible to disable the block layer [try #6]

Make it possible to disable the block layer.  Not all embedded devices require
it, some can make do with just JFFS2, NFS, ramfs, etc - none of which require
the block layer to be present.

This patch does the following:

 (*) Introduces CONFIG_BLOCK to disable the block layer, buffering and blockdev
     support.

 (*) Adds dependencies on CONFIG_BLOCK to any configuration item that controls
     an item that uses the block layer.  This includes:

     (*) Block I/O tracing.

     (*) Disk partition code.

     (*) All filesystems that are block based, eg: Ext3, ReiserFS, ISOFS.

     (*) The SCSI layer.  As far as I can tell, even SCSI chardevs use the
     	 block layer to do scheduling.  Some drivers that use SCSI facilities -
     	 such as USB storage - end up disabled indirectly from this.

     (*) Various block-based device drivers, such as IDE and the old CDROM
     	 drivers.

     (*) MTD blockdev handling and FTL.

     (*) JFFS - which uses set_bdev_super(), something it could avoid doing by
     	 taking a leaf out of JFFS2's book.

 (*) Makes most of the contents of linux/blkdev.h, linux/buffer_head.h and
     linux/elevator.h contingent on CONFIG_BLOCK being set.  sector_div() is,
     however, still used in places, and so is still available.

 (*) Also made contingent are the contents of linux/mpage.h, linux/genhd.h and
     parts of linux/fs.h.

 (*) Makes a number of files in fs/ contingent on CONFIG_BLOCK.

 (*) Makes mm/bounce.c (bounce buffering) contingent on CONFIG_BLOCK.

 (*) set_page_dirty() doesn't call __set_page_dirty_buffers() if CONFIG_BLOCK
     is not enabled.

 (*) fs/no-block.c is created to hold out-of-line stubs and things that are
     required when CONFIG_BLOCK is not set:

     (*) Default blockdev file operations (to give error ENODEV on opening).

 (*) Makes some /proc changes:

     (*) /proc/devices does not list any blockdevs.

     (*) /proc/diskstats and /proc/partitions are contingent on CONFIG_BLOCK.

 (*) Makes some compat ioctl handling contingent on CONFIG_BLOCK.

 (*) If CONFIG_BLOCK is not defined, makes sys_quotactl() return -ENODEV if
     given command other than Q_SYNC or if a special device is specified.

 (*) In init/do_mounts.c, no reference is made to the blockdev routines if
     CONFIG_BLOCK is not defined.  This does not prohibit NFS roots or JFFS2.

 (*) The bdflush, ioprio_set and ioprio_get syscalls can now be absent (return
     error ENOSYS by way of cond_syscall if so).

 (*) The seclvl_bd_claim() and seclvl_bd_release() security calls do nothing if
     CONFIG_BLOCK is not set, since they can't then happen.

Signed-Off-By: David Howells <dhowells@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/Kconfig                | 20 ++++++++++++++++++
 block/Kconfig.iosched        |  3 +++
 block/Makefile               |  2 +-
 drivers/block/Kconfig        |  4 ++++
 drivers/cdrom/Kconfig        |  2 +-
 drivers/char/Kconfig         |  1 +
 drivers/char/random.c        |  4 ++++
 drivers/ide/Kconfig          |  4 ++++
 drivers/md/Kconfig           |  3 +++
 drivers/message/i2o/Kconfig  |  2 +-
 drivers/mmc/Kconfig          |  2 +-
 drivers/mmc/Makefile         |  3 ++-
 drivers/mtd/Kconfig          | 12 +++++------
 drivers/mtd/devices/Kconfig  |  2 +-
 drivers/s390/block/Kconfig   |  2 +-
 drivers/scsi/Kconfig         |  2 ++
 fs/Kconfig                   | 31 ++++++++++++++++++++-------
 fs/Makefile                  | 14 +++++++++----
 fs/compat_ioctl.c            | 18 ++++++++++++++++
 fs/internal.h                |  6 ++++++
 fs/no-block.c                | 22 +++++++++++++++++++
 fs/partitions/Makefile       |  2 +-
 fs/proc/proc_misc.c          | 11 +++++++++-
 fs/quota.c                   | 44 ++++++++++++++++++++++++++------------
 fs/super.c                   |  4 ++++
 fs/xfs/Kconfig               |  1 +
 include/linux/blkdev.h       | 50 +++++++++++++++++++++++++++++++-------------
 include/linux/buffer_head.h  | 16 ++++++++++++++
 include/linux/compat_ioctl.h |  2 ++
 include/linux/elevator.h     |  3 +++
 include/linux/fs.h           | 25 +++++++++++++++++++---
 include/linux/genhd.h        |  4 ++++
 include/linux/mpage.h        |  3 +++
 include/linux/raid/md.h      |  3 +++
 include/linux/raid/md_k.h    |  3 +++
 include/scsi/scsi_tcq.h      |  3 ++-
 init/Kconfig                 |  2 +-
 init/do_mounts.c             | 13 +++++++++++-
 kernel/sys_ni.c              |  5 +++++
 mm/Makefile                  |  2 +-
 mm/filemap.c                 |  4 ++++
 mm/migrate.c                 |  2 ++
 mm/page-writeback.c          |  8 ++++---
 mm/truncate.c                |  2 ++
 44 files changed, 308 insertions(+), 63 deletions(-)
 create mode 100644 fs/no-block.c

(limited to 'mm')

diff --git a/block/Kconfig b/block/Kconfig
index b6f5f0a79655..9af6c614dfde 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -1,6 +1,24 @@
 #
 # Block layer core configuration
 #
+config BLOCK
+       bool "Enable the block layer"
+       default y
+       help
+	 This permits the block layer to be removed from the kernel if it's not
+	 needed (on some embedded devices for example).  If this option is
+	 disabled, then blockdev files will become unusable and some
+	 filesystems (such as ext3) will become unavailable.
+
+	 This option will also disable SCSI character devices and USB storage
+	 since they make use of various block layer definitions and
+	 facilities.
+
+	 Say Y here unless you know you really don't want to mount disks and
+	 suchlike.
+
+if BLOCK
+
 #XXX - it makes sense to enable this only for 32-bit subarch's, not for x86_64
 #for instance.
 config LBD
@@ -33,4 +51,6 @@ config LSF
 
 	  If unsure, say Y.
 
+endif
+
 source block/Kconfig.iosched
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index 48d090e266fc..903f0d3b6852 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -1,3 +1,4 @@
+if BLOCK
 
 menu "IO Schedulers"
 
@@ -67,3 +68,5 @@ config DEFAULT_IOSCHED
 	default "noop" if DEFAULT_NOOP
 
 endmenu
+
+endif
diff --git a/block/Makefile b/block/Makefile
index c05de0e0037f..4b84d0d5947b 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -2,7 +2,7 @@
 # Makefile for the kernel block layer
 #
 
-obj-y	:= elevator.o ll_rw_blk.o ioctl.o genhd.o scsi_ioctl.o
+obj-$(CONFIG_BLOCK) := elevator.o ll_rw_blk.o ioctl.o genhd.o scsi_ioctl.o
 
 obj-$(CONFIG_IOSCHED_NOOP)	+= noop-iosched.o
 obj-$(CONFIG_IOSCHED_AS)	+= as-iosched.o
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index b5382cedf0c0..422e31d5f8e5 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -2,6 +2,8 @@
 # Block device driver configuration
 #
 
+if BLOCK
+
 menu "Block devices"
 
 config BLK_DEV_FD
@@ -468,3 +470,5 @@ config ATA_OVER_ETH
 	devices like the Coraid EtherDrive (R) Storage Blade.
 
 endmenu
+
+endif
diff --git a/drivers/cdrom/Kconfig b/drivers/cdrom/Kconfig
index ff5652d40619..4b12e9031fb3 100644
--- a/drivers/cdrom/Kconfig
+++ b/drivers/cdrom/Kconfig
@@ -3,7 +3,7 @@
 #
 
 menu "Old CD-ROM drivers (not SCSI, not IDE)"
-	depends on ISA
+	depends on ISA && BLOCK
 
 config CD_NO_IDESCSI
 	bool "Support non-SCSI/IDE/ATAPI CDROM drives"
diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig
index 4cc619edf424..bde1c665d9f4 100644
--- a/drivers/char/Kconfig
+++ b/drivers/char/Kconfig
@@ -1006,6 +1006,7 @@ config GPIO_VR41XX
 
 config RAW_DRIVER
 	tristate "RAW driver (/dev/raw/rawN) (OBSOLETE)"
+	depends on BLOCK
 	help
 	  The raw driver permits block devices to be bound to /dev/raw/rawN. 
 	  Once bound, I/O against /dev/raw/rawN uses efficient zero-copy I/O. 
diff --git a/drivers/char/random.c b/drivers/char/random.c
index 4c3a5ca9d8f7..b430a12eb819 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -655,6 +655,7 @@ void add_interrupt_randomness(int irq)
 	add_timer_randomness(irq_timer_state[irq], 0x100 + irq);
 }
 
+#ifdef CONFIG_BLOCK
 void add_disk_randomness(struct gendisk *disk)
 {
 	if (!disk || !disk->random)
@@ -667,6 +668,7 @@ void add_disk_randomness(struct gendisk *disk)
 }
 
 EXPORT_SYMBOL(add_disk_randomness);
+#endif
 
 #define EXTRACT_SIZE 10
 
@@ -918,6 +920,7 @@ void rand_initialize_irq(int irq)
 	}
 }
 
+#ifdef CONFIG_BLOCK
 void rand_initialize_disk(struct gendisk *disk)
 {
 	struct timer_rand_state *state;
@@ -932,6 +935,7 @@ void rand_initialize_disk(struct gendisk *disk)
 		disk->random = state;
 	}
 }
+#endif
 
 static ssize_t
 random_read(struct file * file, char __user * buf, size_t nbytes, loff_t *ppos)
diff --git a/drivers/ide/Kconfig b/drivers/ide/Kconfig
index b6fb167e20f6..69d627bd537a 100644
--- a/drivers/ide/Kconfig
+++ b/drivers/ide/Kconfig
@@ -4,6 +4,8 @@
 # Andre Hedrick <andre@linux-ide.org>
 #
 
+if BLOCK
+
 menu "ATA/ATAPI/MFM/RLL support"
 
 config IDE
@@ -1082,3 +1084,5 @@ config BLK_DEV_HD
 endif
 
 endmenu
+
+endif
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index bf869ed03eed..6dd31a291d84 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -2,6 +2,8 @@
 # Block device driver configuration
 #
 
+if BLOCK
+
 menu "Multi-device support (RAID and LVM)"
 
 config MD
@@ -251,3 +253,4 @@ config DM_MULTIPATH_EMC
 
 endmenu
 
+endif
diff --git a/drivers/message/i2o/Kconfig b/drivers/message/i2o/Kconfig
index fef677103880..6443392bffff 100644
--- a/drivers/message/i2o/Kconfig
+++ b/drivers/message/i2o/Kconfig
@@ -88,7 +88,7 @@ config I2O_BUS
 
 config I2O_BLOCK
 	tristate "I2O Block OSM"
-	depends on I2O
+	depends on I2O && BLOCK
 	---help---
 	  Include support for the I2O Block OSM. The Block OSM presents disk
 	  and other structured block devices to the operating system. If you
diff --git a/drivers/mmc/Kconfig b/drivers/mmc/Kconfig
index 45bcf098e762..f540bd88dc5a 100644
--- a/drivers/mmc/Kconfig
+++ b/drivers/mmc/Kconfig
@@ -21,7 +21,7 @@ config MMC_DEBUG
 
 config MMC_BLOCK
 	tristate "MMC block device driver"
-	depends on MMC
+	depends on MMC && BLOCK
 	default y
 	help
 	  Say Y here to enable the MMC block device driver support.
diff --git a/drivers/mmc/Makefile b/drivers/mmc/Makefile
index d2957e35cc6f..b1f6e03e7aa9 100644
--- a/drivers/mmc/Makefile
+++ b/drivers/mmc/Makefile
@@ -24,7 +24,8 @@ obj-$(CONFIG_MMC_AU1X)		+= au1xmmc.o
 obj-$(CONFIG_MMC_OMAP)		+= omap.o
 obj-$(CONFIG_MMC_AT91RM9200)	+= at91_mci.o
 
-mmc_core-y := mmc.o mmc_queue.o mmc_sysfs.o
+mmc_core-y := mmc.o mmc_sysfs.o
+mmc_core-$(CONFIG_BLOCK) += mmc_queue.o
 
 ifeq ($(CONFIG_MMC_DEBUG),y)
 EXTRA_CFLAGS += -DDEBUG
diff --git a/drivers/mtd/Kconfig b/drivers/mtd/Kconfig
index a03e862851db..a304b34c2632 100644
--- a/drivers/mtd/Kconfig
+++ b/drivers/mtd/Kconfig
@@ -166,7 +166,7 @@ config MTD_CHAR
 
 config MTD_BLOCK
 	tristate "Caching block device access to MTD devices"
-	depends on MTD
+	depends on MTD && BLOCK
 	---help---
 	  Although most flash chips have an erase size too large to be useful
 	  as block devices, it is possible to use MTD devices which are based
@@ -188,7 +188,7 @@ config MTD_BLOCK
 
 config MTD_BLOCK_RO
 	tristate "Readonly block device access to MTD devices"
-	depends on MTD_BLOCK!=y && MTD
+	depends on MTD_BLOCK!=y && MTD && BLOCK
 	help
 	  This allows you to mount read-only file systems (such as cramfs)
 	  from an MTD device, without the overhead (and danger) of the caching
@@ -199,7 +199,7 @@ config MTD_BLOCK_RO
 
 config FTL
 	tristate "FTL (Flash Translation Layer) support"
-	depends on MTD
+	depends on MTD && BLOCK
 	---help---
 	  This provides support for the original Flash Translation Layer which
 	  is part of the PCMCIA specification. It uses a kind of pseudo-
@@ -215,7 +215,7 @@ config FTL
 
 config NFTL
 	tristate "NFTL (NAND Flash Translation Layer) support"
-	depends on MTD
+	depends on MTD && BLOCK
 	---help---
 	  This provides support for the NAND Flash Translation Layer which is
 	  used on M-Systems' DiskOnChip devices. It uses a kind of pseudo-
@@ -238,7 +238,7 @@ config NFTL_RW
 
 config INFTL
 	tristate "INFTL (Inverse NAND Flash Translation Layer) support"
-	depends on MTD
+	depends on MTD && BLOCK
 	---help---
 	  This provides support for the Inverse NAND Flash Translation
 	  Layer which is used on M-Systems' newer DiskOnChip devices. It
@@ -255,7 +255,7 @@ config INFTL
 
 config RFD_FTL
         tristate "Resident Flash Disk (Flash Translation Layer) support"
-	depends on MTD
+	depends on MTD && BLOCK
 	---help---
 	  This provides support for the flash translation layer known
 	  as the Resident Flash Disk (RFD), as used by the Embedded BIOS
diff --git a/drivers/mtd/devices/Kconfig b/drivers/mtd/devices/Kconfig
index 16c02b5ccf7e..440f6851da69 100644
--- a/drivers/mtd/devices/Kconfig
+++ b/drivers/mtd/devices/Kconfig
@@ -136,7 +136,7 @@ config MTDRAM_ABS_POS
 
 config MTD_BLOCK2MTD
 	tristate "MTD using block device"
-	depends on MTD
+	depends on MTD && BLOCK
 	help
 	  This driver allows a block device to appear as an MTD. It would
 	  generally be used in the following cases:
diff --git a/drivers/s390/block/Kconfig b/drivers/s390/block/Kconfig
index 929d6fff6152..b250c5354503 100644
--- a/drivers/s390/block/Kconfig
+++ b/drivers/s390/block/Kconfig
@@ -1,4 +1,4 @@
-if S390
+if S390 && BLOCK
 
 comment "S/390 block device drivers"
 	depends on S390
diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig
index c4dfcc91ddda..dab082002e6f 100644
--- a/drivers/scsi/Kconfig
+++ b/drivers/scsi/Kconfig
@@ -3,11 +3,13 @@ menu "SCSI device support"
 config RAID_ATTRS
 	tristate "RAID Transport Class"
 	default n
+	depends on BLOCK
 	---help---
 	  Provides RAID
 
 config SCSI
 	tristate "SCSI device support"
+	depends on BLOCK
 	---help---
 	  If you want to use a SCSI hard disk, SCSI tape drive, SCSI CD-ROM or
 	  any other SCSI device under Linux, say Y and make sure that you know
diff --git a/fs/Kconfig b/fs/Kconfig
index 4fd9efac29ab..1453d2d164f7 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -4,6 +4,8 @@
 
 menu "File systems"
 
+if BLOCK
+
 config EXT2_FS
 	tristate "Second extended fs support"
 	help
@@ -399,6 +401,8 @@ config ROMFS_FS
 	  If you don't know whether you need it, then you don't need it:
 	  answer N.
 
+endif
+
 config INOTIFY
 	bool "Inotify file change notification support"
 	default y
@@ -530,6 +534,7 @@ config FUSE_FS
 	  If you want to develop a userspace FS, or if you want to use
 	  a filesystem based on FUSE, answer Y or M.
 
+if BLOCK
 menu "CD-ROM/DVD Filesystems"
 
 config ISO9660_FS
@@ -597,7 +602,9 @@ config UDF_NLS
 	depends on (UDF_FS=m && NLS) || (UDF_FS=y && NLS=y)
 
 endmenu
+endif
 
+if BLOCK
 menu "DOS/FAT/NT Filesystems"
 
 config FAT_FS
@@ -782,6 +789,7 @@ config NTFS_RW
 	  It is perfectly safe to say N here.
 
 endmenu
+endif
 
 menu "Pseudo filesystems"
 
@@ -939,7 +947,7 @@ menu "Miscellaneous filesystems"
 
 config ADFS_FS
 	tristate "ADFS file system support (EXPERIMENTAL)"
-	depends on EXPERIMENTAL
+	depends on BLOCK && EXPERIMENTAL
 	help
 	  The Acorn Disc Filing System is the standard file system of the
 	  RiscOS operating system which runs on Acorn's ARM-based Risc PC
@@ -967,7 +975,7 @@ config ADFS_FS_RW
 
 config AFFS_FS
 	tristate "Amiga FFS file system support (EXPERIMENTAL)"
-	depends on EXPERIMENTAL
+	depends on BLOCK && EXPERIMENTAL
 	help
 	  The Fast File System (FFS) is the common file system used on hard
 	  disks by Amiga(tm) systems since AmigaOS Version 1.3 (34.20).  Say Y
@@ -989,7 +997,7 @@ config AFFS_FS
 
 config HFS_FS
 	tristate "Apple Macintosh file system support (EXPERIMENTAL)"
-	depends on EXPERIMENTAL
+	depends on BLOCK && EXPERIMENTAL
 	select NLS
 	help
 	  If you say Y here, you will be able to mount Macintosh-formatted
@@ -1002,6 +1010,7 @@ config HFS_FS
 
 config HFSPLUS_FS
 	tristate "Apple Extended HFS file system support"
+	depends on BLOCK
 	select NLS
 	select NLS_UTF8
 	help
@@ -1015,7 +1024,7 @@ config HFSPLUS_FS
 
 config BEFS_FS
 	tristate "BeOS file system (BeFS) support (read only) (EXPERIMENTAL)"
-	depends on EXPERIMENTAL
+	depends on BLOCK && EXPERIMENTAL
 	select NLS
 	help
 	  The BeOS File System (BeFS) is the native file system of Be, Inc's
@@ -1042,7 +1051,7 @@ config BEFS_DEBUG
 
 config BFS_FS
 	tristate "BFS file system support (EXPERIMENTAL)"
-	depends on EXPERIMENTAL
+	depends on BLOCK && EXPERIMENTAL
 	help
 	  Boot File System (BFS) is a file system used under SCO UnixWare to
 	  allow the bootloader access to the kernel image and other important
@@ -1064,7 +1073,7 @@ config BFS_FS
 
 config EFS_FS
 	tristate "EFS file system support (read only) (EXPERIMENTAL)"
-	depends on EXPERIMENTAL
+	depends on BLOCK && EXPERIMENTAL
 	help
 	  EFS is an older file system used for non-ISO9660 CD-ROMs and hard
 	  disk partitions by SGI's IRIX operating system (IRIX 6.0 and newer
@@ -1079,7 +1088,7 @@ config EFS_FS
 
 config JFFS_FS
 	tristate "Journalling Flash File System (JFFS) support"
-	depends on MTD
+	depends on MTD && BLOCK
 	help
 	  JFFS is the Journaling Flash File System developed by Axis
 	  Communications in Sweden, aimed at providing a crash/powerdown-safe
@@ -1264,6 +1273,7 @@ endchoice
 
 config CRAMFS
 	tristate "Compressed ROM file system support (cramfs)"
+	depends on BLOCK
 	select ZLIB_INFLATE
 	help
 	  Saying Y here includes support for CramFs (Compressed ROM File
@@ -1283,6 +1293,7 @@ config CRAMFS
 
 config VXFS_FS
 	tristate "FreeVxFS file system support (VERITAS VxFS(TM) compatible)"
+	depends on BLOCK
 	help
 	  FreeVxFS is a file system driver that support the VERITAS VxFS(TM)
 	  file system format.  VERITAS VxFS(TM) is the standard file system
@@ -1300,6 +1311,7 @@ config VXFS_FS
 
 config HPFS_FS
 	tristate "OS/2 HPFS file system support"
+	depends on BLOCK
 	help
 	  OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS
 	  is the file system used for organizing files on OS/2 hard disk
@@ -1316,6 +1328,7 @@ config HPFS_FS
 
 config QNX4FS_FS
 	tristate "QNX4 file system support (read only)"
+	depends on BLOCK
 	help
 	  This is the file system used by the real-time operating systems
 	  QNX 4 and QNX 6 (the latter is also called QNX RTP).
@@ -1343,6 +1356,7 @@ config QNX4FS_RW
 
 config SYSV_FS
 	tristate "System V/Xenix/V7/Coherent file system support"
+	depends on BLOCK
 	help
 	  SCO, Xenix and Coherent are commercial Unix systems for Intel
 	  machines, and Version 7 was used on the DEC PDP-11. Saying Y
@@ -1381,6 +1395,7 @@ config SYSV_FS
 
 config UFS_FS
 	tristate "UFS file system support (read only)"
+	depends on BLOCK
 	help
 	  BSD and derivate versions of Unix (such as SunOS, FreeBSD, NetBSD,
 	  OpenBSD and NeXTstep) use a file system called UFS. Some System V
@@ -1959,11 +1974,13 @@ config GENERIC_ACL
 
 endmenu
 
+if BLOCK
 menu "Partition Types"
 
 source "fs/partitions/Kconfig"
 
 endmenu
+endif
 
 source "fs/nls/Kconfig"
 
diff --git a/fs/Makefile b/fs/Makefile
index 46b8cfe497b2..a503e6ce0f32 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -5,12 +5,18 @@
 # Rewritten to use lists instead of if-statements.
 # 
 
-obj-y :=	open.o read_write.o file_table.o buffer.o  bio.o super.o \
-		block_dev.o char_dev.o stat.o exec.o pipe.o namei.o fcntl.o \
+obj-y :=	open.o read_write.o file_table.o super.o \
+		char_dev.o stat.o exec.o pipe.o namei.o fcntl.o \
 		ioctl.o readdir.o select.o fifo.o locks.o dcache.o inode.o \
 		attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \
-		seq_file.o xattr.o libfs.o fs-writeback.o mpage.o direct-io.o \
-		ioprio.o pnode.o drop_caches.o splice.o sync.o
+		seq_file.o xattr.o libfs.o fs-writeback.o \
+		pnode.o drop_caches.o splice.o sync.o
+
+ifeq ($(CONFIG_BLOCK),y)
+obj-y +=	buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o
+else
+obj-y +=	no-block.o
+endif
 
 obj-$(CONFIG_INOTIFY)		+= inotify.o
 obj-$(CONFIG_INOTIFY_USER)	+= inotify_user.o
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index e1a56437040a..64b34533edea 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -645,6 +645,7 @@ out:
 }
 #endif
 
+#ifdef CONFIG_BLOCK
 struct hd_geometry32 {
 	unsigned char heads;
 	unsigned char sectors;
@@ -869,6 +870,7 @@ static int sg_grt_trans(unsigned int fd, unsigned int cmd, unsigned long arg)
 	}
 	return err;
 }
+#endif /* CONFIG_BLOCK */
 
 struct sock_fprog32 {
 	unsigned short	len;
@@ -992,6 +994,7 @@ static int ppp_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg)
 }
 
 
+#ifdef CONFIG_BLOCK
 struct mtget32 {
 	compat_long_t	mt_type;
 	compat_long_t	mt_resid;
@@ -1164,6 +1167,7 @@ static int cdrom_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long ar
 
 	return err;
 }
+#endif /* CONFIG_BLOCK */
 
 #ifdef CONFIG_VT
 
@@ -1491,6 +1495,7 @@ ret_einval(unsigned int fd, unsigned int cmd, unsigned long arg)
 	return -EINVAL;
 }
 
+#ifdef CONFIG_BLOCK
 static int broken_blkgetsize(unsigned int fd, unsigned int cmd, unsigned long arg)
 {
 	/* The mkswap binary hard codes it to Intel value :-((( */
@@ -1525,12 +1530,14 @@ static int blkpg_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long ar
 
 	return sys_ioctl(fd, cmd, (unsigned long)a);
 }
+#endif
 
 static int ioc_settimeout(unsigned int fd, unsigned int cmd, unsigned long arg)
 {
 	return rw_long(fd, AUTOFS_IOC_SETTIMEOUT, arg);
 }
 
+#ifdef CONFIG_BLOCK
 /* Fix sizeof(sizeof()) breakage */
 #define BLKBSZGET_32   _IOR(0x12,112,int)
 #define BLKBSZSET_32   _IOW(0x12,113,int)
@@ -1551,6 +1558,7 @@ static int do_blkgetsize64(unsigned int fd, unsigned int cmd,
 {
        return sys_ioctl(fd, BLKGETSIZE64, (unsigned long)compat_ptr(arg));
 }
+#endif
 
 /* Bluetooth ioctls */
 #define HCIUARTSETPROTO	_IOW('U', 200, int)
@@ -1571,6 +1579,7 @@ static int do_blkgetsize64(unsigned int fd, unsigned int cmd,
 #define HIDPGETCONNLIST	_IOR('H', 210, int)
 #define HIDPGETCONNINFO	_IOR('H', 211, int)
 
+#ifdef CONFIG_BLOCK
 struct floppy_struct32 {
 	compat_uint_t	size;
 	compat_uint_t	sect;
@@ -1895,6 +1904,7 @@ out:
 	kfree(karg);
 	return err;
 }
+#endif
 
 struct mtd_oob_buf32 {
 	u_int32_t start;
@@ -1936,6 +1946,7 @@ static int mtd_rw_oob(unsigned int fd, unsigned int cmd, unsigned long arg)
 	return err;
 }	
 
+#ifdef CONFIG_BLOCK
 struct raw32_config_request
 {
         compat_int_t    raw_minor;
@@ -2000,6 +2011,7 @@ static int raw_ioctl(unsigned fd, unsigned cmd, unsigned long arg)
         }
         return ret;
 }
+#endif /* CONFIG_BLOCK */
 
 struct serial_struct32 {
         compat_int_t    type;
@@ -2606,6 +2618,7 @@ HANDLE_IOCTL(SIOCBRDELIF, dev_ifsioc)
 HANDLE_IOCTL(SIOCRTMSG, ret_einval)
 HANDLE_IOCTL(SIOCGSTAMP, do_siocgstamp)
 #endif
+#ifdef CONFIG_BLOCK
 HANDLE_IOCTL(HDIO_GETGEO, hdio_getgeo)
 HANDLE_IOCTL(BLKRAGET, w_long)
 HANDLE_IOCTL(BLKGETSIZE, w_long)
@@ -2631,14 +2644,17 @@ HANDLE_IOCTL(FDGETFDCSTAT32, fd_ioctl_trans)
 HANDLE_IOCTL(FDWERRORGET32, fd_ioctl_trans)
 HANDLE_IOCTL(SG_IO,sg_ioctl_trans)
 HANDLE_IOCTL(SG_GET_REQUEST_TABLE, sg_grt_trans)
+#endif
 HANDLE_IOCTL(PPPIOCGIDLE32, ppp_ioctl_trans)
 HANDLE_IOCTL(PPPIOCSCOMPRESS32, ppp_ioctl_trans)
 HANDLE_IOCTL(PPPIOCSPASS32, ppp_sock_fprog_ioctl_trans)
 HANDLE_IOCTL(PPPIOCSACTIVE32, ppp_sock_fprog_ioctl_trans)
+#ifdef CONFIG_BLOCK
 HANDLE_IOCTL(MTIOCGET32, mt_ioctl_trans)
 HANDLE_IOCTL(MTIOCPOS32, mt_ioctl_trans)
 HANDLE_IOCTL(CDROMREADAUDIO, cdrom_ioctl_trans)
 HANDLE_IOCTL(CDROM_SEND_PACKET, cdrom_ioctl_trans)
+#endif
 #define AUTOFS_IOC_SETTIMEOUT32 _IOWR(0x93,0x64,unsigned int)
 HANDLE_IOCTL(AUTOFS_IOC_SETTIMEOUT32, ioc_settimeout)
 #ifdef CONFIG_VT
@@ -2677,12 +2693,14 @@ HANDLE_IOCTL(SONET_SETFRAMING, do_atm_ioctl)
 HANDLE_IOCTL(SONET_GETFRAMING, do_atm_ioctl)
 HANDLE_IOCTL(SONET_GETFRSENSE, do_atm_ioctl)
 /* block stuff */
+#ifdef CONFIG_BLOCK
 HANDLE_IOCTL(BLKBSZGET_32, do_blkbszget)
 HANDLE_IOCTL(BLKBSZSET_32, do_blkbszset)
 HANDLE_IOCTL(BLKGETSIZE64_32, do_blkgetsize64)
 /* Raw devices */
 HANDLE_IOCTL(RAW_SETBIND, raw_ioctl)
 HANDLE_IOCTL(RAW_GETBIND, raw_ioctl)
+#endif
 /* Serial */
 HANDLE_IOCTL(TIOCGSERIAL, serial_struct_ioctl)
 HANDLE_IOCTL(TIOCSSERIAL, serial_struct_ioctl)
diff --git a/fs/internal.h b/fs/internal.h
index f662b703bb97..f07147d63255 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -14,10 +14,16 @@
 /*
  * block_dev.c
  */
+#ifdef CONFIG_BLOCK
 extern struct super_block *blockdev_superblock;
 extern void __init bdev_cache_init(void);
 
 #define sb_is_blkdev_sb(sb) ((sb) == blockdev_superblock)
+#else
+static inline void bdev_cache_init(void) {}
+
+#define sb_is_blkdev_sb(sb) 0
+#endif
 
 /*
  * char_dev.c
diff --git a/fs/no-block.c b/fs/no-block.c
new file mode 100644
index 000000000000..d269a93d3467
--- /dev/null
+++ b/fs/no-block.c
@@ -0,0 +1,22 @@
+/* no-block.c: implementation of routines required for non-BLOCK configuration
+ *
+ * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/fs.h>
+
+static int no_blkdev_open(struct inode * inode, struct file * filp)
+{
+	return -ENODEV;
+}
+
+const struct file_operations def_blk_fops = {
+	.open		= no_blkdev_open,
+};
diff --git a/fs/partitions/Makefile b/fs/partitions/Makefile
index d713ce6b3e12..67e665fdb7fc 100644
--- a/fs/partitions/Makefile
+++ b/fs/partitions/Makefile
@@ -2,7 +2,7 @@
 # Makefile for the linux kernel.
 #
 
-obj-y := check.o
+obj-$(CONFIG_BLOCK) := check.o
 
 obj-$(CONFIG_ACORN_PARTITION) += acorn.o
 obj-$(CONFIG_AMIGA_PARTITION) += amiga.o
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 5bbd60896050..66bc425f2f3d 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -277,12 +277,15 @@ static int devinfo_show(struct seq_file *f, void *v)
 		if (i == 0)
 			seq_printf(f, "Character devices:\n");
 		chrdev_show(f, i);
-	} else {
+	}
+#ifdef CONFIG_BLOCK
+	else {
 		i -= CHRDEV_MAJOR_HASH_SIZE;
 		if (i == 0)
 			seq_printf(f, "\nBlock devices:\n");
 		blkdev_show(f, i);
 	}
+#endif
 	return 0;
 }
 
@@ -355,6 +358,7 @@ static int stram_read_proc(char *page, char **start, off_t off,
 }
 #endif
 
+#ifdef CONFIG_BLOCK
 extern struct seq_operations partitions_op;
 static int partitions_open(struct inode *inode, struct file *file)
 {
@@ -378,6 +382,7 @@ static struct file_operations proc_diskstats_operations = {
 	.llseek		= seq_lseek,
 	.release	= seq_release,
 };
+#endif
 
 #ifdef CONFIG_MODULES
 extern struct seq_operations modules_op;
@@ -695,7 +700,9 @@ void __init proc_misc_init(void)
 		entry->proc_fops = &proc_kmsg_operations;
 	create_seq_entry("devices", 0, &proc_devinfo_operations);
 	create_seq_entry("cpuinfo", 0, &proc_cpuinfo_operations);
+#ifdef CONFIG_BLOCK
 	create_seq_entry("partitions", 0, &proc_partitions_operations);
+#endif
 	create_seq_entry("stat", 0, &proc_stat_operations);
 	create_seq_entry("interrupts", 0, &proc_interrupts_operations);
 #ifdef CONFIG_SLAB
@@ -707,7 +714,9 @@ void __init proc_misc_init(void)
 	create_seq_entry("buddyinfo",S_IRUGO, &fragmentation_file_operations);
 	create_seq_entry("vmstat",S_IRUGO, &proc_vmstat_file_operations);
 	create_seq_entry("zoneinfo",S_IRUGO, &proc_zoneinfo_file_operations);
+#ifdef CONFIG_BLOCK
 	create_seq_entry("diskstats", 0, &proc_diskstats_operations);
+#endif
 #ifdef CONFIG_MODULES
 	create_seq_entry("modules", 0, &proc_modules_operations);
 #endif
diff --git a/fs/quota.c b/fs/quota.c
index d6a2be826e29..b9dae76a0b6e 100644
--- a/fs/quota.c
+++ b/fs/quota.c
@@ -337,6 +337,34 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, void
 	return 0;
 }
 
+/*
+ * look up a superblock on which quota ops will be performed
+ * - use the name of a block device to find the superblock thereon
+ */
+static inline struct super_block *quotactl_block(const char __user *special)
+{
+#ifdef CONFIG_BLOCK
+	struct block_device *bdev;
+	struct super_block *sb;
+	char *tmp = getname(special);
+
+	if (IS_ERR(tmp))
+		return ERR_PTR(PTR_ERR(tmp));
+	bdev = lookup_bdev(tmp);
+	putname(tmp);
+	if (IS_ERR(bdev))
+		return ERR_PTR(PTR_ERR(bdev));
+	sb = get_super(bdev);
+	bdput(bdev);
+	if (!sb)
+		return ERR_PTR(-ENODEV);
+
+	return sb;
+#else
+	return ERR_PTR(-ENODEV);
+#endif
+}
+
 /*
  * This is the system call interface. This communicates with
  * the user-level programs. Currently this only supports diskquota
@@ -347,25 +375,15 @@ asmlinkage long sys_quotactl(unsigned int cmd, const char __user *special, qid_t
 {
 	uint cmds, type;
 	struct super_block *sb = NULL;
-	struct block_device *bdev;
-	char *tmp;
 	int ret;
 
 	cmds = cmd >> SUBCMDSHIFT;
 	type = cmd & SUBCMDMASK;
 
 	if (cmds != Q_SYNC || special) {
-		tmp = getname(special);
-		if (IS_ERR(tmp))
-			return PTR_ERR(tmp);
-		bdev = lookup_bdev(tmp);
-		putname(tmp);
-		if (IS_ERR(bdev))
-			return PTR_ERR(bdev);
-		sb = get_super(bdev);
-		bdput(bdev);
-		if (!sb)
-			return -ENODEV;
+		sb = quotactl_block(special);
+		if (IS_ERR(sb))
+			return PTR_ERR(sb);
 	}
 
 	ret = check_quotactl_valid(sb, type, cmds, id);
diff --git a/fs/super.c b/fs/super.c
index 15671cd048b1..aec99ddbe53f 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -571,8 +571,10 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 {
 	int retval;
 	
+#ifdef CONFIG_BLOCK
 	if (!(flags & MS_RDONLY) && bdev_read_only(sb->s_bdev))
 		return -EACCES;
+#endif
 	if (flags & MS_RDONLY)
 		acct_auto_close(sb);
 	shrink_dcache_sb(sb);
@@ -692,6 +694,7 @@ void kill_litter_super(struct super_block *sb)
 
 EXPORT_SYMBOL(kill_litter_super);
 
+#ifdef CONFIG_BLOCK
 static int set_bdev_super(struct super_block *s, void *data)
 {
 	s->s_bdev = data;
@@ -787,6 +790,7 @@ void kill_block_super(struct super_block *sb)
 }
 
 EXPORT_SYMBOL(kill_block_super);
+#endif
 
 int get_sb_nodev(struct file_system_type *fs_type,
 	int flags, void *data,
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 26b364c9d62c..35115bca036e 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -1,5 +1,6 @@
 config XFS_FS
 	tristate "XFS filesystem support"
+	depends on BLOCK
 	help
 	  XFS is a high performance journaling filesystem which originated
 	  on the SGI IRIX platform.  It is completely multi-threaded, can
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 2c01a90998a7..3e36107d342a 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -16,6 +16,22 @@
 
 #include <asm/scatterlist.h>
 
+#ifdef CONFIG_LBD
+# include <asm/div64.h>
+# define sector_div(a, b) do_div(a, b)
+#else
+# define sector_div(n, b)( \
+{ \
+	int _res; \
+	_res = (n) % (b); \
+	(n) /= (b); \
+	_res; \
+} \
+)
+#endif
+
+#ifdef CONFIG_BLOCK
+
 struct scsi_ioctl_command;
 
 struct request_queue;
@@ -818,24 +834,30 @@ struct work_struct;
 int kblockd_schedule_work(struct work_struct *work);
 void kblockd_flush(void);
 
-#ifdef CONFIG_LBD
-# include <asm/div64.h>
-# define sector_div(a, b) do_div(a, b)
-#else
-# define sector_div(n, b)( \
-{ \
-	int _res; \
-	_res = (n) % (b); \
-	(n) /= (b); \
-	_res; \
-} \
-)
-#endif 
-
 #define MODULE_ALIAS_BLOCKDEV(major,minor) \
 	MODULE_ALIAS("block-major-" __stringify(major) "-" __stringify(minor))
 #define MODULE_ALIAS_BLOCKDEV_MAJOR(major) \
 	MODULE_ALIAS("block-major-" __stringify(major) "-*")
 
 
+#else /* CONFIG_BLOCK */
+/*
+ * stubs for when the block layer is configured out
+ */
+#define buffer_heads_over_limit 0
+
+static inline long blk_congestion_wait(int rw, long timeout)
+{
+	return timeout;
+}
+
+static inline long nr_blockdev_pages(void)
+{
+	return 0;
+}
+
+static inline void exit_io_context(void) {}
+
+#endif /* CONFIG_BLOCK */
+
 #endif
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 64b508e35d2a..131ffd37e716 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -14,6 +14,8 @@
 #include <linux/wait.h>
 #include <asm/atomic.h>
 
+#ifdef CONFIG_BLOCK
+
 enum bh_state_bits {
 	BH_Uptodate,	/* Contains valid data */
 	BH_Dirty,	/* Is dirty */
@@ -301,4 +303,18 @@ static inline void lock_buffer(struct buffer_head *bh)
 }
 
 extern int __set_page_dirty_buffers(struct page *page);
+
+#else /* CONFIG_BLOCK */
+
+static inline void buffer_init(void) {}
+static inline int try_to_free_buffers(struct page *page) { return 1; }
+static inline int sync_blockdev(struct block_device *bdev) { return 0; }
+static inline int inode_has_buffers(struct inode *inode) { return 0; }
+static inline void invalidate_inode_buffers(struct inode *inode) {}
+static inline int remove_inode_buffers(struct inode *inode) { return 1; }
+static inline int sync_mapping_buffers(struct address_space *mapping) { return 0; }
+static inline void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers) {}
+
+
+#endif /* CONFIG_BLOCK */
 #endif /* _LINUX_BUFFER_HEAD_H */
diff --git a/include/linux/compat_ioctl.h b/include/linux/compat_ioctl.h
index 98d40e08ba6e..d61ef5951538 100644
--- a/include/linux/compat_ioctl.h
+++ b/include/linux/compat_ioctl.h
@@ -90,6 +90,7 @@ COMPATIBLE_IOCTL(FDTWADDLE)
 COMPATIBLE_IOCTL(FDFMTTRK)
 COMPATIBLE_IOCTL(FDRAWCMD)
 /* 0x12 */
+#ifdef CONFIG_BLOCK
 COMPATIBLE_IOCTL(BLKRASET)
 COMPATIBLE_IOCTL(BLKROSET)
 COMPATIBLE_IOCTL(BLKROGET)
@@ -103,6 +104,7 @@ COMPATIBLE_IOCTL(BLKTRACESETUP)
 COMPATIBLE_IOCTL(BLKTRACETEARDOWN)
 ULONG_IOCTL(BLKRASET)
 ULONG_IOCTL(BLKFRASET)
+#endif
 /* RAID */
 COMPATIBLE_IOCTL(RAID_VERSION)
 COMPATIBLE_IOCTL(GET_ARRAY_INFO)
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 9c5a04f6114c..b3370ef5164d 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -3,6 +3,8 @@
 
 #include <linux/percpu.h>
 
+#ifdef CONFIG_BLOCK
+
 typedef int (elevator_merge_fn) (request_queue_t *, struct request **,
 				 struct bio *);
 
@@ -203,4 +205,5 @@ enum {
 	__val;							\
 })
 
+#endif /* CONFIG_BLOCK */
 #endif
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b73a47582dbe..5baf3a153403 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1482,6 +1482,7 @@ extern void __init vfs_caches_init(unsigned long);
 extern void putname(const char *name);
 #endif
 
+#ifdef CONFIG_BLOCK
 extern int register_blkdev(unsigned int, const char *);
 extern int unregister_blkdev(unsigned int, const char *);
 extern struct block_device *bdget(dev_t);
@@ -1490,11 +1491,15 @@ extern void bd_forget(struct inode *inode);
 extern void bdput(struct block_device *);
 extern struct block_device *open_by_devnum(dev_t, unsigned);
 extern struct block_device *open_partition_by_devnum(dev_t, unsigned);
-extern const struct file_operations def_blk_fops;
 extern const struct address_space_operations def_blk_aops;
+#else
+static inline void bd_forget(struct inode *inode) {}
+#endif
+extern const struct file_operations def_blk_fops;
 extern const struct file_operations def_chr_fops;
 extern const struct file_operations bad_sock_fops;
 extern const struct file_operations def_fifo_fops;
+#ifdef CONFIG_BLOCK
 extern int ioctl_by_bdev(struct block_device *, unsigned, unsigned long);
 extern int blkdev_ioctl(struct inode *, struct file *, unsigned, unsigned long);
 extern long compat_blkdev_ioctl(struct file *, unsigned, unsigned long);
@@ -1510,6 +1515,7 @@ extern void bd_release_from_disk(struct block_device *, struct gendisk *);
 #define bd_claim_by_disk(bdev, holder, disk)	bd_claim(bdev, holder)
 #define bd_release_from_disk(bdev, disk)	bd_release(bdev)
 #endif
+#endif
 
 /* fs/char_dev.c */
 #define CHRDEV_MAJOR_HASH_SIZE	255
@@ -1523,14 +1529,19 @@ extern int chrdev_open(struct inode *, struct file *);
 extern void chrdev_show(struct seq_file *,off_t);
 
 /* fs/block_dev.c */
-#define BLKDEV_MAJOR_HASH_SIZE	255
 #define BDEVNAME_SIZE	32	/* Largest string for a blockdev identifier */
+
+#ifdef CONFIG_BLOCK
+#define BLKDEV_MAJOR_HASH_SIZE	255
 extern const char *__bdevname(dev_t, char *buffer);
 extern const char *bdevname(struct block_device *bdev, char *buffer);
 extern struct block_device *lookup_bdev(const char *);
 extern struct block_device *open_bdev_excl(const char *, int, void *);
 extern void close_bdev_excl(struct block_device *);
 extern void blkdev_show(struct seq_file *,off_t);
+#else
+#define BLKDEV_MAJOR_HASH_SIZE	0
+#endif
 
 extern void init_special_inode(struct inode *, umode_t, dev_t);
 
@@ -1544,6 +1555,7 @@ extern const struct file_operations rdwr_fifo_fops;
 
 extern int fs_may_remount_ro(struct super_block *);
 
+#ifdef CONFIG_BLOCK
 /*
  * return READ, READA, or WRITE
  */
@@ -1555,9 +1567,10 @@ extern int fs_may_remount_ro(struct super_block *);
 #define bio_data_dir(bio)	((bio)->bi_rw & 1)
 
 extern int check_disk_change(struct block_device *);
-extern int invalidate_inodes(struct super_block *);
 extern int __invalidate_device(struct block_device *);
 extern int invalidate_partition(struct gendisk *, int);
+#endif
+extern int invalidate_inodes(struct super_block *);
 unsigned long invalidate_mapping_pages(struct address_space *mapping,
 					pgoff_t start, pgoff_t end);
 unsigned long invalidate_inode_pages(struct address_space *mapping);
@@ -1590,7 +1603,9 @@ extern void emergency_sync(void);
 extern void emergency_remount(void);
 extern int do_remount_sb(struct super_block *sb, int flags,
 			 void *data, int force);
+#ifdef CONFIG_BLOCK
 extern sector_t bmap(struct inode *, sector_t);
+#endif
 extern int notify_change(struct dentry *, struct iattr *);
 extern int permission(struct inode *, int, struct nameidata *);
 extern int generic_permission(struct inode *, int,
@@ -1673,9 +1688,11 @@ static inline void insert_inode_hash(struct inode *inode) {
 extern struct file * get_empty_filp(void);
 extern void file_move(struct file *f, struct list_head *list);
 extern void file_kill(struct file *f);
+#ifdef CONFIG_BLOCK
 struct bio;
 extern void submit_bio(int, struct bio *);
 extern int bdev_read_only(struct block_device *);
+#endif
 extern int set_blocksize(struct block_device *, int);
 extern int sb_set_blocksize(struct super_block *, int);
 extern int sb_min_blocksize(struct super_block *, int);
@@ -1756,6 +1773,7 @@ static inline void do_generic_file_read(struct file * filp, loff_t *ppos,
 				actor);
 }
 
+#ifdef CONFIG_BLOCK
 ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	struct block_device *bdev, const struct iovec *iov, loff_t offset,
 	unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
@@ -1793,6 +1811,7 @@ static inline ssize_t blockdev_direct_IO_own_locking(int rw, struct kiocb *iocb,
 	return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
 				nr_segs, get_block, end_io, DIO_OWN_LOCKING);
 }
+#endif
 
 extern const struct file_operations generic_ro_fops;
 
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index e4af57e87c17..41f276fdd185 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -11,6 +11,8 @@
 
 #include <linux/types.h>
 
+#ifdef CONFIG_BLOCK
+
 enum {
 /* These three have identical behaviour; use the second one if DOS FDISK gets
    confused about extended/logical partitions starting past cylinder 1023. */
@@ -420,3 +422,5 @@ static inline struct block_device *bdget_disk(struct gendisk *disk, int index)
 #endif
 
 #endif
+
+#endif
diff --git a/include/linux/mpage.h b/include/linux/mpage.h
index 517c098fde20..cc5fb75af78a 100644
--- a/include/linux/mpage.h
+++ b/include/linux/mpage.h
@@ -9,6 +9,7 @@
  * (And no, it doesn't do the #ifdef __MPAGE_H thing, and it doesn't do
  * nested includes.  Get it right in the .c file).
  */
+#ifdef CONFIG_BLOCK
 
 struct writeback_control;
 typedef int (writepage_t)(struct page *page, struct writeback_control *wbc);
@@ -20,3 +21,5 @@ int mpage_writepages(struct address_space *mapping,
 		struct writeback_control *wbc, get_block_t get_block);
 int mpage_writepage(struct page *page, get_block_t *get_block,
 		struct writeback_control *wbc);
+
+#endif
diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h
index eb3e547c8fee..c588709acbbc 100644
--- a/include/linux/raid/md.h
+++ b/include/linux/raid/md.h
@@ -53,6 +53,8 @@
 #include <linux/raid/md_u.h>
 #include <linux/raid/md_k.h>
 
+#ifdef CONFIG_MD
+
 /*
  * Different major versions are not compatible.
  * Different minor versions are only downward compatible.
@@ -95,5 +97,6 @@ extern void md_new_event(mddev_t *mddev);
 
 extern void md_update_sb(mddev_t * mddev);
 
+#endif /* CONFIG_MD */
 #endif 
 
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index d28890295852..920b94fe31fa 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -18,6 +18,8 @@
 /* and dm-bio-list.h is not under include/linux because.... ??? */
 #include "../../../drivers/md/dm-bio-list.h"
 
+#ifdef CONFIG_BLOCK
+
 #define	LEVEL_MULTIPATH		(-4)
 #define	LEVEL_LINEAR		(-1)
 #define	LEVEL_FAULTY		(-5)
@@ -362,5 +364,6 @@ static inline void safe_put_page(struct page *p)
 	if (p) put_page(p);
 }
 
+#endif /* CONFIG_BLOCK */
 #endif
 
diff --git a/include/scsi/scsi_tcq.h b/include/scsi/scsi_tcq.h
index bbf66219b769..c247a28259bc 100644
--- a/include/scsi/scsi_tcq.h
+++ b/include/scsi/scsi_tcq.h
@@ -6,7 +6,6 @@
 #include <scsi/scsi_device.h>
 #include <scsi/scsi_host.h>
 
-
 #define MSG_SIMPLE_TAG	0x20
 #define MSG_HEAD_TAG	0x21
 #define MSG_ORDERED_TAG	0x22
@@ -14,6 +13,7 @@
 #define SCSI_NO_TAG	(-1)    /* identify no tag in use */
 
 
+#ifdef CONFIG_BLOCK
 
 /**
  * scsi_get_tag_type - get the type of tag the device supports
@@ -144,4 +144,5 @@ static inline int scsi_init_shared_tag_map(struct Scsi_Host *shost, int depth)
 	return shost->bqt ? 0 : -ENOMEM;
 }
 
+#endif /* CONFIG_BLOCK */
 #endif /* _SCSI_SCSI_TCQ_H */
diff --git a/init/Kconfig b/init/Kconfig
index 4381006dd666..d2eb7a84a264 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -92,7 +92,7 @@ config LOCALVERSION_AUTO
 
 config SWAP
 	bool "Support for paging of anonymous memory (swap)"
-	depends on MMU
+	depends on MMU && BLOCK
 	default y
 	help
 	  This option allows you to choose whether you want to have support
diff --git a/init/do_mounts.c b/init/do_mounts.c
index b290aadb1d3f..dc1ec0803ef9 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -285,7 +285,11 @@ void __init mount_block_root(char *name, int flags)
 {
 	char *fs_names = __getname();
 	char *p;
+#ifdef CONFIG_BLOCK
 	char b[BDEVNAME_SIZE];
+#else
+	const char *b = name;
+#endif
 
 	get_fs_names(fs_names);
 retry:
@@ -304,7 +308,9 @@ retry:
 		 * Allow the user to distinguish between failed sys_open
 		 * and bad superblock on root device.
 		 */
+#ifdef CONFIG_BLOCK
 		__bdevname(ROOT_DEV, b);
+#endif
 		printk("VFS: Cannot open root device \"%s\" or %s\n",
 				root_device_name, b);
 		printk("Please append a correct \"root=\" boot option\n");
@@ -316,7 +322,10 @@ retry:
 	for (p = fs_names; *p; p += strlen(p)+1)
 		printk(" %s", p);
 	printk("\n");
-	panic("VFS: Unable to mount root fs on %s", __bdevname(ROOT_DEV, b));
+#ifdef CONFIG_BLOCK
+	__bdevname(ROOT_DEV, b);
+#endif
+	panic("VFS: Unable to mount root fs on %s", b);
 out:
 	putname(fs_names);
 }
@@ -387,8 +396,10 @@ void __init mount_root(void)
 			change_floppy("root floppy");
 	}
 #endif
+#ifdef CONFIG_BLOCK
 	create_dev("/dev/root", ROOT_DEV);
 	mount_block_root("/dev/root", root_mountflags);
+#endif
 }
 
 /*
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 6991bece67e8..7a3b2e75f040 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -134,3 +134,8 @@ cond_syscall(sys_madvise);
 cond_syscall(sys_mremap);
 cond_syscall(sys_remap_file_pages);
 cond_syscall(compat_sys_move_pages);
+
+/* block-layer dependent */
+cond_syscall(sys_bdflush);
+cond_syscall(sys_ioprio_set);
+cond_syscall(sys_ioprio_get);
diff --git a/mm/Makefile b/mm/Makefile
index 4f2166a833b9..12b3a4eee88d 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -12,7 +12,7 @@ obj-y			:= bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
 			   readahead.o swap.o truncate.o vmscan.o \
 			   prio_tree.o util.o mmzone.o vmstat.o $(mmu-y)
 
-ifeq ($(CONFIG_MMU),y)
+ifeq ($(CONFIG_MMU)$(CONFIG_BLOCK),yy)
 obj-y			+= bounce.o
 endif
 obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o thrash.o
diff --git a/mm/filemap.c b/mm/filemap.c
index d6846de08887..c4fe97f5ace0 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2020,6 +2020,7 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i
 		if (unlikely(*pos + *count > inode->i_sb->s_maxbytes))
 			*count = inode->i_sb->s_maxbytes - *pos;
 	} else {
+#ifdef CONFIG_BLOCK
 		loff_t isize;
 		if (bdev_read_only(I_BDEV(inode)))
 			return -EPERM;
@@ -2031,6 +2032,9 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i
 
 		if (*pos + *count > isize)
 			*count = isize - *pos;
+#else
+		return -EPERM;
+#endif
 	}
 	return 0;
 }
diff --git a/mm/migrate.c b/mm/migrate.c
index 7f50e3ff54cd..ba2453f9483d 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -409,6 +409,7 @@ int migrate_page(struct address_space *mapping,
 }
 EXPORT_SYMBOL(migrate_page);
 
+#ifdef CONFIG_BLOCK
 /*
  * Migration function for pages with buffers. This function can only be used
  * if the underlying filesystem guarantees that no other references to "page"
@@ -466,6 +467,7 @@ int buffer_migrate_page(struct address_space *mapping,
 	return 0;
 }
 EXPORT_SYMBOL(buffer_migrate_page);
+#endif
 
 /*
  * Writeback a page to clean the dirty state
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index ecf27839c203..c0d4ce144dec 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -807,9 +807,11 @@ int fastcall set_page_dirty(struct page *page)
 
 	if (likely(mapping)) {
 		int (*spd)(struct page *) = mapping->a_ops->set_page_dirty;
-		if (spd)
-			return (*spd)(page);
-		return __set_page_dirty_buffers(page);
+#ifdef CONFIG_BLOCK
+		if (!spd)
+			spd = __set_page_dirty_buffers;
+#endif
+		return (*spd)(page);
 	}
 	if (!PageDirty(page)) {
 		if (!TestSetPageDirty(page))
diff --git a/mm/truncate.c b/mm/truncate.c
index cd3e34b816db..8fde6580657e 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -35,8 +35,10 @@ void do_invalidatepage(struct page *page, unsigned long offset)
 {
 	void (*invalidatepage)(struct page *, unsigned long);
 	invalidatepage = page->mapping->a_ops->invalidatepage;
+#ifdef CONFIG_BLOCK
 	if (!invalidatepage)
 		invalidatepage = block_invalidatepage;
+#endif
 	if (invalidatepage)
 		(*invalidatepage)(page, offset);
 }
-- 
cgit v1.2.3


From 3fcd03e07008ec0f667dfb7626171165699ea5c2 Mon Sep 17 00:00:00 2001
From: Gavin Lambert <gavinl@compacsort.com>
Date: Sat, 30 Sep 2006 23:27:01 -0700
Subject: [PATCH] NOMMU: don't try and give NULL to fput()

Don't try and give NULL to fput() in the error handling in do_mmap_pgoff()
as it'll cause an oops.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/nommu.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/nommu.c b/mm/nommu.c
index 564540662192..365019599df8 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -948,7 +948,8 @@ unsigned long do_mmap_pgoff(struct file *file,
 	up_write(&nommu_vma_sem);
 	kfree(vml);
 	if (vma) {
-		fput(vma->vm_file);
+		if (vma->vm_file)
+			fput(vma->vm_file);
 		kfree(vma);
 	}
 	return ret;
-- 
cgit v1.2.3


From f28c5edc06ecd8068b38b7662ad19f4d20d741af Mon Sep 17 00:00:00 2001
From: Keith Mannthey <kmannth@us.ibm.com>
Date: Sat, 30 Sep 2006 23:27:04 -0700
Subject: [PATCH] hot-add-mem x86_64: fixup externs

Fix up externs in memory_hotplug.c.  Cleanup.

Signed-off-by: Keith Mannthey <kmannth@us.ibm.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/memory_hotplug.h | 2 ++
 include/linux/mm.h             | 2 ++
 mm/memory_hotplug.c            | 4 ----
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'mm')

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 218501cfaeb9..7b54666cea8e 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -172,5 +172,7 @@ static inline int __remove_pages(struct zone *zone, unsigned long start_pfn,
 extern int add_memory(int nid, u64 start, u64 size);
 extern int arch_add_memory(int nid, u64 start, u64 size);
 extern int remove_memory(u64 start, u64 size);
+extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
+								int nr_pages);
 
 #endif /* __LINUX_MEMORY_HOTPLUG_H */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 4edf1934e5ca..b7966ab8cb6a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -946,6 +946,8 @@ extern void mem_init(void);
 extern void show_mem(void);
 extern void si_meminfo(struct sysinfo * val);
 extern void si_meminfo_node(struct sysinfo *val, int nid);
+extern void zonetable_add(struct zone *zone, int nid, enum zone_type zid,
+					unsigned long pfn, unsigned long size);
 
 #ifdef CONFIG_NUMA
 extern void setup_per_cpu_pageset(void);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 2053bb165a21..63b14d4f68fb 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -26,8 +26,6 @@
 
 #include <asm/tlbflush.h>
 
-extern void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn,
-			  unsigned long size);
 static int __add_zone(struct zone *zone, unsigned long phys_start_pfn)
 {
 	struct pglist_data *pgdat = zone->zone_pgdat;
@@ -47,8 +45,6 @@ static int __add_zone(struct zone *zone, unsigned long phys_start_pfn)
 	return 0;
 }
 
-extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
-				  int nr_pages);
 static int __add_section(struct zone *zone, unsigned long phys_start_pfn)
 {
 	int nr_pages = PAGES_PER_SECTION;
-- 
cgit v1.2.3


From ec69acbb1191df671ff8e07c8e146619a5c53f70 Mon Sep 17 00:00:00 2001
From: Keith Mannthey <kmannth@us.ibm.com>
Date: Sat, 30 Sep 2006 23:27:05 -0700
Subject: [PATCH] hot-add-mem x86_64: Kconfig changes

Create Kconfig namespace for MEMORY_HOTPLUG_RESERVE and MEMORY_HOTPLUG_SPARSE.
 This is needed to create a disticiton between the 2 paths.  Selecting the
high level opiton of MEMORY_HOTPLUG will get you MEMORY_HOTPLUG_SPARSE if you
have sparsemem enabled or MEMORY_HOTPLUG_RESERVE if you are x86_64 with
discontig and ACPI numa support.

Signed-off-by: Keith Mannthey <kmannth@us.ibm.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/Kconfig | 4 ++++
 mm/Kconfig          | 7 ++++++-
 2 files changed, 10 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index 32ae1378f35c..b87a19f0d584 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -367,6 +367,10 @@ config ARCH_FLATMEM_ENABLE
 
 source "mm/Kconfig"
 
+config MEMORY_HOTPLUG_RESERVE
+	def_bool y
+	depends on (MEMORY_HOTPLUG && DISCONTIGMEM)
+
 config HAVE_ARCH_EARLY_PFN_TO_NID
 	def_bool y
 	depends on NUMA
diff --git a/mm/Kconfig b/mm/Kconfig
index 8f5b45615f7b..5d88489ef2de 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -115,12 +115,17 @@ config SPARSEMEM_EXTREME
 # eventually, we can have this option just 'select SPARSEMEM'
 config MEMORY_HOTPLUG
 	bool "Allow for memory hot-add"
-	depends on SPARSEMEM && HOTPLUG && !SOFTWARE_SUSPEND && ARCH_ENABLE_MEMORY_HOTPLUG
+	depends on SPARSEMEM || X86_64_ACPI_NUMA
+	depends on HOTPLUG && !SOFTWARE_SUSPEND && ARCH_ENABLE_MEMORY_HOTPLUG
 	depends on (IA64 || X86 || PPC64)
 
 comment "Memory hotplug is currently incompatible with Software Suspend"
 	depends on SPARSEMEM && HOTPLUG && SOFTWARE_SUSPEND
 
+config MEMORY_HOTPLUG_SPARSE
+	def_bool y
+	depends on SPARSEMEM && MEMORY_HOTPLUG
+
 # Heavily threaded applications may benefit from splitting the mm-wide
 # page_table_lock, so that faults on different parts of the user address
 # space can be handled with less contention: split it at this NR_CPUS.
-- 
cgit v1.2.3


From 53947027ad90542ddb2bb746e3175827c270610a Mon Sep 17 00:00:00 2001
From: Keith Mannthey <kmannth@us.ibm.com>
Date: Sat, 30 Sep 2006 23:27:08 -0700
Subject: [PATCH] hot-add-mem x86_64: use CONFIG_MEMORY_HOTPLUG_SPARSE

Migate CONFIG_MEMORY_HOTPLUG to CONFIG_MEMORY_HOTPLUG_SPARSE where needed.

Signed-off-by: Keith Mannthey <kmannth@us.ibm.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/base/Makefile  | 2 +-
 include/linux/memory.h | 4 ++--
 mm/memory_hotplug.c    | 2 ++
 3 files changed, 5 insertions(+), 3 deletions(-)

(limited to 'mm')

diff --git a/drivers/base/Makefile b/drivers/base/Makefile
index b539e5e75b56..7bbb9eeda235 100644
--- a/drivers/base/Makefile
+++ b/drivers/base/Makefile
@@ -8,7 +8,7 @@ obj-y			+= power/
 obj-$(CONFIG_ISA)	+= isa.o
 obj-$(CONFIG_FW_LOADER)	+= firmware_class.o
 obj-$(CONFIG_NUMA)	+= node.o
-obj-$(CONFIG_MEMORY_HOTPLUG) += memory.o
+obj-$(CONFIG_MEMORY_HOTPLUG_SPARSE) += memory.o
 obj-$(CONFIG_SMP)	+= topology.o
 obj-$(CONFIG_SYS_HYPERVISOR) += hypervisor.o
 
diff --git a/include/linux/memory.h b/include/linux/memory.h
index 8f04143ca363..654ef5544878 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -57,7 +57,7 @@ struct memory_block {
 struct notifier_block;
 struct mem_section;
 
-#ifndef CONFIG_MEMORY_HOTPLUG
+#ifndef CONFIG_MEMORY_HOTPLUG_SPARSE
 static inline int memory_dev_init(void)
 {
 	return 0;
@@ -78,7 +78,7 @@ extern int remove_memory_block(unsigned long, struct mem_section *, int);
 #define CONFIG_MEM_BLOCK_SIZE	(PAGES_PER_SECTION<<PAGE_SHIFT)
 
 
-#endif /* CONFIG_MEMORY_HOTPLUG */
+#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
 
 #define hotplug_memory_notifier(fn, pri) {			\
 	static struct notifier_block fn##_mem_nb =		\
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 63b14d4f68fb..7666dbd328df 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -26,6 +26,7 @@
 
 #include <asm/tlbflush.h>
 
+#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
 static int __add_zone(struct zone *zone, unsigned long phys_start_pfn)
 {
 	struct pglist_data *pgdat = zone->zone_pgdat;
@@ -192,6 +193,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
 	writeback_set_ratelimit();
 	return 0;
 }
+#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
 
 static pg_data_t *hotadd_new_pgdat(int nid, u64 start)
 {
-- 
cgit v1.2.3


From 45e0b78b0532f92c01e363dd4287617c5be4574f Mon Sep 17 00:00:00 2001
From: Keith Mannthey <kmannth@us.ibm.com>
Date: Sat, 30 Sep 2006 23:27:09 -0700
Subject: [PATCH] hot-add-mem x86_64: use CONFIG_MEMORY_HOTPLUG_RESERVE

The api for hot-add memory already has a construct for finding nodes based on
an address, memory_add_physaddr_to_nid.  This patch allows the fucntion to do
something besides return 0.  It uses the nodes_add infomation to lookup to
node info for a hot add event.

Signed-off-by: Keith Mannthey <kmannth@us.ibm.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/mm/init.c | 17 +++++++++++----
 mm/memory_hotplug.c   | 60 +++++++++++++++++++++++++--------------------------
 2 files changed, 43 insertions(+), 34 deletions(-)

(limited to 'mm')

diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index 71ecbb3f3262..19c72520a868 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -474,12 +474,12 @@ int arch_add_memory(int nid, u64 start, u64 size)
 	unsigned long nr_pages = size >> PAGE_SHIFT;
 	int ret;
 
+	init_memory_mapping(start, (start + size -1));
+
 	ret = __add_pages(zone, start_pfn, nr_pages);
 	if (ret)
 		goto error;
 
-	init_memory_mapping(start, (start + size -1));
-
 	return ret;
 error:
 	printk("%s: Problem encountered in __add_pages!\n", __func__);
@@ -501,7 +501,16 @@ int memory_add_physaddr_to_nid(u64 start)
 EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
 #endif
 
-#else /* CONFIG_MEMORY_HOTPLUG */
+#ifndef CONFIG_ACPI_NUMA
+int memory_add_physaddr_to_nid(u64 start)
+{
+	return 0;
+}
+#endif
+
+#endif /* CONFIG_MEMORY_HOTPLUG */
+
+#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
 /*
  * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance,
  * just online the pages.
@@ -527,7 +536,7 @@ int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
 	}
 	return err;
 }
-#endif /* CONFIG_MEMORY_HOTPLUG */
+#endif
 
 static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
 			 kcore_vsyscall;
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 7666dbd328df..fd678a662eae 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -26,6 +26,36 @@
 
 #include <asm/tlbflush.h>
 
+/* add this memory to iomem resource */
+static struct resource *register_memory_resource(u64 start, u64 size)
+{
+	struct resource *res;
+	res = kzalloc(sizeof(struct resource), GFP_KERNEL);
+	BUG_ON(!res);
+
+	res->name = "System RAM";
+	res->start = start;
+	res->end = start + size - 1;
+	res->flags = IORESOURCE_MEM;
+	if (request_resource(&iomem_resource, res) < 0) {
+		printk("System RAM resource %llx - %llx cannot be added\n",
+		(unsigned long long)res->start, (unsigned long long)res->end);
+		kfree(res);
+		res = NULL;
+	}
+	return res;
+}
+
+static void release_memory_resource(struct resource *res)
+{
+	if (!res)
+		return;
+	release_resource(res);
+	kfree(res);
+	return;
+}
+
+
 #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
 static int __add_zone(struct zone *zone, unsigned long phys_start_pfn)
 {
@@ -223,36 +253,6 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
 	return;
 }
 
-/* add this memory to iomem resource */
-static struct resource *register_memory_resource(u64 start, u64 size)
-{
-	struct resource *res;
-	res = kzalloc(sizeof(struct resource), GFP_KERNEL);
-	BUG_ON(!res);
-
-	res->name = "System RAM";
-	res->start = start;
-	res->end = start + size - 1;
-	res->flags = IORESOURCE_MEM;
-	if (request_resource(&iomem_resource, res) < 0) {
-		printk("System RAM resource %llx - %llx cannot be added\n",
-		(unsigned long long)res->start, (unsigned long long)res->end);
-		kfree(res);
-		res = NULL;
-	}
-	return res;
-}
-
-static void release_memory_resource(struct resource *res)
-{
-	if (!res)
-		return;
-	release_resource(res);
-	kfree(res);
-	return;
-}
-
-
 
 int add_memory(int nid, u64 start, u64 size)
 {
-- 
cgit v1.2.3


From 1a2f67b459bb7846d4a15924face63eb2683acc2 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Sat, 30 Sep 2006 23:27:20 -0700
Subject: [PATCH] kmemdup: introduce

One of idiomatic ways to duplicate a region of memory is

	dst = kmalloc(len, GFP_KERNEL);
	if (!dst)
		return -ENOMEM;
	memcpy(dst, src, len);

which is neat code except a programmer needs to write size twice.  Which
sometimes leads to mistakes.  If len passed to kmalloc is smaller that len
passed to memcpy, it's straight overwrite-beyond-end.  If len passed to
memcpy is smaller than len passed to kmalloc, it's either a) legit
behaviour ;-), or b) cloned buffer will contain garbage in second half.

Slight trolling of commit lists shows several duplications bugs
done exactly because of diverged lenghts:

	Linux:
		[CRYPTO]: Fix memcpy/memset args.
		[PATCH] memcpy/memset fixes
	OpenBSD:
		kerberosV/src/lib/asn1: der_copy.c:1.4

If programmer is given only one place to play with lengths, I believe, such
mistakes could be avoided.

With kmemdup, the snippet above will be rewritten as:

	dst = kmemdup(src, len, GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

This also leads to smaller code (kzalloc effect). Quick grep shows
200+ places where kmemdup() can be used.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/string.h |  1 +
 mm/util.c              | 18 ++++++++++++++++++
 2 files changed, 19 insertions(+)

(limited to 'mm')

diff --git a/include/linux/string.h b/include/linux/string.h
index e4c755860316..4f69ef9e6eb5 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -99,6 +99,7 @@ extern void * memchr(const void *,int,__kernel_size_t);
 #endif
 
 extern char *kstrdup(const char *s, gfp_t gfp);
+extern void *kmemdup(const void *src, size_t len, gfp_t gfp);
 
 #ifdef __cplusplus
 }
diff --git a/mm/util.c b/mm/util.c
index 7368479220b3..e14fa84ef39a 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -40,6 +40,24 @@ char *kstrdup(const char *s, gfp_t gfp)
 }
 EXPORT_SYMBOL(kstrdup);
 
+/**
+ * kmemdup - duplicate region of memory
+ *
+ * @src: memory region to duplicate
+ * @len: memory region length
+ * @gfp: GFP mask to use
+ */
+void *kmemdup(const void *src, size_t len, gfp_t gfp)
+{
+	void *p;
+
+	p = ____kmalloc(len, gfp);
+	if (p)
+		memcpy(p, src, len);
+	return p;
+}
+EXPORT_SYMBOL(kmemdup);
+
 /*
  * strndup_user - duplicate an existing string from user space
  *
-- 
cgit v1.2.3


From 52978be636374c4bfb61220b37fa12f55a071c46 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Sat, 30 Sep 2006 23:27:21 -0700
Subject: [PATCH] kmemdup: some users

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/process.c        |  5 ++---
 fs/posix_acl.c                    |  6 ++----
 mm/mempolicy.c                    |  3 +--
 net/rxrpc/transport.c             |  3 +--
 sound/pci/echoaudio/layla24_dsp.c |  4 ++--
 sound/usb/usbaudio.c              | 11 ++++-------
 sound/usb/usbmidi.c               |  3 +--
 7 files changed, 13 insertions(+), 22 deletions(-)

(limited to 'mm')

diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
index 8c190ca7ae44..96cd0232e1e0 100644
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -425,13 +425,12 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
 
 	tsk = current;
 	if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
-		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
+		p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
+						IO_BITMAP_BYTES, GFP_KERNEL);
 		if (!p->thread.io_bitmap_ptr) {
 			p->thread.io_bitmap_max = 0;
 			return -ENOMEM;
 		}
-		memcpy(p->thread.io_bitmap_ptr, tsk->thread.io_bitmap_ptr,
-			IO_BITMAP_BYTES);
 		set_tsk_thread_flag(p, TIF_IO_BITMAP);
 	}
 
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 6c8dcf7613fd..aec931e09973 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -58,11 +58,9 @@ posix_acl_clone(const struct posix_acl *acl, gfp_t flags)
 	if (acl) {
 		int size = sizeof(struct posix_acl) + acl->a_count *
 		           sizeof(struct posix_acl_entry);
-		clone = kmalloc(size, flags);
-		if (clone) {
-			memcpy(clone, acl, size);
+		clone = kmemdup(acl, size, flags);
+		if (clone)
 			atomic_set(&clone->a_refcount, 1);
-		}
 	}
 	return clone;
 }
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index cf18f0942553..25788b1b7fcf 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1324,12 +1324,11 @@ struct mempolicy *__mpol_copy(struct mempolicy *old)
 	atomic_set(&new->refcnt, 1);
 	if (new->policy == MPOL_BIND) {
 		int sz = ksize(old->v.zonelist);
-		new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
+		new->v.zonelist = kmemdup(old->v.zonelist, sz, SLAB_KERNEL);
 		if (!new->v.zonelist) {
 			kmem_cache_free(policy_cache, new);
 			return ERR_PTR(-ENOMEM);
 		}
-		memcpy(new->v.zonelist, old->v.zonelist, sz);
 	}
 	return new;
 }
diff --git a/net/rxrpc/transport.c b/net/rxrpc/transport.c
index 465efc86fccf..94b2e2fe6fdb 100644
--- a/net/rxrpc/transport.c
+++ b/net/rxrpc/transport.c
@@ -381,11 +381,10 @@ static int rxrpc_incoming_msg(struct rxrpc_transport *trans,
 
 		/* allocate a new message record */
 		ret = -ENOMEM;
-		msg = kmalloc(sizeof(struct rxrpc_message), GFP_KERNEL);
+		msg = kmemdup(jumbomsg, sizeof(struct rxrpc_message), GFP_KERNEL);
 		if (!msg)
 			goto error;
 
-		memcpy(msg, jumbomsg, sizeof(*msg));
 		list_add_tail(&msg->link, msgq);
 
 		/* adjust the jumbo packet */
diff --git a/sound/pci/echoaudio/layla24_dsp.c b/sound/pci/echoaudio/layla24_dsp.c
index 7ec5b63d0dce..97e42e115147 100644
--- a/sound/pci/echoaudio/layla24_dsp.c
+++ b/sound/pci/echoaudio/layla24_dsp.c
@@ -302,11 +302,11 @@ static int switch_asic(struct echoaudio *chip, const struct firmware *asic)
 
 	/*  Check to see if this is already loaded */
 	if (asic != chip->asic_code) {
-		monitors = kmalloc(MONITOR_ARRAY_SIZE, GFP_KERNEL);
+		monitors = kmemdup(chip->comm_page->monitors,
+					MONITOR_ARRAY_SIZE, GFP_KERNEL);
 		if (! monitors)
 			return -ENOMEM;
 
-		memcpy(monitors, chip->comm_page->monitors, MONITOR_ARRAY_SIZE);
 		memset(chip->comm_page->monitors, ECHOGAIN_MUTED,
 		       MONITOR_ARRAY_SIZE);
 
diff --git a/sound/usb/usbaudio.c b/sound/usb/usbaudio.c
index 49248fa7aef4..a42acf6d7b68 100644
--- a/sound/usb/usbaudio.c
+++ b/sound/usb/usbaudio.c
@@ -2046,10 +2046,9 @@ int snd_usb_ctl_msg(struct usb_device *dev, unsigned int pipe, __u8 request,
 	void *buf = NULL;
 
 	if (size > 0) {
-		buf = kmalloc(size, GFP_KERNEL);
+		buf = kmemdup(data, size, GFP_KERNEL);
 		if (!buf)
 			return -ENOMEM;
-		memcpy(buf, data, size);
 	}
 	err = usb_control_msg(dev, pipe, request, requesttype,
 			      value, index, buf, size, timeout);
@@ -2846,12 +2845,11 @@ static int create_fixed_stream_quirk(struct snd_usb_audio *chip,
 	int stream, err;
 	int *rate_table = NULL;
 
-	fp = kmalloc(sizeof(*fp), GFP_KERNEL);
+	fp = kmemdup(quirk->data, sizeof(*fp), GFP_KERNEL);
 	if (! fp) {
-		snd_printk(KERN_ERR "cannot malloc\n");
+		snd_printk(KERN_ERR "cannot memdup\n");
 		return -ENOMEM;
 	}
-	memcpy(fp, quirk->data, sizeof(*fp));
 	if (fp->nr_rates > 0) {
 		rate_table = kmalloc(sizeof(int) * fp->nr_rates, GFP_KERNEL);
 		if (!rate_table) {
@@ -3029,10 +3027,9 @@ static int create_ua1000_quirk(struct snd_usb_audio *chip,
 	    altsd->bNumEndpoints != 1)
 		return -ENXIO;
 
-	fp = kmalloc(sizeof(*fp), GFP_KERNEL);
+	fp = kmemdup(&ua1000_format, sizeof(*fp), GFP_KERNEL);
 	if (!fp)
 		return -ENOMEM;
-	memcpy(fp, &ua1000_format, sizeof(*fp));
 
 	fp->channels = alts->extra[4];
 	fp->iface = altsd->bInterfaceNumber;
diff --git a/sound/usb/usbmidi.c b/sound/usb/usbmidi.c
index abe29dadd979..0dcf78adb99a 100644
--- a/sound/usb/usbmidi.c
+++ b/sound/usb/usbmidi.c
@@ -323,10 +323,9 @@ static int send_bulk_static_data(struct snd_usb_midi_out_endpoint* ep,
 				 const void *data, int len)
 {
 	int err;
-	void *buf = kmalloc(len, GFP_KERNEL);
+	void *buf = kmemdup(data, len, GFP_KERNEL);
 	if (!buf)
 		return -ENOMEM;
-	memcpy(buf, data, len);
 	dump_urb("sending", buf, len);
 	err = usb_bulk_msg(ep->umidi->chip->dev, ep->urb->pipe, buf, len,
 			   NULL, 250);
-- 
cgit v1.2.3


From 027445c37282bc1ed26add45e573ad2d3e4860a5 Mon Sep 17 00:00:00 2001
From: Badari Pulavarty <pbadari@us.ibm.com>
Date: Sat, 30 Sep 2006 23:28:46 -0700
Subject: [PATCH] Vectorize aio_read/aio_write fileop methods

This patch vectorizes aio_read() and aio_write() methods to prepare for
collapsing all aio & vectored operations into one interface - which is
aio_read()/aio_write().

Signed-off-by: Badari Pulavarty <pbadari@us.ibm.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Cc: Michael Holzheu <HOLZHEU@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/filesystems/Locking |  5 +--
 Documentation/filesystems/vfs.txt |  4 +-
 arch/s390/hypfs/inode.c           | 17 +++++++--
 drivers/char/raw.c                | 14 +------
 drivers/usb/gadget/inode.c        | 80 +++++++++++++++++++++++++++------------
 fs/aio.c                          | 15 ++++++--
 fs/block_dev.c                    | 10 +----
 fs/cifs/cifsfs.c                  |  6 +--
 fs/ext3/file.c                    |  5 ++-
 fs/nfs/direct.c                   | 26 ++++++++++---
 fs/nfs/file.c                     | 34 +++++++++--------
 fs/ntfs/file.c                    |  8 ++--
 fs/ocfs2/file.c                   | 28 +++++++-------
 fs/read_write.c                   | 20 ++++++++--
 fs/reiserfs/file.c                |  4 +-
 fs/xfs/linux-2.6/xfs_file.c       | 46 +++++++++++-----------
 include/linux/aio.h               |  2 +
 include/linux/fs.h                | 10 ++---
 include/linux/nfs_fs.h            | 10 +++--
 include/net/sock.h                |  1 -
 mm/filemap.c                      | 39 +++++++++----------
 net/socket.c                      | 49 +++++++++++-------------
 22 files changed, 242 insertions(+), 191 deletions(-)

(limited to 'mm')

diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 247d7f619aa2..eb1a6cad21e6 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -356,10 +356,9 @@ The last two are called only from check_disk_change().
 prototypes:
 	loff_t (*llseek) (struct file *, loff_t, int);
 	ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
-	ssize_t (*aio_read) (struct kiocb *, char __user *, size_t, loff_t);
 	ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
-	ssize_t (*aio_write) (struct kiocb *, const char __user *, size_t,
-			loff_t);
+	ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
+	ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
 	int (*readdir) (struct file *, void *, filldir_t);
 	unsigned int (*poll) (struct file *, struct poll_table_struct *);
 	int (*ioctl) (struct inode *, struct file *, unsigned int,
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 1cb7e8be927a..cd07c21b8400 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -699,9 +699,9 @@ This describes how the VFS can manipulate an open file. As of kernel
 struct file_operations {
 	loff_t (*llseek) (struct file *, loff_t, int);
 	ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
-	ssize_t (*aio_read) (struct kiocb *, char __user *, size_t, loff_t);
 	ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
-	ssize_t (*aio_write) (struct kiocb *, const char __user *, size_t, loff_t);
+	ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
+	ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
 	int (*readdir) (struct file *, void *, filldir_t);
 	unsigned int (*poll) (struct file *, struct poll_table_struct *);
 	int (*ioctl) (struct inode *, struct file *, unsigned int, unsigned long);
diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c
index 813fc21358f9..cd702ae45d6d 100644
--- a/arch/s390/hypfs/inode.c
+++ b/arch/s390/hypfs/inode.c
@@ -134,12 +134,20 @@ static int hypfs_open(struct inode *inode, struct file *filp)
 	return 0;
 }
 
-static ssize_t hypfs_aio_read(struct kiocb *iocb, __user char *buf,
-			      size_t count, loff_t offset)
+static ssize_t hypfs_aio_read(struct kiocb *iocb, const struct iovec *iov,
+			      unsigned long nr_segs, loff_t offset)
 {
 	char *data;
 	size_t len;
 	struct file *filp = iocb->ki_filp;
+	/* XXX: temporary */
+	char __user *buf = iov[0].iov_base;
+	size_t count = iov[0].iov_len;
+
+	if (nr_segs != 1) {
+		count = -EINVAL;
+		goto out;
+	}
 
 	data = filp->private_data;
 	len = strlen(data);
@@ -158,12 +166,13 @@ static ssize_t hypfs_aio_read(struct kiocb *iocb, __user char *buf,
 out:
 	return count;
 }
-static ssize_t hypfs_aio_write(struct kiocb *iocb, const char __user *buf,
-			       size_t count, loff_t pos)
+static ssize_t hypfs_aio_write(struct kiocb *iocb, const struct iovec *iov,
+			      unsigned long nr_segs, loff_t offset)
 {
 	int rc;
 	struct super_block *sb;
 	struct hypfs_sb_info *fs_info;
+	size_t count = iov_length(iov, nr_segs);
 
 	sb = iocb->ki_filp->f_dentry->d_inode->i_sb;
 	fs_info = sb->s_fs_info;
diff --git a/drivers/char/raw.c b/drivers/char/raw.c
index c596a08c07b3..173fb08555d5 100644
--- a/drivers/char/raw.c
+++ b/drivers/char/raw.c
@@ -249,23 +249,11 @@ static ssize_t raw_file_write(struct file *file, const char __user *buf,
 	return generic_file_write_nolock(file, &local_iov, 1, ppos);
 }
 
-static ssize_t raw_file_aio_write(struct kiocb *iocb, const char __user *buf,
-					size_t count, loff_t pos)
-{
-	struct iovec local_iov = {
-		.iov_base = (char __user *)buf,
-		.iov_len = count
-	};
-
-	return generic_file_aio_write_nolock(iocb, &local_iov, 1, &iocb->ki_pos);
-}
-
-
 static const struct file_operations raw_fops = {
 	.read	=	generic_file_read,
 	.aio_read = 	generic_file_aio_read,
 	.write	=	raw_file_write,
-	.aio_write = 	raw_file_aio_write,
+	.aio_write = 	generic_file_aio_write_nolock,
 	.open	=	raw_open,
 	.release=	raw_release,
 	.ioctl	=	raw_ioctl,
diff --git a/drivers/usb/gadget/inode.c b/drivers/usb/gadget/inode.c
index 4a000d846a93..86924f9cdd7e 100644
--- a/drivers/usb/gadget/inode.c
+++ b/drivers/usb/gadget/inode.c
@@ -533,7 +533,8 @@ struct kiocb_priv {
 	struct usb_request	*req;
 	struct ep_data		*epdata;
 	void			*buf;
-	char __user		*ubuf;		/* NULL for writes */
+	const struct iovec	*iv;
+	unsigned long		nr_segs;
 	unsigned		actual;
 };
 
@@ -561,17 +562,32 @@ static int ep_aio_cancel(struct kiocb *iocb, struct io_event *e)
 static ssize_t ep_aio_read_retry(struct kiocb *iocb)
 {
 	struct kiocb_priv	*priv = iocb->private;
-	ssize_t			status = priv->actual;
-
-	/* we "retry" to get the right mm context for this: */
-	status = copy_to_user(priv->ubuf, priv->buf, priv->actual);
-	if (unlikely(0 != status))
-		status = -EFAULT;
-	else
-		status = priv->actual;
-	kfree(priv->buf);
-	kfree(priv);
-	return status;
+	ssize_t			len, total;
+	int			i;
+
+  	/* we "retry" to get the right mm context for this: */
+
+ 	/* copy stuff into user buffers */
+ 	total = priv->actual;
+ 	len = 0;
+ 	for (i=0; i < priv->nr_segs; i++) {
+ 		ssize_t this = min((ssize_t)(priv->iv[i].iov_len), total);
+
+ 		if (copy_to_user(priv->iv[i].iov_base, priv->buf, this)) {
+ 			if (len == 0)
+ 				len = -EFAULT;
+ 			break;
+ 		}
+
+ 		total -= this;
+ 		len += this;
+ 		if (total == 0)
+ 			break;
+ 	}
+  	kfree(priv->buf);
+  	kfree(priv);
+  	aio_put_req(iocb);
+ 	return len;
 }
 
 static void ep_aio_complete(struct usb_ep *ep, struct usb_request *req)
@@ -584,7 +600,7 @@ static void ep_aio_complete(struct usb_ep *ep, struct usb_request *req)
 	spin_lock(&epdata->dev->lock);
 	priv->req = NULL;
 	priv->epdata = NULL;
-	if (priv->ubuf == NULL
+	if (priv->iv == NULL
 			|| unlikely(req->actual == 0)
 			|| unlikely(kiocbIsCancelled(iocb))) {
 		kfree(req->buf);
@@ -619,7 +635,8 @@ ep_aio_rwtail(
 	char		*buf,
 	size_t		len,
 	struct ep_data	*epdata,
-	char __user	*ubuf
+	const struct iovec *iv,
+	unsigned long 	nr_segs
 )
 {
 	struct kiocb_priv	*priv;
@@ -634,7 +651,8 @@ fail:
 		return value;
 	}
 	iocb->private = priv;
-	priv->ubuf = ubuf;
+	priv->iv = iv;
+	priv->nr_segs = nr_segs;
 
 	value = get_ready_ep(iocb->ki_filp->f_flags, epdata);
 	if (unlikely(value < 0)) {
@@ -674,41 +692,53 @@ fail:
 		kfree(priv);
 		put_ep(epdata);
 	} else
-		value = (ubuf ? -EIOCBRETRY : -EIOCBQUEUED);
+		value = (iv ? -EIOCBRETRY : -EIOCBQUEUED);
 	return value;
 }
 
 static ssize_t
-ep_aio_read(struct kiocb *iocb, char __user *ubuf, size_t len, loff_t o)
+ep_aio_read(struct kiocb *iocb, const struct iovec *iov,
+		unsigned long nr_segs, loff_t o)
 {
 	struct ep_data		*epdata = iocb->ki_filp->private_data;
 	char			*buf;
 
 	if (unlikely(epdata->desc.bEndpointAddress & USB_DIR_IN))
 		return -EINVAL;
-	buf = kmalloc(len, GFP_KERNEL);
+
+	buf = kmalloc(iocb->ki_left, GFP_KERNEL);
 	if (unlikely(!buf))
 		return -ENOMEM;
+
 	iocb->ki_retry = ep_aio_read_retry;
-	return ep_aio_rwtail(iocb, buf, len, epdata, ubuf);
+	return ep_aio_rwtail(iocb, buf, iocb->ki_left, epdata, iov, nr_segs);
 }
 
 static ssize_t
-ep_aio_write(struct kiocb *iocb, const char __user *ubuf, size_t len, loff_t o)
+ep_aio_write(struct kiocb *iocb, const struct iovec *iov,
+		unsigned long nr_segs, loff_t o)
 {
 	struct ep_data		*epdata = iocb->ki_filp->private_data;
 	char			*buf;
+	size_t			len = 0;
+	int			i = 0;
 
 	if (unlikely(!(epdata->desc.bEndpointAddress & USB_DIR_IN)))
 		return -EINVAL;
-	buf = kmalloc(len, GFP_KERNEL);
+
+	buf = kmalloc(iocb->ki_left, GFP_KERNEL);
 	if (unlikely(!buf))
 		return -ENOMEM;
-	if (unlikely(copy_from_user(buf, ubuf, len) != 0)) {
-		kfree(buf);
-		return -EFAULT;
+
+	for (i=0; i < nr_segs; i++) {
+		if (unlikely(copy_from_user(&buf[len], iov[i].iov_base,
+				iov[i].iov_len) != 0)) {
+			kfree(buf);
+			return -EFAULT;
+		}
+		len += iov[i].iov_len;
 	}
-	return ep_aio_rwtail(iocb, buf, len, epdata, NULL);
+	return ep_aio_rwtail(iocb, buf, len, epdata, NULL, 0);
 }
 
 /*----------------------------------------------------------------------*/
diff --git a/fs/aio.c b/fs/aio.c
index 950630187acc..27ff56540c73 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -15,6 +15,7 @@
 #include <linux/aio_abi.h>
 #include <linux/module.h>
 #include <linux/syscalls.h>
+#include <linux/uio.h>
 
 #define DEBUG 0
 
@@ -1315,8 +1316,11 @@ static ssize_t aio_pread(struct kiocb *iocb)
 	ssize_t ret = 0;
 
 	do {
-		ret = file->f_op->aio_read(iocb, iocb->ki_buf,
-			iocb->ki_left, iocb->ki_pos);
+		iocb->ki_inline_vec.iov_base = iocb->ki_buf;
+		iocb->ki_inline_vec.iov_len = iocb->ki_left;
+
+		ret = file->f_op->aio_read(iocb, &iocb->ki_inline_vec,
+						1, iocb->ki_pos);
 		/*
 		 * Can't just depend on iocb->ki_left to determine
 		 * whether we are done. This may have been a short read.
@@ -1349,8 +1353,11 @@ static ssize_t aio_pwrite(struct kiocb *iocb)
 	ssize_t ret = 0;
 
 	do {
-		ret = file->f_op->aio_write(iocb, iocb->ki_buf,
-			iocb->ki_left, iocb->ki_pos);
+		iocb->ki_inline_vec.iov_base = iocb->ki_buf;
+		iocb->ki_inline_vec.iov_len = iocb->ki_left;
+
+		ret = file->f_op->aio_write(iocb, &iocb->ki_inline_vec,
+						1, iocb->ki_pos);
 		if (ret > 0) {
 			iocb->ki_buf += ret;
 			iocb->ki_left -= ret;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 0c361ea7e5a6..8c819117310b 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1162,14 +1162,6 @@ static ssize_t blkdev_file_write(struct file *file, const char __user *buf,
 	return generic_file_write_nolock(file, &local_iov, 1, ppos);
 }
 
-static ssize_t blkdev_file_aio_write(struct kiocb *iocb, const char __user *buf,
-				   size_t count, loff_t pos)
-{
-	struct iovec local_iov = { .iov_base = (void __user *)buf, .iov_len = count };
-
-	return generic_file_aio_write_nolock(iocb, &local_iov, 1, &iocb->ki_pos);
-}
-
 static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 {
 	return blkdev_ioctl(file->f_mapping->host, file, cmd, arg);
@@ -1192,7 +1184,7 @@ const struct file_operations def_blk_fops = {
 	.read		= generic_file_read,
 	.write		= blkdev_file_write,
   	.aio_read	= generic_file_aio_read,
-  	.aio_write	= blkdev_file_aio_write, 
+  	.aio_write	= generic_file_aio_write_nolock,
 	.mmap		= generic_file_mmap,
 	.fsync		= block_fsync,
 	.unlocked_ioctl	= block_ioctl,
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 22bcf4d7e7ae..5abb42a7c53e 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -492,13 +492,13 @@ static ssize_t cifs_file_writev(struct file *file, const struct iovec *iov,
 	return written;
 }
 
-static ssize_t cifs_file_aio_write(struct kiocb *iocb, const char __user *buf,
-				   size_t count, loff_t pos)
+static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
+				   unsigned long nr_segs, loff_t pos)
 {
 	struct inode *inode = iocb->ki_filp->f_dentry->d_inode;
 	ssize_t written;
 
-	written = generic_file_aio_write(iocb, buf, count, pos);
+	written = generic_file_aio_write(iocb, iov, nr_segs, pos);
 	if (!CIFS_I(inode)->clientCanCacheAll)
 		filemap_fdatawrite(inode->i_mapping);
 	return written;
diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index 74ff20f9d09b..5c762457bc89 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -48,14 +48,15 @@ static int ext3_release_file (struct inode * inode, struct file * filp)
 }
 
 static ssize_t
-ext3_file_write(struct kiocb *iocb, const char __user *buf, size_t count, loff_t pos)
+ext3_file_write(struct kiocb *iocb, const struct iovec *iov,
+		unsigned long nr_segs, loff_t pos)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_dentry->d_inode;
 	ssize_t ret;
 	int err;
 
-	ret = generic_file_aio_write(iocb, buf, count, pos);
+	ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
 
 	/*
 	 * Skip flushing if there was an error, or if nothing was written.
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 377839bed172..9f7f8b9ea1e2 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -707,8 +707,8 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, siz
 /**
  * nfs_file_direct_read - file direct read operation for NFS files
  * @iocb: target I/O control block
- * @buf: user's buffer into which to read data
- * @count: number of bytes to read
+ * @iov: vector of user buffers into which to read data
+ * @nr_segs: size of iov vector
  * @pos: byte offset in file where reading starts
  *
  * We use this function for direct reads instead of calling
@@ -725,17 +725,24 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, siz
  * client must read the updated atime from the server back into its
  * cache.
  */
-ssize_t nfs_file_direct_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t pos)
+ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov,
+				unsigned long nr_segs, loff_t pos)
 {
 	ssize_t retval = -EINVAL;
 	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
+	/* XXX: temporary */
+	const char __user *buf = iov[0].iov_base;
+	size_t count = iov[0].iov_len;
 
 	dprintk("nfs: direct read(%s/%s, %lu@%Ld)\n",
 		file->f_dentry->d_parent->d_name.name,
 		file->f_dentry->d_name.name,
 		(unsigned long) count, (long long) pos);
 
+	if (nr_segs != 1)
+		return -EINVAL;
+
 	if (count < 0)
 		goto out;
 	retval = -EFAULT;
@@ -760,8 +767,8 @@ out:
 /**
  * nfs_file_direct_write - file direct write operation for NFS files
  * @iocb: target I/O control block
- * @buf: user's buffer from which to write data
- * @count: number of bytes to write
+ * @iov: vector of user buffers from which to write data
+ * @nr_segs: size of iov vector
  * @pos: byte offset in file where writing starts
  *
  * We use this function for direct writes instead of calling
@@ -782,17 +789,24 @@ out:
  * Note that O_APPEND is not supported for NFS direct writes, as there
  * is no atomic O_APPEND write facility in the NFS protocol.
  */
-ssize_t nfs_file_direct_write(struct kiocb *iocb, const char __user *buf, size_t count, loff_t pos)
+ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
+				unsigned long nr_segs, loff_t pos)
 {
 	ssize_t retval;
 	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
+	/* XXX: temporary */
+	const char __user *buf = iov[0].iov_base;
+	size_t count = iov[0].iov_len;
 
 	dfprintk(VFS, "nfs: direct write(%s/%s, %lu@%Ld)\n",
 		file->f_dentry->d_parent->d_name.name,
 		file->f_dentry->d_name.name,
 		(unsigned long) count, (long long) pos);
 
+	if (nr_segs != 1)
+		return -EINVAL;
+
 	retval = generic_write_checks(file, &pos, &count, 0);
 	if (retval)
 		goto out;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index be997d649127..cc93865cea93 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -41,8 +41,10 @@ static int nfs_file_release(struct inode *, struct file *);
 static loff_t nfs_file_llseek(struct file *file, loff_t offset, int origin);
 static int  nfs_file_mmap(struct file *, struct vm_area_struct *);
 static ssize_t nfs_file_sendfile(struct file *, loff_t *, size_t, read_actor_t, void *);
-static ssize_t nfs_file_read(struct kiocb *, char __user *, size_t, loff_t);
-static ssize_t nfs_file_write(struct kiocb *, const char __user *, size_t, loff_t);
+static ssize_t nfs_file_read(struct kiocb *, const struct iovec *iov,
+				unsigned long nr_segs, loff_t pos);
+static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov,
+				unsigned long nr_segs, loff_t pos);
 static int  nfs_file_flush(struct file *, fl_owner_t id);
 static int  nfs_fsync(struct file *, struct dentry *dentry, int datasync);
 static int nfs_check_flags(int flags);
@@ -53,8 +55,8 @@ const struct file_operations nfs_file_operations = {
 	.llseek		= nfs_file_llseek,
 	.read		= do_sync_read,
 	.write		= do_sync_write,
-	.aio_read		= nfs_file_read,
-	.aio_write		= nfs_file_write,
+	.aio_read	= nfs_file_read,
+	.aio_write	= nfs_file_write,
 	.mmap		= nfs_file_mmap,
 	.open		= nfs_file_open,
 	.flush		= nfs_file_flush,
@@ -196,15 +198,17 @@ nfs_file_flush(struct file *file, fl_owner_t id)
 }
 
 static ssize_t
-nfs_file_read(struct kiocb *iocb, char __user * buf, size_t count, loff_t pos)
+nfs_file_read(struct kiocb *iocb, const struct iovec *iov,
+		unsigned long nr_segs, loff_t pos)
 {
 	struct dentry * dentry = iocb->ki_filp->f_dentry;
 	struct inode * inode = dentry->d_inode;
 	ssize_t result;
+	size_t count = iov_length(iov, nr_segs);
 
 #ifdef CONFIG_NFS_DIRECTIO
 	if (iocb->ki_filp->f_flags & O_DIRECT)
-		return nfs_file_direct_read(iocb, buf, count, pos);
+		return nfs_file_direct_read(iocb, iov, nr_segs, pos);
 #endif
 
 	dfprintk(VFS, "nfs: read(%s/%s, %lu@%lu)\n",
@@ -214,7 +218,7 @@ nfs_file_read(struct kiocb *iocb, char __user * buf, size_t count, loff_t pos)
 	result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping);
 	nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, count);
 	if (!result)
-		result = generic_file_aio_read(iocb, buf, count, pos);
+		result = generic_file_aio_read(iocb, iov, nr_segs, pos);
 	return result;
 }
 
@@ -336,24 +340,22 @@ const struct address_space_operations nfs_file_aops = {
 #endif
 };
 
-/* 
- * Write to a file (through the page cache).
- */
-static ssize_t
-nfs_file_write(struct kiocb *iocb, const char __user *buf, size_t count, loff_t pos)
+static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
+				unsigned long nr_segs, loff_t pos)
 {
 	struct dentry * dentry = iocb->ki_filp->f_dentry;
 	struct inode * inode = dentry->d_inode;
 	ssize_t result;
+	size_t count = iov_length(iov, nr_segs);
 
 #ifdef CONFIG_NFS_DIRECTIO
 	if (iocb->ki_filp->f_flags & O_DIRECT)
-		return nfs_file_direct_write(iocb, buf, count, pos);
+		return nfs_file_direct_write(iocb, iov, nr_segs, pos);
 #endif
 
-	dfprintk(VFS, "nfs: write(%s/%s(%ld), %lu@%lu)\n",
+	dfprintk(VFS, "nfs: write(%s/%s(%ld), %lu@%Ld)\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name,
-		inode->i_ino, (unsigned long) count, (unsigned long) pos);
+		inode->i_ino, (unsigned long) count, (long long) pos);
 
 	result = -EBUSY;
 	if (IS_SWAPFILE(inode))
@@ -372,7 +374,7 @@ nfs_file_write(struct kiocb *iocb, const char __user *buf, size_t count, loff_t
 		goto out;
 
 	nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, count);
-	result = generic_file_aio_write(iocb, buf, count, pos);
+	result = generic_file_aio_write(iocb, iov, nr_segs, pos);
 out:
 	return result;
 
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 585a79d39c9d..0c46f5c86b71 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -2176,20 +2176,18 @@ out:
 /**
  * ntfs_file_aio_write -
  */
-static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const char __user *buf,
-		size_t count, loff_t pos)
+static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
+		unsigned long nr_segs, loff_t pos)
 {
 	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
 	struct inode *inode = mapping->host;
 	ssize_t ret;
-	struct iovec local_iov = { .iov_base = (void __user *)buf,
-				   .iov_len = count };
 
 	BUG_ON(iocb->ki_pos != pos);
 
 	mutex_lock(&inode->i_mutex);
-	ret = ntfs_file_aio_write_nolock(iocb, &local_iov, 1, &iocb->ki_pos);
+	ret = ntfs_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos);
 	mutex_unlock(&inode->i_mutex);
 	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
 		int err = sync_page_range(inode, mapping, pos, ret);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 2bbfa17090cf..d9ba0a931a03 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -961,25 +961,23 @@ static inline int ocfs2_write_should_remove_suid(struct inode *inode)
 }
 
 static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
-				    const char __user *buf,
-				    size_t count,
+				    const struct iovec *iov,
+				    unsigned long nr_segs,
 				    loff_t pos)
 {
-	struct iovec local_iov = { .iov_base = (void __user *)buf,
-				   .iov_len = count };
 	int ret, rw_level = -1, meta_level = -1, have_alloc_sem = 0;
 	u32 clusters;
 	struct file *filp = iocb->ki_filp;
 	struct inode *inode = filp->f_dentry->d_inode;
 	loff_t newsize, saved_pos;
 
-	mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf,
-		   (unsigned int)count,
+	mlog_entry("(0x%p, %u, '%.*s')\n", filp,
+		   (unsigned int)nr_segs,
 		   filp->f_dentry->d_name.len,
 		   filp->f_dentry->d_name.name);
 
 	/* happy write of zero bytes */
-	if (count == 0)
+	if (iocb->ki_left == 0)
 		return 0;
 
 	if (!inode) {
@@ -1048,7 +1046,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
 		} else {
 			saved_pos = iocb->ki_pos;
 		}
-		newsize = count + saved_pos;
+		newsize = iocb->ki_left + saved_pos;
 
 		mlog(0, "pos=%lld newsize=%lld cursize=%lld\n",
 		     (long long) saved_pos, (long long) newsize,
@@ -1081,7 +1079,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
 		if (!clusters)
 			break;
 
-		ret = ocfs2_extend_file(inode, NULL, newsize, count);
+		ret = ocfs2_extend_file(inode, NULL, newsize, iocb->ki_left);
 		if (ret < 0) {
 			if (ret != -ENOSPC)
 				mlog_errno(ret);
@@ -1098,7 +1096,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
 	/* communicate with ocfs2_dio_end_io */
 	ocfs2_iocb_set_rw_locked(iocb);
 
-	ret = generic_file_aio_write_nolock(iocb, &local_iov, 1, &iocb->ki_pos);
+	ret = generic_file_aio_write_nolock(iocb, iov, nr_segs, iocb->ki_pos);
 
 	/* buffered aio wouldn't have proper lock coverage today */
 	BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
@@ -1132,16 +1130,16 @@ out:
 }
 
 static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
-				   char __user *buf,
-				   size_t count,
+				   const struct iovec *iov,
+				   unsigned long nr_segs,
 				   loff_t pos)
 {
 	int ret = 0, rw_level = -1, have_alloc_sem = 0;
 	struct file *filp = iocb->ki_filp;
 	struct inode *inode = filp->f_dentry->d_inode;
 
-	mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf,
-		   (unsigned int)count,
+	mlog_entry("(0x%p, %u, '%.*s')\n", filp,
+		   (unsigned int)nr_segs,
 		   filp->f_dentry->d_name.len,
 		   filp->f_dentry->d_name.name);
 
@@ -1185,7 +1183,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
 	}
 	ocfs2_meta_unlock(inode, 0);
 
-	ret = generic_file_aio_read(iocb, buf, count, iocb->ki_pos);
+	ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos);
 	if (ret == -EINVAL)
 		mlog(ML_ERROR, "generic_file_aio_read returned -EINVAL\n");
 
diff --git a/fs/read_write.c b/fs/read_write.c
index d4cb3183c99c..679dd535380f 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -227,14 +227,20 @@ static void wait_on_retry_sync_kiocb(struct kiocb *iocb)
 
 ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
 {
+	struct iovec iov = { .iov_base = buf, .iov_len = len };
 	struct kiocb kiocb;
 	ssize_t ret;
 
 	init_sync_kiocb(&kiocb, filp);
 	kiocb.ki_pos = *ppos;
-	while (-EIOCBRETRY ==
-		(ret = filp->f_op->aio_read(&kiocb, buf, len, kiocb.ki_pos)))
+	kiocb.ki_left = len;
+
+	for (;;) {
+		ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
+		if (ret != -EIOCBRETRY)
+			break;
 		wait_on_retry_sync_kiocb(&kiocb);
+	}
 
 	if (-EIOCBQUEUED == ret)
 		ret = wait_on_sync_kiocb(&kiocb);
@@ -279,14 +285,20 @@ EXPORT_SYMBOL(vfs_read);
 
 ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
 {
+	struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
 	struct kiocb kiocb;
 	ssize_t ret;
 
 	init_sync_kiocb(&kiocb, filp);
 	kiocb.ki_pos = *ppos;
-	while (-EIOCBRETRY ==
-	       (ret = filp->f_op->aio_write(&kiocb, buf, len, kiocb.ki_pos)))
+	kiocb.ki_left = len;
+
+	for (;;) {
+		ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
+		if (ret != -EIOCBRETRY)
+			break;
 		wait_on_retry_sync_kiocb(&kiocb);
+	}
 
 	if (-EIOCBQUEUED == ret)
 		ret = wait_on_sync_kiocb(&kiocb);
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index c11f6118c9ca..41f24369e47a 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -1334,7 +1334,7 @@ static ssize_t reiserfs_file_write(struct file *file,	/* the file we are going t
 			if (err)
 				return err;
 		}
-		result = generic_file_write(file, buf, count, ppos);
+		result = do_sync_write(file, buf, count, ppos);
 
 		if (after_file_end) {	/* Now update i_size and remove the savelink */
 			struct reiserfs_transaction_handle th;
@@ -1566,7 +1566,7 @@ static ssize_t reiserfs_file_write(struct file *file,	/* the file we are going t
 }
 
 const struct file_operations reiserfs_file_operations = {
-	.read = generic_file_read,
+	.read = do_sync_read,
 	.write = reiserfs_file_write,
 	.ioctl = reiserfs_ioctl,
 #ifdef CONFIG_COMPAT
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 41cfcba7ce49..4737971c6a39 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -49,50 +49,49 @@ static struct vm_operations_struct xfs_dmapi_file_vm_ops;
 STATIC inline ssize_t
 __xfs_file_read(
 	struct kiocb		*iocb,
-	char			__user *buf,
+	const struct iovec	*iov,
+	unsigned long		nr_segs,
 	int			ioflags,
-	size_t			count,
 	loff_t			pos)
 {
-	struct iovec		iov = {buf, count};
 	struct file		*file = iocb->ki_filp;
 	bhv_vnode_t		*vp = vn_from_inode(file->f_dentry->d_inode);
 
 	BUG_ON(iocb->ki_pos != pos);
 	if (unlikely(file->f_flags & O_DIRECT))
 		ioflags |= IO_ISDIRECT;
-	return bhv_vop_read(vp, iocb, &iov, 1, &iocb->ki_pos, ioflags, NULL);
+	return bhv_vop_read(vp, iocb, iov, nr_segs, &iocb->ki_pos,
+				ioflags, NULL);
 }
 
 STATIC ssize_t
 xfs_file_aio_read(
 	struct kiocb		*iocb,
-	char			__user *buf,
-	size_t			count,
+	const struct iovec	*iov,
+	unsigned long		nr_segs,
 	loff_t			pos)
 {
-	return __xfs_file_read(iocb, buf, IO_ISAIO, count, pos);
+	return __xfs_file_read(iocb, iov, nr_segs, IO_ISAIO, pos);
 }
 
 STATIC ssize_t
 xfs_file_aio_read_invis(
 	struct kiocb		*iocb,
-	char			__user *buf,
-	size_t			count,
+	const struct iovec	*iov,
+	unsigned long		nr_segs,
 	loff_t			pos)
 {
-	return __xfs_file_read(iocb, buf, IO_ISAIO|IO_INVIS, count, pos);
+	return __xfs_file_read(iocb, iov, nr_segs, IO_ISAIO|IO_INVIS, pos);
 }
 
 STATIC inline ssize_t
 __xfs_file_write(
-	struct kiocb	*iocb,
-	const char	__user *buf,
-	int		ioflags,
-	size_t		count,
-	loff_t		pos)
+	struct kiocb		*iocb,
+	const struct iovec	*iov,
+	unsigned long		nr_segs,
+	int			ioflags,
+	loff_t			pos)
 {
-	struct iovec	iov = {(void __user *)buf, count};
 	struct file	*file = iocb->ki_filp;
 	struct inode	*inode = file->f_mapping->host;
 	bhv_vnode_t	*vp = vn_from_inode(inode);
@@ -100,27 +99,28 @@ __xfs_file_write(
 	BUG_ON(iocb->ki_pos != pos);
 	if (unlikely(file->f_flags & O_DIRECT))
 		ioflags |= IO_ISDIRECT;
-	return bhv_vop_write(vp, iocb, &iov, 1, &iocb->ki_pos, ioflags, NULL);
+	return bhv_vop_write(vp, iocb, iov, nr_segs, &iocb->ki_pos,
+				ioflags, NULL);
 }
 
 STATIC ssize_t
 xfs_file_aio_write(
 	struct kiocb		*iocb,
-	const char		__user *buf,
-	size_t			count,
+	const struct iovec	*iov,
+	unsigned long		nr_segs,
 	loff_t			pos)
 {
-	return __xfs_file_write(iocb, buf, IO_ISAIO, count, pos);
+	return __xfs_file_write(iocb, iov, nr_segs, IO_ISAIO, pos);
 }
 
 STATIC ssize_t
 xfs_file_aio_write_invis(
 	struct kiocb		*iocb,
-	const char		__user *buf,
-	size_t			count,
+	const struct iovec	*iov,
+	unsigned long		nr_segs,
 	loff_t			pos)
 {
-	return __xfs_file_write(iocb, buf, IO_ISAIO|IO_INVIS, count, pos);
+	return __xfs_file_write(iocb, iov, nr_segs, IO_ISAIO|IO_INVIS, pos);
 }
 
 STATIC inline ssize_t
diff --git a/include/linux/aio.h b/include/linux/aio.h
index 8a0193385a9b..58349e58b749 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -4,6 +4,7 @@
 #include <linux/list.h>
 #include <linux/workqueue.h>
 #include <linux/aio_abi.h>
+#include <linux/uio.h>
 
 #include <asm/atomic.h>
 
@@ -112,6 +113,7 @@ struct kiocb {
 	long			ki_retried; 	/* just for testing */
 	long			ki_kicked; 	/* just for testing */
 	long			ki_queued; 	/* just for testing */
+	struct iovec		ki_inline_vec;	/* inline vector */
 
 	struct list_head	ki_list;	/* the aio core uses this
 						 * for cancellation */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 5baf3a153403..257bae16f545 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1097,9 +1097,9 @@ struct file_operations {
 	struct module *owner;
 	loff_t (*llseek) (struct file *, loff_t, int);
 	ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
-	ssize_t (*aio_read) (struct kiocb *, char __user *, size_t, loff_t);
 	ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
-	ssize_t (*aio_write) (struct kiocb *, const char __user *, size_t, loff_t);
+	ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
+	ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
 	int (*readdir) (struct file *, void *, filldir_t);
 	unsigned int (*poll) (struct file *, struct poll_table_struct *);
 	int (*ioctl) (struct inode *, struct file *, unsigned int, unsigned long);
@@ -1704,11 +1704,11 @@ extern int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned
 extern ssize_t generic_file_read(struct file *, char __user *, size_t, loff_t *);
 int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk);
 extern ssize_t generic_file_write(struct file *, const char __user *, size_t, loff_t *);
-extern ssize_t generic_file_aio_read(struct kiocb *, char __user *, size_t, loff_t);
+extern ssize_t generic_file_aio_read(struct kiocb *, const struct iovec *, unsigned long, loff_t);
 extern ssize_t __generic_file_aio_read(struct kiocb *, const struct iovec *, unsigned long, loff_t *);
-extern ssize_t generic_file_aio_write(struct kiocb *, const char __user *, size_t, loff_t);
+extern ssize_t generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long, loff_t);
 extern ssize_t generic_file_aio_write_nolock(struct kiocb *, const struct iovec *,
-		unsigned long, loff_t *);
+		unsigned long, loff_t);
 extern ssize_t generic_file_direct_write(struct kiocb *, const struct iovec *,
 		unsigned long *, loff_t, loff_t *, size_t, size_t);
 extern ssize_t generic_file_buffered_write(struct kiocb *, const struct iovec *,
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 98c9b9f667a5..76ff54846ada 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -367,10 +367,12 @@ extern int nfs3_removexattr (struct dentry *, const char *name);
  */
 extern ssize_t nfs_direct_IO(int, struct kiocb *, const struct iovec *, loff_t,
 			unsigned long);
-extern ssize_t nfs_file_direct_read(struct kiocb *iocb, char __user *buf,
-			size_t count, loff_t pos);
-extern ssize_t nfs_file_direct_write(struct kiocb *iocb, const char __user *buf,
-			size_t count, loff_t pos);
+extern ssize_t nfs_file_direct_read(struct kiocb *iocb,
+			const struct iovec *iov, unsigned long nr_segs,
+			loff_t pos);
+extern ssize_t nfs_file_direct_write(struct kiocb *iocb,
+			const struct iovec *iov, unsigned long nr_segs,
+			loff_t pos);
 
 /*
  * linux/fs/nfs/dir.c
diff --git a/include/net/sock.h b/include/net/sock.h
index edd4d73ce7f5..40bb90ebb2d1 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -665,7 +665,6 @@ struct sock_iocb {
 	struct sock		*sk;
 	struct scm_cookie	*scm;
 	struct msghdr		*msg, async_msg;
-	struct iovec		async_iov;
 	struct kiocb		*kiocb;
 };
 
diff --git a/mm/filemap.c b/mm/filemap.c
index c4fe97f5ace0..f6c1d22b504f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1226,12 +1226,11 @@ out:
 EXPORT_SYMBOL(__generic_file_aio_read);
 
 ssize_t
-generic_file_aio_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t pos)
+generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
+		unsigned long nr_segs, loff_t pos)
 {
-	struct iovec local_iov = { .iov_base = buf, .iov_len = count };
-
 	BUG_ON(iocb->ki_pos != pos);
-	return __generic_file_aio_read(iocb, &local_iov, 1, &iocb->ki_pos);
+	return __generic_file_aio_read(iocb, iov, nr_segs, &iocb->ki_pos);
 }
 EXPORT_SYMBOL(generic_file_aio_read);
 
@@ -2315,22 +2314,22 @@ out:
 	current->backing_dev_info = NULL;
 	return written ? written : err;
 }
-EXPORT_SYMBOL(generic_file_aio_write_nolock);
 
-ssize_t
-generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
-				unsigned long nr_segs, loff_t *ppos)
+ssize_t generic_file_aio_write_nolock(struct kiocb *iocb,
+		const struct iovec *iov, unsigned long nr_segs, loff_t pos)
 {
 	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
 	struct inode *inode = mapping->host;
 	ssize_t ret;
-	loff_t pos = *ppos;
 
-	ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs, ppos);
+	BUG_ON(iocb->ki_pos != pos);
+
+	ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs,
+			&iocb->ki_pos);
 
 	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
-		int err;
+		ssize_t err;
 
 		err = sync_page_range_nolock(inode, mapping, pos, ret);
 		if (err < 0)
@@ -2338,6 +2337,7 @@ generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
 	}
 	return ret;
 }
+EXPORT_SYMBOL(generic_file_aio_write_nolock);
 
 static ssize_t
 __generic_file_write_nolock(struct file *file, const struct iovec *iov,
@@ -2347,8 +2347,9 @@ __generic_file_write_nolock(struct file *file, const struct iovec *iov,
 	ssize_t ret;
 
 	init_sync_kiocb(&kiocb, file);
+	kiocb.ki_pos = *ppos;
 	ret = __generic_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos);
-	if (ret == -EIOCBQUEUED)
+	if (-EIOCBQUEUED == ret)
 		ret = wait_on_sync_kiocb(&kiocb);
 	return ret;
 }
@@ -2361,28 +2362,28 @@ generic_file_write_nolock(struct file *file, const struct iovec *iov,
 	ssize_t ret;
 
 	init_sync_kiocb(&kiocb, file);
-	ret = generic_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos);
+	kiocb.ki_pos = *ppos;
+	ret = generic_file_aio_write_nolock(&kiocb, iov, nr_segs, *ppos);
 	if (-EIOCBQUEUED == ret)
 		ret = wait_on_sync_kiocb(&kiocb);
+	*ppos = kiocb.ki_pos;
 	return ret;
 }
 EXPORT_SYMBOL(generic_file_write_nolock);
 
-ssize_t generic_file_aio_write(struct kiocb *iocb, const char __user *buf,
-			       size_t count, loff_t pos)
+ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
+		unsigned long nr_segs, loff_t pos)
 {
 	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
 	struct inode *inode = mapping->host;
 	ssize_t ret;
-	struct iovec local_iov = { .iov_base = (void __user *)buf,
-					.iov_len = count };
 
 	BUG_ON(iocb->ki_pos != pos);
 
 	mutex_lock(&inode->i_mutex);
-	ret = __generic_file_aio_write_nolock(iocb, &local_iov, 1,
-						&iocb->ki_pos);
+	ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs,
+			&iocb->ki_pos);
 	mutex_unlock(&inode->i_mutex);
 
 	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
diff --git a/net/socket.c b/net/socket.c
index 1bc4167e0da8..df92e4252749 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -95,10 +95,10 @@
 #include <linux/netfilter.h>
 
 static int sock_no_open(struct inode *irrelevant, struct file *dontcare);
-static ssize_t sock_aio_read(struct kiocb *iocb, char __user *buf,
-			     size_t size, loff_t pos);
-static ssize_t sock_aio_write(struct kiocb *iocb, const char __user *buf,
-			      size_t size, loff_t pos);
+static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov,
+			 unsigned long nr_segs, loff_t pos);
+static ssize_t sock_aio_write(struct kiocb *iocb, const struct iovec *iov,
+			  unsigned long nr_segs, loff_t pos);
 static int sock_mmap(struct file *file, struct vm_area_struct *vma);
 
 static int sock_close(struct inode *inode, struct file *file);
@@ -664,7 +664,6 @@ static ssize_t sock_sendpage(struct file *file, struct page *page,
 }
 
 static struct sock_iocb *alloc_sock_iocb(struct kiocb *iocb,
-					 char __user *ubuf, size_t size,
 					 struct sock_iocb *siocb)
 {
 	if (!is_sync_kiocb(iocb)) {
@@ -675,16 +674,13 @@ static struct sock_iocb *alloc_sock_iocb(struct kiocb *iocb,
 	}
 
 	siocb->kiocb = iocb;
-	siocb->async_iov.iov_base = ubuf;
-	siocb->async_iov.iov_len = size;
-
 	iocb->private = siocb;
 	return siocb;
 }
 
 static ssize_t do_sock_read(struct msghdr *msg, struct kiocb *iocb,
-			    struct file *file, struct iovec *iov,
-			    unsigned long nr_segs)
+		struct file *file, const struct iovec *iov,
+		unsigned long nr_segs)
 {
 	struct socket *sock = file->private_data;
 	size_t size = 0;
@@ -715,32 +711,33 @@ static ssize_t sock_readv(struct file *file, const struct iovec *iov,
 	init_sync_kiocb(&iocb, NULL);
 	iocb.private = &siocb;
 
-	ret = do_sock_read(&msg, &iocb, file, (struct iovec *)iov, nr_segs);
+	ret = do_sock_read(&msg, &iocb, file, iov, nr_segs);
 	if (-EIOCBQUEUED == ret)
 		ret = wait_on_sync_kiocb(&iocb);
 	return ret;
 }
 
-static ssize_t sock_aio_read(struct kiocb *iocb, char __user *ubuf,
-			     size_t count, loff_t pos)
+static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov,
+				unsigned long nr_segs, loff_t pos)
 {
 	struct sock_iocb siocb, *x;
 
 	if (pos != 0)
 		return -ESPIPE;
-	if (count == 0)		/* Match SYS5 behaviour */
+
+	if (iocb->ki_left == 0)	/* Match SYS5 behaviour */
 		return 0;
 
-	x = alloc_sock_iocb(iocb, ubuf, count, &siocb);
+
+	x = alloc_sock_iocb(iocb, &siocb);
 	if (!x)
 		return -ENOMEM;
-	return do_sock_read(&x->async_msg, iocb, iocb->ki_filp,
-			    &x->async_iov, 1);
+	return do_sock_read(&x->async_msg, iocb, iocb->ki_filp, iov, nr_segs);
 }
 
 static ssize_t do_sock_write(struct msghdr *msg, struct kiocb *iocb,
-			     struct file *file, struct iovec *iov,
-			     unsigned long nr_segs)
+			struct file *file, const struct iovec *iov,
+			unsigned long nr_segs)
 {
 	struct socket *sock = file->private_data;
 	size_t size = 0;
@@ -773,28 +770,28 @@ static ssize_t sock_writev(struct file *file, const struct iovec *iov,
 	init_sync_kiocb(&iocb, NULL);
 	iocb.private = &siocb;
 
-	ret = do_sock_write(&msg, &iocb, file, (struct iovec *)iov, nr_segs);
+	ret = do_sock_write(&msg, &iocb, file, iov, nr_segs);
 	if (-EIOCBQUEUED == ret)
 		ret = wait_on_sync_kiocb(&iocb);
 	return ret;
 }
 
-static ssize_t sock_aio_write(struct kiocb *iocb, const char __user *ubuf,
-			      size_t count, loff_t pos)
+static ssize_t sock_aio_write(struct kiocb *iocb, const struct iovec *iov,
+			  unsigned long nr_segs, loff_t pos)
 {
 	struct sock_iocb siocb, *x;
 
 	if (pos != 0)
 		return -ESPIPE;
-	if (count == 0)		/* Match SYS5 behaviour */
+
+	if (iocb->ki_left == 0)	/* Match SYS5 behaviour */
 		return 0;
 
-	x = alloc_sock_iocb(iocb, (void __user *)ubuf, count, &siocb);
+	x = alloc_sock_iocb(iocb, &siocb);
 	if (!x)
 		return -ENOMEM;
 
-	return do_sock_write(&x->async_msg, iocb, iocb->ki_filp,
-			     &x->async_iov, 1);
+	return do_sock_write(&x->async_msg, iocb, iocb->ki_filp, iov, nr_segs);
 }
 
 /*
-- 
cgit v1.2.3


From ee0b3e671baff681d69fbf0db33b47603c0a8280 Mon Sep 17 00:00:00 2001
From: Badari Pulavarty <pbadari@us.ibm.com>
Date: Sat, 30 Sep 2006 23:28:47 -0700
Subject: [PATCH] Remove readv/writev methods and use aio_read/aio_write
 instead

This patch removes readv() and writev() methods and replaces them with
aio_read()/aio_write() methods.

Signed-off-by: Badari Pulavarty <pbadari@us.ibm.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/char/raw.c          |   2 -
 drivers/net/tun.c           |  37 +++++-----------
 fs/bad_inode.c              |   2 -
 fs/block_dev.c              |   2 -
 fs/cifs/cifsfs.c            |  16 -------
 fs/compat.c                 |  44 +++++--------------
 fs/ext2/file.c              |   2 -
 fs/ext3/file.c              |   2 -
 fs/fat/file.c               |   2 -
 fs/fuse/dev.c               |  37 +++++-----------
 fs/hostfs/hostfs_kern.c     |   2 -
 fs/jfs/file.c               |   2 -
 fs/ntfs/file.c              |   2 -
 fs/pipe.c                   |  59 ++++++++++----------------
 fs/read_write.c             | 101 +++++++++++++++++++++++++++++---------------
 fs/read_write.h             |  14 ++++++
 fs/xfs/linux-2.6/xfs_file.c |  94 -----------------------------------------
 include/linux/fs.h          |   6 ---
 mm/filemap.c                |  36 ----------------
 net/socket.c                |  40 ------------------
 sound/core/pcm_native.c     |  40 +++++++++---------
 21 files changed, 154 insertions(+), 388 deletions(-)
 create mode 100644 fs/read_write.h

(limited to 'mm')

diff --git a/drivers/char/raw.c b/drivers/char/raw.c
index 173fb08555d5..490db531e2d8 100644
--- a/drivers/char/raw.c
+++ b/drivers/char/raw.c
@@ -257,8 +257,6 @@ static const struct file_operations raw_fops = {
 	.open	=	raw_open,
 	.release=	raw_release,
 	.ioctl	=	raw_ioctl,
-	.readv	= 	generic_file_readv,
-	.writev	= 	generic_file_writev,
 	.owner	=	THIS_MODULE,
 };
 
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index de8da6dee1b0..bc0face01d25 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -288,11 +288,10 @@ static inline size_t iov_total(const struct iovec *iv, unsigned long count)
 	return len;
 }
 
-/* Writev */
-static ssize_t tun_chr_writev(struct file * file, const struct iovec *iv,
-			      unsigned long count, loff_t *pos)
+static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv,
+			      unsigned long count, loff_t pos)
 {
-	struct tun_struct *tun = file->private_data;
+	struct tun_struct *tun = iocb->ki_filp->private_data;
 
 	if (!tun)
 		return -EBADFD;
@@ -302,14 +301,6 @@ static ssize_t tun_chr_writev(struct file * file, const struct iovec *iv,
 	return tun_get_user(tun, (struct iovec *) iv, iov_total(iv, count));
 }
 
-/* Write */
-static ssize_t tun_chr_write(struct file * file, const char __user * buf,
-			     size_t count, loff_t *pos)
-{
-	struct iovec iv = { (void __user *) buf, count };
-	return tun_chr_writev(file, &iv, 1, pos);
-}
-
 /* Put packet to the user space buffer */
 static __inline__ ssize_t tun_put_user(struct tun_struct *tun,
 				       struct sk_buff *skb,
@@ -343,10 +334,10 @@ static __inline__ ssize_t tun_put_user(struct tun_struct *tun,
 	return total;
 }
 
-/* Readv */
-static ssize_t tun_chr_readv(struct file *file, const struct iovec *iv,
-			    unsigned long count, loff_t *pos)
+static ssize_t tun_chr_aio_read(struct kiocb *iocb, const struct iovec *iv,
+			    unsigned long count, loff_t pos)
 {
+	struct file *file = iocb->ki_filp;
 	struct tun_struct *tun = file->private_data;
 	DECLARE_WAITQUEUE(wait, current);
 	struct sk_buff *skb;
@@ -426,14 +417,6 @@ static ssize_t tun_chr_readv(struct file *file, const struct iovec *iv,
 	return ret;
 }
 
-/* Read */
-static ssize_t tun_chr_read(struct file * file, char __user * buf,
-			    size_t count, loff_t *pos)
-{
-	struct iovec iv = { buf, count };
-	return tun_chr_readv(file, &iv, 1, pos);
-}
-
 static void tun_setup(struct net_device *dev)
 {
 	struct tun_struct *tun = netdev_priv(dev);
@@ -764,10 +747,10 @@ static int tun_chr_close(struct inode *inode, struct file *file)
 static struct file_operations tun_fops = {
 	.owner	= THIS_MODULE,
 	.llseek = no_llseek,
-	.read	= tun_chr_read,
-	.readv	= tun_chr_readv,
-	.write	= tun_chr_write,
-	.writev = tun_chr_writev,
+	.read  = do_sync_read,
+	.aio_read  = tun_chr_aio_read,
+	.write = do_sync_write,
+	.aio_write = tun_chr_aio_write,
 	.poll	= tun_chr_poll,
 	.ioctl	= tun_chr_ioctl,
 	.open	= tun_chr_open,
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 80599ae33966..34e6d7b220c3 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -40,8 +40,6 @@ static const struct file_operations bad_file_ops =
 	.aio_fsync	= EIO_ERROR,
 	.fasync		= EIO_ERROR,
 	.lock		= EIO_ERROR,
-	.readv		= EIO_ERROR,
-	.writev		= EIO_ERROR,
 	.sendfile	= EIO_ERROR,
 	.sendpage	= EIO_ERROR,
 	.get_unmapped_area = EIO_ERROR,
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 8c819117310b..0f143094ef1d 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1191,8 +1191,6 @@ const struct file_operations def_blk_fops = {
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= compat_blkdev_ioctl,
 #endif
-	.readv		= generic_file_readv,
-	.writev		= generic_file_write_nolock,
 	.sendfile	= generic_file_sendfile,
 	.splice_read	= generic_file_splice_read,
 	.splice_write	= generic_file_splice_write,
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 5abb42a7c53e..c00c654f2e11 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -480,18 +480,6 @@ cifs_get_sb(struct file_system_type *fs_type,
 	return simple_set_mnt(mnt, sb);
 }
 
-static ssize_t cifs_file_writev(struct file *file, const struct iovec *iov,
-				unsigned long nr_segs, loff_t *ppos)
-{
-	struct inode *inode = file->f_dentry->d_inode;
-	ssize_t written;
-
-	written = generic_file_writev(file, iov, nr_segs, ppos);
-	if (!CIFS_I(inode)->clientCanCacheAll)
-		filemap_fdatawrite(inode->i_mapping);
-	return written;
-}
-
 static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 				   unsigned long nr_segs, loff_t pos)
 {
@@ -577,8 +565,6 @@ struct inode_operations cifs_symlink_inode_ops = {
 const struct file_operations cifs_file_ops = {
 	.read = do_sync_read,
 	.write = do_sync_write,
-	.readv = generic_file_readv,
-	.writev = cifs_file_writev,
 	.aio_read = generic_file_aio_read,
 	.aio_write = cifs_file_aio_write,
 	.open = cifs_open,
@@ -620,8 +606,6 @@ const struct file_operations cifs_file_direct_ops = {
 const struct file_operations cifs_file_nobrl_ops = {
 	.read = do_sync_read,
 	.write = do_sync_write,
-	.readv = generic_file_readv,
-	.writev = cifs_file_writev,
 	.aio_read = generic_file_aio_read,
 	.aio_write = cifs_file_aio_write,
 	.open = cifs_open,
diff --git a/fs/compat.c b/fs/compat.c
index 122b4e3992b5..6b90bf35f61d 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -70,6 +70,8 @@ int compat_printk(const char *fmt, ...)
 	return ret;
 }
 
+#include "read_write.h"
+
 /*
  * Not all architectures have sys_utime, so implement this in terms
  * of sys_utimes.
@@ -1149,9 +1151,6 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
 			       const struct compat_iovec __user *uvector,
 			       unsigned long nr_segs, loff_t *pos)
 {
-	typedef ssize_t (*io_fn_t)(struct file *, char __user *, size_t, loff_t *);
-	typedef ssize_t (*iov_fn_t)(struct file *, const struct iovec *, unsigned long, loff_t *);
-
 	compat_ssize_t tot_len;
 	struct iovec iovstack[UIO_FASTIOV];
 	struct iovec *iov=iovstack, *vector;
@@ -1234,39 +1233,18 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
 	fnv = NULL;
 	if (type == READ) {
 		fn = file->f_op->read;
-		fnv = file->f_op->readv;
+		fnv = file->f_op->aio_read;
 	} else {
 		fn = (io_fn_t)file->f_op->write;
-		fnv = file->f_op->writev;
-	}
-	if (fnv) {
-		ret = fnv(file, iov, nr_segs, pos);
-		goto out;
+		fnv = file->f_op->aio_write;
 	}
 
-	/* Do it by hand, with file-ops */
-	ret = 0;
-	vector = iov;
-	while (nr_segs > 0) {
-		void __user * base;
-		size_t len;
-		ssize_t nr;
-
-		base = vector->iov_base;
-		len = vector->iov_len;
-		vector++;
-		nr_segs--;
-
-		nr = fn(file, base, len, pos);
+	if (fnv)
+		ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
+						pos, fnv);
+	else
+		ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);
 
-		if (nr < 0) {
-			if (!ret) ret = nr;
-			break;
-		}
-		ret += nr;
-		if (nr != len)
-			break;
-	}
 out:
 	if (iov != iovstack)
 		kfree(iov);
@@ -1294,7 +1272,7 @@ compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec, unsign
 		goto out;
 
 	ret = -EINVAL;
-	if (!file->f_op || (!file->f_op->readv && !file->f_op->read))
+	if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read))
 		goto out;
 
 	ret = compat_do_readv_writev(READ, file, vec, vlen, &file->f_pos);
@@ -1317,7 +1295,7 @@ compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec, unsig
 		goto out;
 
 	ret = -EINVAL;
-	if (!file->f_op || (!file->f_op->writev && !file->f_op->write))
+	if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write))
 		goto out;
 
 	ret = compat_do_readv_writev(WRITE, file, vec, vlen, &file->f_pos);
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index e8bbed9dd268..e893e2be9ed4 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -53,8 +53,6 @@ const struct file_operations ext2_file_operations = {
 	.open		= generic_file_open,
 	.release	= ext2_release_file,
 	.fsync		= ext2_sync_file,
-	.readv		= generic_file_readv,
-	.writev		= generic_file_writev,
 	.sendfile	= generic_file_sendfile,
 	.splice_read	= generic_file_splice_read,
 	.splice_write	= generic_file_splice_write,
diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index 5c762457bc89..e96c388047e0 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -112,8 +112,6 @@ const struct file_operations ext3_file_operations = {
 	.write		= do_sync_write,
 	.aio_read	= generic_file_aio_read,
 	.aio_write	= ext3_file_write,
-	.readv		= generic_file_readv,
-	.writev		= generic_file_writev,
 	.ioctl		= ext3_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ext3_compat_ioctl,
diff --git a/fs/fat/file.c b/fs/fat/file.c
index d50fc47169c1..f4b8f8b3fbdd 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -127,8 +127,6 @@ const struct file_operations fat_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
 	.write		= do_sync_write,
-	.readv		= generic_file_readv,
-	.writev		= generic_file_writev,
 	.aio_read	= generic_file_aio_read,
 	.aio_write	= generic_file_aio_write,
 	.mmap		= generic_file_mmap,
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 4fc557c40cc0..66571eafbb1e 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -680,14 +680,15 @@ static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_req *req,
  * request_end().  Otherwise add it to the processing list, and set
  * the 'sent' flag.
  */
-static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
-			      unsigned long nr_segs, loff_t *off)
+static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
+			      unsigned long nr_segs, loff_t pos)
 {
 	int err;
 	struct fuse_req *req;
 	struct fuse_in *in;
 	struct fuse_copy_state cs;
 	unsigned reqsize;
+	struct file *file = iocb->ki_filp;
 	struct fuse_conn *fc = fuse_get_conn(file);
 	if (!fc)
 		return -EPERM;
@@ -761,15 +762,6 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
 	return err;
 }
 
-static ssize_t fuse_dev_read(struct file *file, char __user *buf,
-			     size_t nbytes, loff_t *off)
-{
-	struct iovec iov;
-	iov.iov_len = nbytes;
-	iov.iov_base = buf;
-	return fuse_dev_readv(file, &iov, 1, off);
-}
-
 /* Look up request on processing list by unique ID */
 static struct fuse_req *request_find(struct fuse_conn *fc, u64 unique)
 {
@@ -814,15 +806,15 @@ static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out,
  * it from the list and copy the rest of the buffer to the request.
  * The request is finished by calling request_end()
  */
-static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov,
-			       unsigned long nr_segs, loff_t *off)
+static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
+			       unsigned long nr_segs, loff_t pos)
 {
 	int err;
 	unsigned nbytes = iov_length(iov, nr_segs);
 	struct fuse_req *req;
 	struct fuse_out_header oh;
 	struct fuse_copy_state cs;
-	struct fuse_conn *fc = fuse_get_conn(file);
+	struct fuse_conn *fc = fuse_get_conn(iocb->ki_filp);
 	if (!fc)
 		return -EPERM;
 
@@ -898,15 +890,6 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov,
 	return err;
 }
 
-static ssize_t fuse_dev_write(struct file *file, const char __user *buf,
-			      size_t nbytes, loff_t *off)
-{
-	struct iovec iov;
-	iov.iov_len = nbytes;
-	iov.iov_base = (char __user *) buf;
-	return fuse_dev_writev(file, &iov, 1, off);
-}
-
 static unsigned fuse_dev_poll(struct file *file, poll_table *wait)
 {
 	unsigned mask = POLLOUT | POLLWRNORM;
@@ -1041,10 +1024,10 @@ static int fuse_dev_fasync(int fd, struct file *file, int on)
 const struct file_operations fuse_dev_operations = {
 	.owner		= THIS_MODULE,
 	.llseek		= no_llseek,
-	.read		= fuse_dev_read,
-	.readv		= fuse_dev_readv,
-	.write		= fuse_dev_write,
-	.writev		= fuse_dev_writev,
+	.read		= do_sync_read,
+	.aio_read	= fuse_dev_read,
+	.write		= do_sync_write,
+	.aio_write	= fuse_dev_write,
 	.poll		= fuse_dev_poll,
 	.release	= fuse_dev_release,
 	.fasync		= fuse_dev_fasync,
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 322e876c35ed..4908c38a5885 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -389,8 +389,6 @@ static const struct file_operations hostfs_file_fops = {
 	.sendfile	= generic_file_sendfile,
 	.aio_read	= generic_file_aio_read,
 	.aio_write	= generic_file_aio_write,
-	.readv		= generic_file_readv,
-	.writev		= generic_file_writev,
 	.write		= generic_file_write,
 	.mmap		= generic_file_mmap,
 	.open		= hostfs_file_open,
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 1c9745be5ada..f535f2911c12 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -108,8 +108,6 @@ const struct file_operations jfs_file_operations = {
 	.aio_read	= generic_file_aio_read,
 	.aio_write	= generic_file_aio_write,
 	.mmap		= generic_file_mmap,
-	.readv		= generic_file_readv,
-	.writev		= generic_file_writev,
  	.sendfile	= generic_file_sendfile,
 	.fsync		= jfs_fsync,
 	.release	= jfs_release,
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 0c46f5c86b71..2f9b5a0953ff 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -2298,11 +2298,9 @@ const struct file_operations ntfs_file_ops = {
 	.llseek		= generic_file_llseek,	 /* Seek inside file. */
 	.read		= generic_file_read,	 /* Read from file. */
 	.aio_read	= generic_file_aio_read, /* Async read from file. */
-	.readv		= generic_file_readv,	 /* Read from file. */
 #ifdef NTFS_RW
 	.write		= ntfs_file_write,	 /* Write to file. */
 	.aio_write	= ntfs_file_aio_write,	 /* Async write to file. */
-	.writev		= ntfs_file_writev,	 /* Write to file. */
 	/*.release	= ,*/			 /* Last file is closed.  See
 						    fs/ext2/file.c::
 						    ext2_release_file() for
diff --git a/fs/pipe.c b/fs/pipe.c
index f3b6f71e9d0b..2e60e1c8815e 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -218,9 +218,10 @@ static struct pipe_buf_operations anon_pipe_buf_ops = {
 };
 
 static ssize_t
-pipe_readv(struct file *filp, const struct iovec *_iov,
-	   unsigned long nr_segs, loff_t *ppos)
+pipe_read(struct kiocb *iocb, const struct iovec *_iov,
+	   unsigned long nr_segs, loff_t pos)
 {
+	struct file *filp = iocb->ki_filp;
 	struct inode *inode = filp->f_dentry->d_inode;
 	struct pipe_inode_info *pipe;
 	int do_wakeup;
@@ -330,17 +331,10 @@ redo:
 }
 
 static ssize_t
-pipe_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
-{
-	struct iovec iov = { .iov_base = buf, .iov_len = count };
-
-	return pipe_readv(filp, &iov, 1, ppos);
-}
-
-static ssize_t
-pipe_writev(struct file *filp, const struct iovec *_iov,
-	    unsigned long nr_segs, loff_t *ppos)
+pipe_write(struct kiocb *iocb, const struct iovec *_iov,
+	    unsigned long nr_segs, loff_t ppos)
 {
+	struct file *filp = iocb->ki_filp;
 	struct inode *inode = filp->f_dentry->d_inode;
 	struct pipe_inode_info *pipe;
 	ssize_t ret;
@@ -509,15 +503,6 @@ out:
 	return ret;
 }
 
-static ssize_t
-pipe_write(struct file *filp, const char __user *buf,
-	   size_t count, loff_t *ppos)
-{
-	struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count };
-
-	return pipe_writev(filp, &iov, 1, ppos);
-}
-
 static ssize_t
 bad_pipe_r(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
 {
@@ -736,8 +721,8 @@ pipe_rdwr_open(struct inode *inode, struct file *filp)
  */
 const struct file_operations read_fifo_fops = {
 	.llseek		= no_llseek,
-	.read		= pipe_read,
-	.readv		= pipe_readv,
+	.read		= do_sync_read,
+	.aio_read	= pipe_read,
 	.write		= bad_pipe_w,
 	.poll		= pipe_poll,
 	.ioctl		= pipe_ioctl,
@@ -749,8 +734,8 @@ const struct file_operations read_fifo_fops = {
 const struct file_operations write_fifo_fops = {
 	.llseek		= no_llseek,
 	.read		= bad_pipe_r,
-	.write		= pipe_write,
-	.writev		= pipe_writev,
+	.write		= do_sync_write,
+	.aio_write	= pipe_write,
 	.poll		= pipe_poll,
 	.ioctl		= pipe_ioctl,
 	.open		= pipe_write_open,
@@ -760,10 +745,10 @@ const struct file_operations write_fifo_fops = {
 
 const struct file_operations rdwr_fifo_fops = {
 	.llseek		= no_llseek,
-	.read		= pipe_read,
-	.readv		= pipe_readv,
-	.write		= pipe_write,
-	.writev		= pipe_writev,
+	.read		= do_sync_read,
+	.aio_read	= pipe_read,
+	.write		= do_sync_write,
+	.aio_write	= pipe_write,
 	.poll		= pipe_poll,
 	.ioctl		= pipe_ioctl,
 	.open		= pipe_rdwr_open,
@@ -773,8 +758,8 @@ const struct file_operations rdwr_fifo_fops = {
 
 static struct file_operations read_pipe_fops = {
 	.llseek		= no_llseek,
-	.read		= pipe_read,
-	.readv		= pipe_readv,
+	.read		= do_sync_read,
+	.aio_read	= pipe_read,
 	.write		= bad_pipe_w,
 	.poll		= pipe_poll,
 	.ioctl		= pipe_ioctl,
@@ -786,8 +771,8 @@ static struct file_operations read_pipe_fops = {
 static struct file_operations write_pipe_fops = {
 	.llseek		= no_llseek,
 	.read		= bad_pipe_r,
-	.write		= pipe_write,
-	.writev		= pipe_writev,
+	.write		= do_sync_write,
+	.aio_write	= pipe_write,
 	.poll		= pipe_poll,
 	.ioctl		= pipe_ioctl,
 	.open		= pipe_write_open,
@@ -797,10 +782,10 @@ static struct file_operations write_pipe_fops = {
 
 static struct file_operations rdwr_pipe_fops = {
 	.llseek		= no_llseek,
-	.read		= pipe_read,
-	.readv		= pipe_readv,
-	.write		= pipe_write,
-	.writev		= pipe_writev,
+	.read		= do_sync_read,
+	.aio_read	= pipe_read,
+	.write		= do_sync_write,
+	.aio_write	= pipe_write,
 	.poll		= pipe_poll,
 	.ioctl		= pipe_ioctl,
 	.open		= pipe_rdwr_open,
diff --git a/fs/read_write.c b/fs/read_write.c
index 679dd535380f..32d54cca9bd9 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -15,6 +15,7 @@
 #include <linux/module.h>
 #include <linux/syscalls.h>
 #include <linux/pagemap.h>
+#include "read_write.h"
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -450,6 +451,62 @@ unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
 
 EXPORT_UNUSED_SYMBOL(iov_shorten);  /*  June 2006  */
 
+ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov,
+		unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn)
+{
+	struct kiocb kiocb;
+	ssize_t ret;
+
+	init_sync_kiocb(&kiocb, filp);
+	kiocb.ki_pos = *ppos;
+	kiocb.ki_left = len;
+	kiocb.ki_nbytes = len;
+
+	for (;;) {
+		ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos);
+		if (ret != -EIOCBRETRY)
+			break;
+		wait_on_retry_sync_kiocb(&kiocb);
+	}
+
+	if (ret == -EIOCBQUEUED)
+		ret = wait_on_sync_kiocb(&kiocb);
+	*ppos = kiocb.ki_pos;
+	return ret;
+}
+
+/* Do it by hand, with file-ops */
+ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov,
+		unsigned long nr_segs, loff_t *ppos, io_fn_t fn)
+{
+	struct iovec *vector = iov;
+	ssize_t ret = 0;
+
+	while (nr_segs > 0) {
+		void __user *base;
+		size_t len;
+		ssize_t nr;
+
+		base = vector->iov_base;
+		len = vector->iov_len;
+		vector++;
+		nr_segs--;
+
+		nr = fn(filp, base, len, ppos);
+
+		if (nr < 0) {
+			if (!ret)
+				ret = nr;
+			break;
+		}
+		ret += nr;
+		if (nr != len)
+			break;
+	}
+
+	return ret;
+}
+
 /* A write operation does a read from user space and vice versa */
 #define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)
 
@@ -457,12 +514,9 @@ static ssize_t do_readv_writev(int type, struct file *file,
 			       const struct iovec __user * uvector,
 			       unsigned long nr_segs, loff_t *pos)
 {
-	typedef ssize_t (*io_fn_t)(struct file *, char __user *, size_t, loff_t *);
-	typedef ssize_t (*iov_fn_t)(struct file *, const struct iovec *, unsigned long, loff_t *);
-
 	size_t tot_len;
 	struct iovec iovstack[UIO_FASTIOV];
-	struct iovec *iov=iovstack, *vector;
+	struct iovec *iov = iovstack;
 	ssize_t ret;
 	int seg;
 	io_fn_t fn;
@@ -532,39 +586,18 @@ static ssize_t do_readv_writev(int type, struct file *file,
 	fnv = NULL;
 	if (type == READ) {
 		fn = file->f_op->read;
-		fnv = file->f_op->readv;
+		fnv = file->f_op->aio_read;
 	} else {
 		fn = (io_fn_t)file->f_op->write;
-		fnv = file->f_op->writev;
-	}
-	if (fnv) {
-		ret = fnv(file, iov, nr_segs, pos);
-		goto out;
+		fnv = file->f_op->aio_write;
 	}
 
-	/* Do it by hand, with file-ops */
-	ret = 0;
-	vector = iov;
-	while (nr_segs > 0) {
-		void __user * base;
-		size_t len;
-		ssize_t nr;
-
-		base = vector->iov_base;
-		len = vector->iov_len;
-		vector++;
-		nr_segs--;
-
-		nr = fn(file, base, len, pos);
+	if (fnv)
+		ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
+						pos, fnv);
+	else
+		ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);
 
-		if (nr < 0) {
-			if (!ret) ret = nr;
-			break;
-		}
-		ret += nr;
-		if (nr != len)
-			break;
-	}
 out:
 	if (iov != iovstack)
 		kfree(iov);
@@ -585,7 +618,7 @@ ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
 {
 	if (!(file->f_mode & FMODE_READ))
 		return -EBADF;
-	if (!file->f_op || (!file->f_op->readv && !file->f_op->read))
+	if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read))
 		return -EINVAL;
 
 	return do_readv_writev(READ, file, vec, vlen, pos);
@@ -598,7 +631,7 @@ ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
 {
 	if (!(file->f_mode & FMODE_WRITE))
 		return -EBADF;
-	if (!file->f_op || (!file->f_op->writev && !file->f_op->write))
+	if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write))
 		return -EINVAL;
 
 	return do_readv_writev(WRITE, file, vec, vlen, pos);
diff --git a/fs/read_write.h b/fs/read_write.h
new file mode 100644
index 000000000000..d07b954c6e0c
--- /dev/null
+++ b/fs/read_write.h
@@ -0,0 +1,14 @@
+/*
+ * This file is only for sharing some helpers from read_write.c with compat.c.
+ * Don't use anywhere else.
+ */
+
+
+typedef ssize_t (*io_fn_t)(struct file *, char __user *, size_t, loff_t *);
+typedef ssize_t (*iov_fn_t)(struct kiocb *, const struct iovec *,
+		unsigned long, loff_t);
+
+ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov,
+		unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn);
+ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov,
+		unsigned long nr_segs, loff_t *ppos, io_fn_t fn);
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 4737971c6a39..d93d8dd1958d 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -123,96 +123,6 @@ xfs_file_aio_write_invis(
 	return __xfs_file_write(iocb, iov, nr_segs, IO_ISAIO|IO_INVIS, pos);
 }
 
-STATIC inline ssize_t
-__xfs_file_readv(
-	struct file		*file,
-	const struct iovec 	*iov,
-	int			ioflags,
-	unsigned long		nr_segs,
-	loff_t			*ppos)
-{
-	struct inode	*inode = file->f_mapping->host;
-	bhv_vnode_t	*vp = vn_from_inode(inode);
-	struct kiocb	kiocb;
-	ssize_t		rval;
-
-	init_sync_kiocb(&kiocb, file);
-	kiocb.ki_pos = *ppos;
-
-	if (unlikely(file->f_flags & O_DIRECT))
-		ioflags |= IO_ISDIRECT;
-	rval = bhv_vop_read(vp, &kiocb, iov, nr_segs,
-				&kiocb.ki_pos, ioflags, NULL);
-
-	*ppos = kiocb.ki_pos;
-	return rval;
-}
-
-STATIC ssize_t
-xfs_file_readv(
-	struct file		*file,
-	const struct iovec 	*iov,
-	unsigned long		nr_segs,
-	loff_t			*ppos)
-{
-	return __xfs_file_readv(file, iov, 0, nr_segs, ppos);
-}
-
-STATIC ssize_t
-xfs_file_readv_invis(
-	struct file		*file,
-	const struct iovec 	*iov,
-	unsigned long		nr_segs,
-	loff_t			*ppos)
-{
-	return __xfs_file_readv(file, iov, IO_INVIS, nr_segs, ppos);
-}
-
-STATIC inline ssize_t
-__xfs_file_writev(
-	struct file		*file,
-	const struct iovec 	*iov,
-	int			ioflags,
-	unsigned long		nr_segs,
-	loff_t			*ppos)
-{
-	struct inode	*inode = file->f_mapping->host;
-	bhv_vnode_t	*vp = vn_from_inode(inode);
-	struct kiocb	kiocb;
-	ssize_t		rval;
-
-	init_sync_kiocb(&kiocb, file);
-	kiocb.ki_pos = *ppos;
-	if (unlikely(file->f_flags & O_DIRECT))
-		ioflags |= IO_ISDIRECT;
-
-	rval = bhv_vop_write(vp, &kiocb, iov, nr_segs,
-				 &kiocb.ki_pos, ioflags, NULL);
-
-	*ppos = kiocb.ki_pos;
-	return rval;
-}
-
-STATIC ssize_t
-xfs_file_writev(
-	struct file		*file,
-	const struct iovec 	*iov,
-	unsigned long		nr_segs,
-	loff_t			*ppos)
-{
-	return __xfs_file_writev(file, iov, 0, nr_segs, ppos);
-}
-
-STATIC ssize_t
-xfs_file_writev_invis(
-	struct file		*file,
-	const struct iovec 	*iov,
-	unsigned long		nr_segs,
-	loff_t			*ppos)
-{
-	return __xfs_file_writev(file, iov, IO_INVIS, nr_segs, ppos);
-}
-
 STATIC ssize_t
 xfs_file_sendfile(
 	struct file		*filp,
@@ -540,8 +450,6 @@ const struct file_operations xfs_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
 	.write		= do_sync_write,
-	.readv		= xfs_file_readv,
-	.writev		= xfs_file_writev,
 	.aio_read	= xfs_file_aio_read,
 	.aio_write	= xfs_file_aio_write,
 	.sendfile	= xfs_file_sendfile,
@@ -565,8 +473,6 @@ const struct file_operations xfs_invis_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
 	.write		= do_sync_write,
-	.readv		= xfs_file_readv_invis,
-	.writev		= xfs_file_writev_invis,
 	.aio_read	= xfs_file_aio_read_invis,
 	.aio_write	= xfs_file_aio_write_invis,
 	.sendfile	= xfs_file_sendfile_invis,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 257bae16f545..afb6e6f89a01 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1113,8 +1113,6 @@ struct file_operations {
 	int (*aio_fsync) (struct kiocb *, int datasync);
 	int (*fasync) (int, struct file *, int);
 	int (*lock) (struct file *, int, struct file_lock *);
-	ssize_t (*readv) (struct file *, const struct iovec *, unsigned long, loff_t *);
-	ssize_t (*writev) (struct file *, const struct iovec *, unsigned long, loff_t *);
 	ssize_t (*sendfile) (struct file *, loff_t *, size_t, read_actor_t, void *);
 	ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
 	unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
@@ -1734,10 +1732,6 @@ extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
 
 extern void
 file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping);
-extern ssize_t generic_file_readv(struct file *filp, const struct iovec *iov, 
-	unsigned long nr_segs, loff_t *ppos);
-ssize_t generic_file_writev(struct file *filp, const struct iovec *iov, 
-			unsigned long nr_segs, loff_t *ppos);
 extern loff_t no_llseek(struct file *file, loff_t offset, int origin);
 extern loff_t generic_file_llseek(struct file *file, loff_t offset, int origin);
 extern loff_t remote_llseek(struct file *file, loff_t offset, int origin);
diff --git a/mm/filemap.c b/mm/filemap.c
index f6c1d22b504f..48497094ae83 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2421,42 +2421,6 @@ ssize_t generic_file_write(struct file *file, const char __user *buf,
 }
 EXPORT_SYMBOL(generic_file_write);
 
-ssize_t generic_file_readv(struct file *filp, const struct iovec *iov,
-			unsigned long nr_segs, loff_t *ppos)
-{
-	struct kiocb kiocb;
-	ssize_t ret;
-
-	init_sync_kiocb(&kiocb, filp);
-	ret = __generic_file_aio_read(&kiocb, iov, nr_segs, ppos);
-	if (-EIOCBQUEUED == ret)
-		ret = wait_on_sync_kiocb(&kiocb);
-	return ret;
-}
-EXPORT_SYMBOL(generic_file_readv);
-
-ssize_t generic_file_writev(struct file *file, const struct iovec *iov,
-			unsigned long nr_segs, loff_t *ppos)
-{
-	struct address_space *mapping = file->f_mapping;
-	struct inode *inode = mapping->host;
-	ssize_t ret;
-
-	mutex_lock(&inode->i_mutex);
-	ret = __generic_file_write_nolock(file, iov, nr_segs, ppos);
-	mutex_unlock(&inode->i_mutex);
-
-	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
-		int err;
-
-		err = sync_page_range(inode, mapping, *ppos - ret, ret);
-		if (err < 0)
-			ret = err;
-	}
-	return ret;
-}
-EXPORT_SYMBOL(generic_file_writev);
-
 /*
  * Called under i_mutex for writes to S_ISREG files.   Returns -EIO if something
  * went wrong during pagecache shootdown.
diff --git a/net/socket.c b/net/socket.c
index df92e4252749..01918f7a301a 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -110,10 +110,6 @@ static long compat_sock_ioctl(struct file *file,
 			      unsigned int cmd, unsigned long arg);
 #endif
 static int sock_fasync(int fd, struct file *filp, int on);
-static ssize_t sock_readv(struct file *file, const struct iovec *vector,
-			  unsigned long count, loff_t *ppos);
-static ssize_t sock_writev(struct file *file, const struct iovec *vector,
-			   unsigned long count, loff_t *ppos);
 static ssize_t sock_sendpage(struct file *file, struct page *page,
 			     int offset, size_t size, loff_t *ppos, int more);
 
@@ -136,8 +132,6 @@ static struct file_operations socket_file_ops = {
 	.open =		sock_no_open,	/* special open code to disallow open via /proc */
 	.release =	sock_close,
 	.fasync =	sock_fasync,
-	.readv =	sock_readv,
-	.writev =	sock_writev,
 	.sendpage =	sock_sendpage,
 	.splice_write = generic_splice_sendpage,
 };
@@ -700,23 +694,6 @@ static ssize_t do_sock_read(struct msghdr *msg, struct kiocb *iocb,
 	return __sock_recvmsg(iocb, sock, msg, size, msg->msg_flags);
 }
 
-static ssize_t sock_readv(struct file *file, const struct iovec *iov,
-			  unsigned long nr_segs, loff_t *ppos)
-{
-	struct kiocb iocb;
-	struct sock_iocb siocb;
-	struct msghdr msg;
-	int ret;
-
-	init_sync_kiocb(&iocb, NULL);
-	iocb.private = &siocb;
-
-	ret = do_sock_read(&msg, &iocb, file, iov, nr_segs);
-	if (-EIOCBQUEUED == ret)
-		ret = wait_on_sync_kiocb(&iocb);
-	return ret;
-}
-
 static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov,
 				unsigned long nr_segs, loff_t pos)
 {
@@ -759,23 +736,6 @@ static ssize_t do_sock_write(struct msghdr *msg, struct kiocb *iocb,
 	return __sock_sendmsg(iocb, sock, msg, size);
 }
 
-static ssize_t sock_writev(struct file *file, const struct iovec *iov,
-			   unsigned long nr_segs, loff_t *ppos)
-{
-	struct msghdr msg;
-	struct kiocb iocb;
-	struct sock_iocb siocb;
-	int ret;
-
-	init_sync_kiocb(&iocb, NULL);
-	iocb.private = &siocb;
-
-	ret = do_sock_write(&msg, &iocb, file, iov, nr_segs);
-	if (-EIOCBQUEUED == ret)
-		ret = wait_on_sync_kiocb(&iocb);
-	return ret;
-}
-
 static ssize_t sock_aio_write(struct kiocb *iocb, const struct iovec *iov,
 			  unsigned long nr_segs, loff_t pos)
 {
diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c
index 891d7140553c..37b4b10850ae 100644
--- a/sound/core/pcm_native.c
+++ b/sound/core/pcm_native.c
@@ -2852,8 +2852,8 @@ static ssize_t snd_pcm_write(struct file *file, const char __user *buf,
 	return result;
 }
 
-static ssize_t snd_pcm_readv(struct file *file, const struct iovec *_vector,
-			     unsigned long count, loff_t * offset)
+static ssize_t snd_pcm_aio_read(struct kiocb *iocb, const struct iovec *iov,
+			     unsigned long nr_segs, loff_t pos)
 
 {
 	struct snd_pcm_file *pcm_file;
@@ -2864,22 +2864,22 @@ static ssize_t snd_pcm_readv(struct file *file, const struct iovec *_vector,
 	void __user **bufs;
 	snd_pcm_uframes_t frames;
 
-	pcm_file = file->private_data;
+	pcm_file = iocb->ki_filp->private_data;
 	substream = pcm_file->substream;
 	snd_assert(substream != NULL, return -ENXIO);
 	runtime = substream->runtime;
 	if (runtime->status->state == SNDRV_PCM_STATE_OPEN)
 		return -EBADFD;
-	if (count > 1024 || count != runtime->channels)
+	if (nr_segs > 1024 || nr_segs != runtime->channels)
 		return -EINVAL;
-	if (!frame_aligned(runtime, _vector->iov_len))
+	if (!frame_aligned(runtime, iov->iov_len))
 		return -EINVAL;
-	frames = bytes_to_samples(runtime, _vector->iov_len);
-	bufs = kmalloc(sizeof(void *) * count, GFP_KERNEL);
+	frames = bytes_to_samples(runtime, iov->iov_len);
+	bufs = kmalloc(sizeof(void *) * nr_segs, GFP_KERNEL);
 	if (bufs == NULL)
 		return -ENOMEM;
-	for (i = 0; i < count; ++i)
-		bufs[i] = _vector[i].iov_base;
+	for (i = 0; i < nr_segs; ++i)
+		bufs[i] = iov[i].iov_base;
 	result = snd_pcm_lib_readv(substream, bufs, frames);
 	if (result > 0)
 		result = frames_to_bytes(runtime, result);
@@ -2887,8 +2887,8 @@ static ssize_t snd_pcm_readv(struct file *file, const struct iovec *_vector,
 	return result;
 }
 
-static ssize_t snd_pcm_writev(struct file *file, const struct iovec *_vector,
-			      unsigned long count, loff_t * offset)
+static ssize_t snd_pcm_aio_write(struct kiocb *iocb, const struct iovec *iov,
+			      unsigned long nr_segs, loff_t pos)
 {
 	struct snd_pcm_file *pcm_file;
 	struct snd_pcm_substream *substream;
@@ -2898,7 +2898,7 @@ static ssize_t snd_pcm_writev(struct file *file, const struct iovec *_vector,
 	void __user **bufs;
 	snd_pcm_uframes_t frames;
 
-	pcm_file = file->private_data;
+	pcm_file = iocb->ki_filp->private_data;
 	substream = pcm_file->substream;
 	snd_assert(substream != NULL, result = -ENXIO; goto end);
 	runtime = substream->runtime;
@@ -2906,17 +2906,17 @@ static ssize_t snd_pcm_writev(struct file *file, const struct iovec *_vector,
 		result = -EBADFD;
 		goto end;
 	}
-	if (count > 128 || count != runtime->channels ||
-	    !frame_aligned(runtime, _vector->iov_len)) {
+	if (nr_segs > 128 || nr_segs != runtime->channels ||
+	    !frame_aligned(runtime, iov->iov_len)) {
 		result = -EINVAL;
 		goto end;
 	}
-	frames = bytes_to_samples(runtime, _vector->iov_len);
-	bufs = kmalloc(sizeof(void *) * count, GFP_KERNEL);
+	frames = bytes_to_samples(runtime, iov->iov_len);
+	bufs = kmalloc(sizeof(void *) * nr_segs, GFP_KERNEL);
 	if (bufs == NULL)
 		return -ENOMEM;
-	for (i = 0; i < count; ++i)
-		bufs[i] = _vector[i].iov_base;
+	for (i = 0; i < nr_segs; ++i)
+		bufs[i] = iov[i].iov_base;
 	result = snd_pcm_lib_writev(substream, bufs, frames);
 	if (result > 0)
 		result = frames_to_bytes(runtime, result);
@@ -3426,7 +3426,7 @@ struct file_operations snd_pcm_f_ops[2] = {
 	{
 		.owner =		THIS_MODULE,
 		.write =		snd_pcm_write,
-		.writev =		snd_pcm_writev,
+		.aio_write =		snd_pcm_aio_write,
 		.open =			snd_pcm_playback_open,
 		.release =		snd_pcm_release,
 		.poll =			snd_pcm_playback_poll,
@@ -3438,7 +3438,7 @@ struct file_operations snd_pcm_f_ops[2] = {
 	{
 		.owner =		THIS_MODULE,
 		.read =			snd_pcm_read,
-		.readv =		snd_pcm_readv,
+		.aio_read =		snd_pcm_aio_read,
 		.open =			snd_pcm_capture_open,
 		.release =		snd_pcm_release,
 		.poll =			snd_pcm_capture_poll,
-- 
cgit v1.2.3


From 543ade1fc901db4c3dbe9fb27241fb977f1f3eea Mon Sep 17 00:00:00 2001
From: Badari Pulavarty <pbadari@us.ibm.com>
Date: Sat, 30 Sep 2006 23:28:48 -0700
Subject: [PATCH] Streamline generic_file_* interfaces and filemap cleanups

This patch cleans up generic_file_*_read/write() interfaces.  Christoph
Hellwig gave me the idea for this clean ups.

In a nutshell, all filesystems should set .aio_read/.aio_write methods and use
do_sync_read/ do_sync_write() as their .read/.write methods.  This allows us
to cleanup all variants of generic_file_* routines.

Final available interfaces:

generic_file_aio_read() - read handler
generic_file_aio_write() - write handler
generic_file_aio_write_nolock() - no lock write handler

__generic_file_aio_write_nolock() - internal worker routine

Signed-off-by: Badari Pulavarty <pbadari@us.ibm.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/char/raw.c         | 15 ++------
 fs/adfs/file.c             |  6 ++--
 fs/affs/file.c             |  6 ++--
 fs/bfs/file.c              |  6 ++--
 fs/block_dev.c             | 12 ++-----
 fs/ext2/file.c             |  4 +--
 fs/fuse/file.c             |  6 ++--
 fs/hfs/inode.c             |  6 ++--
 fs/hfsplus/inode.c         |  6 ++--
 fs/hostfs/hostfs_kern.c    |  4 +--
 fs/hpfs/file.c             |  6 ++--
 fs/jffs/inode-v23.c        |  6 ++--
 fs/jffs2/file.c            |  6 ++--
 fs/jfs/file.c              |  4 +--
 fs/minix/file.c            |  6 ++--
 fs/ntfs/file.c             |  2 +-
 fs/qnx4/file.c             |  6 ++--
 fs/ramfs/file-mmu.c        |  6 ++--
 fs/ramfs/file-nommu.c      |  6 ++--
 fs/read_write.c            |  3 +-
 fs/smbfs/file.c            | 24 ++++++++-----
 fs/sysv/file.c             |  6 ++--
 fs/udf/file.c              | 16 +++++----
 fs/ufs/file.c              |  6 ++--
 fs/xfs/linux-2.6/xfs_lrw.c |  4 ++-
 include/linux/fs.h         |  5 ---
 mm/filemap.c               | 87 +++-------------------------------------------
 27 files changed, 105 insertions(+), 165 deletions(-)

(limited to 'mm')

diff --git a/drivers/char/raw.c b/drivers/char/raw.c
index 490db531e2d8..89b718e326e5 100644
--- a/drivers/char/raw.c
+++ b/drivers/char/raw.c
@@ -238,21 +238,10 @@ out:
 	return err;
 }
 
-static ssize_t raw_file_write(struct file *file, const char __user *buf,
-				   size_t count, loff_t *ppos)
-{
-	struct iovec local_iov = {
-		.iov_base = (char __user *)buf,
-		.iov_len = count
-	};
-
-	return generic_file_write_nolock(file, &local_iov, 1, ppos);
-}
-
 static const struct file_operations raw_fops = {
-	.read	=	generic_file_read,
+	.read	=	do_sync_read,
 	.aio_read = 	generic_file_aio_read,
-	.write	=	raw_file_write,
+	.write	=	do_sync_write,
 	.aio_write = 	generic_file_aio_write_nolock,
 	.open	=	raw_open,
 	.release=	raw_release,
diff --git a/fs/adfs/file.c b/fs/adfs/file.c
index 1014b9f2117b..6101ea679cb1 100644
--- a/fs/adfs/file.c
+++ b/fs/adfs/file.c
@@ -27,10 +27,12 @@
 
 const struct file_operations adfs_file_operations = {
 	.llseek		= generic_file_llseek,
-	.read		= generic_file_read,
+	.read		= do_sync_read,
+	.aio_read	= generic_file_aio_read,
 	.mmap		= generic_file_mmap,
 	.fsync		= file_fsync,
-	.write		= generic_file_write,
+	.write		= do_sync_write,
+	.aio_write	= generic_file_aio_write,
 	.sendfile	= generic_file_sendfile,
 };
 
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 3de8590e4f6a..05b5e22de759 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -27,8 +27,10 @@ static int affs_file_release(struct inode *inode, struct file *filp);
 
 const struct file_operations affs_file_operations = {
 	.llseek		= generic_file_llseek,
-	.read		= generic_file_read,
-	.write		= generic_file_write,
+	.read		= do_sync_read,
+	.aio_read	= generic_file_aio_read,
+	.write		= do_sync_write,
+	.aio_write	= generic_file_aio_write,
 	.mmap		= generic_file_mmap,
 	.open		= affs_file_open,
 	.release	= affs_file_release,
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index 3d5aca28a0a0..a9164a87f8de 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -19,8 +19,10 @@
 
 const struct file_operations bfs_file_operations = {
 	.llseek 	= generic_file_llseek,
-	.read		= generic_file_read,
-	.write		= generic_file_write,
+	.read		= do_sync_read,
+	.aio_read	= generic_file_aio_read,
+	.write		= do_sync_write,
+	.aio_write	= generic_file_aio_write,
 	.mmap		= generic_file_mmap,
 	.sendfile	= generic_file_sendfile,
 };
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 0f143094ef1d..bc8f27cc4483 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1154,14 +1154,6 @@ static int blkdev_close(struct inode * inode, struct file * filp)
 	return blkdev_put(bdev);
 }
 
-static ssize_t blkdev_file_write(struct file *file, const char __user *buf,
-				   size_t count, loff_t *ppos)
-{
-	struct iovec local_iov = { .iov_base = (void __user *)buf, .iov_len = count };
-
-	return generic_file_write_nolock(file, &local_iov, 1, ppos);
-}
-
 static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 {
 	return blkdev_ioctl(file->f_mapping->host, file, cmd, arg);
@@ -1181,8 +1173,8 @@ const struct file_operations def_blk_fops = {
 	.open		= blkdev_open,
 	.release	= blkdev_close,
 	.llseek		= block_llseek,
-	.read		= generic_file_read,
-	.write		= blkdev_file_write,
+	.read		= do_sync_read,
+	.write		= do_sync_write,
   	.aio_read	= generic_file_aio_read,
   	.aio_write	= generic_file_aio_write_nolock,
 	.mmap		= generic_file_mmap,
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index e893e2be9ed4..2dba473c524a 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -41,8 +41,8 @@ static int ext2_release_file (struct inode * inode, struct file * filp)
  */
 const struct file_operations ext2_file_operations = {
 	.llseek		= generic_file_llseek,
-	.read		= generic_file_read,
-	.write		= generic_file_write,
+	.read		= do_sync_read,
+	.write		= do_sync_write,
 	.aio_read	= generic_file_aio_read,
 	.aio_write	= generic_file_aio_write,
 	.ioctl		= ext2_ioctl,
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 5c4fcd1dbf59..183626868eea 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -753,8 +753,10 @@ static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl)
 
 static const struct file_operations fuse_file_operations = {
 	.llseek		= generic_file_llseek,
-	.read		= generic_file_read,
-	.write		= generic_file_write,
+	.read		= do_sync_read,
+	.aio_read	= generic_file_aio_read,
+	.write		= do_sync_write,
+	.aio_write	= generic_file_aio_write,
 	.mmap		= fuse_file_mmap,
 	.open		= fuse_open,
 	.flush		= fuse_flush,
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index d05641c35fc9..02f5573e0349 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -601,8 +601,10 @@ int hfs_inode_setattr(struct dentry *dentry, struct iattr * attr)
 
 static const struct file_operations hfs_file_operations = {
 	.llseek		= generic_file_llseek,
-	.read		= generic_file_read,
-	.write		= generic_file_write,
+	.read		= do_sync_read,
+	.aio_read	= generic_file_aio_read,
+	.write		= do_sync_write,
+	.aio_write	= generic_file_aio_write,
 	.mmap		= generic_file_mmap,
 	.sendfile	= generic_file_sendfile,
 	.fsync		= file_fsync,
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 0eb1a6092668..9e3675249633 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -282,8 +282,10 @@ static struct inode_operations hfsplus_file_inode_operations = {
 
 static const struct file_operations hfsplus_file_operations = {
 	.llseek 	= generic_file_llseek,
-	.read		= generic_file_read,
-	.write		= generic_file_write,
+	.read		= do_sync_read,
+	.aio_read	= generic_file_aio_read,
+	.write		= do_sync_write,
+	.aio_write	= generic_file_aio_write,
 	.mmap		= generic_file_mmap,
 	.sendfile	= generic_file_sendfile,
 	.fsync		= file_fsync,
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 4908c38a5885..b6bd33ca3731 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -385,11 +385,11 @@ int hostfs_fsync(struct file *file, struct dentry *dentry, int datasync)
 
 static const struct file_operations hostfs_file_fops = {
 	.llseek		= generic_file_llseek,
-	.read		= generic_file_read,
+	.read		= do_sync_read,
 	.sendfile	= generic_file_sendfile,
 	.aio_read	= generic_file_aio_read,
 	.aio_write	= generic_file_aio_write,
-	.write		= generic_file_write,
+	.write		= do_sync_write,
 	.mmap		= generic_file_mmap,
 	.open		= hostfs_file_open,
 	.release	= NULL,
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index d9eb19b7b8ae..8b94d24855f0 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -113,7 +113,7 @@ static ssize_t hpfs_file_write(struct file *file, const char __user *buf,
 {
 	ssize_t retval;
 
-	retval = generic_file_write(file, buf, count, ppos);
+	retval = do_sync_write(file, buf, count, ppos);
 	if (retval > 0)
 		hpfs_i(file->f_dentry->d_inode)->i_dirty = 1;
 	return retval;
@@ -122,8 +122,10 @@ static ssize_t hpfs_file_write(struct file *file, const char __user *buf,
 const struct file_operations hpfs_file_ops =
 {
 	.llseek		= generic_file_llseek,
-	.read		= generic_file_read,
+	.read		= do_sync_read,
+	.aio_read	= generic_file_aio_read,
 	.write		= hpfs_file_write,
+	.aio_write	= generic_file_aio_write,
 	.mmap		= generic_file_mmap,
 	.release	= hpfs_file_release,
 	.fsync		= hpfs_file_fsync,
diff --git a/fs/jffs/inode-v23.c b/fs/jffs/inode-v23.c
index f5cf9c93e243..068ef0de8de2 100644
--- a/fs/jffs/inode-v23.c
+++ b/fs/jffs/inode-v23.c
@@ -1632,8 +1632,10 @@ static const struct file_operations jffs_file_operations =
 {
 	.open		= generic_file_open,
 	.llseek		= generic_file_llseek,
-	.read		= generic_file_read,
-	.write		= generic_file_write,
+	.read		= do_sync_read,
+	.aio_read	= generic_file_aio_read,
+	.write		= do_sync_write,
+	.aio_write	= generic_file_aio_write,
 	.ioctl		= jffs_ioctl,
 	.mmap		= generic_file_readonly_mmap,
 	.fsync		= jffs_fsync,
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 3ed6e3e120b6..242875f77cb3 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -42,8 +42,10 @@ const struct file_operations jffs2_file_operations =
 {
 	.llseek =	generic_file_llseek,
 	.open =		generic_file_open,
-	.read =		generic_file_read,
-	.write =	generic_file_write,
+ 	.read =		do_sync_read,
+ 	.aio_read =	generic_file_aio_read,
+ 	.write =	do_sync_write,
+ 	.aio_write =	generic_file_aio_write,
 	.ioctl =	jffs2_ioctl,
 	.mmap =		generic_file_readonly_mmap,
 	.fsync =	jffs2_fsync,
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index f535f2911c12..976e90dc2d1b 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -103,8 +103,8 @@ struct inode_operations jfs_file_inode_operations = {
 const struct file_operations jfs_file_operations = {
 	.open		= jfs_open,
 	.llseek		= generic_file_llseek,
-	.write		= generic_file_write,
-	.read		= generic_file_read,
+	.write		= do_sync_write,
+	.read		= do_sync_read,
 	.aio_read	= generic_file_aio_read,
 	.aio_write	= generic_file_aio_write,
 	.mmap		= generic_file_mmap,
diff --git a/fs/minix/file.c b/fs/minix/file.c
index 420b32882a10..40eac2e60d25 100644
--- a/fs/minix/file.c
+++ b/fs/minix/file.c
@@ -17,8 +17,10 @@ int minix_sync_file(struct file *, struct dentry *, int);
 
 const struct file_operations minix_file_operations = {
 	.llseek		= generic_file_llseek,
-	.read		= generic_file_read,
-	.write		= generic_file_write,
+	.read		= do_sync_read,
+	.aio_read	= generic_file_aio_read,
+	.write		= do_sync_write,
+	.aio_write	= generic_file_aio_write,
 	.mmap		= generic_file_mmap,
 	.fsync		= minix_sync_file,
 	.sendfile	= generic_file_sendfile,
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 2f9b5a0953ff..ae2fe0016d2c 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -2296,7 +2296,7 @@ static int ntfs_file_fsync(struct file *filp, struct dentry *dentry,
 
 const struct file_operations ntfs_file_ops = {
 	.llseek		= generic_file_llseek,	 /* Seek inside file. */
-	.read		= generic_file_read,	 /* Read from file. */
+	.read		= do_sync_read,		 /* Read from file. */
 	.aio_read	= generic_file_aio_read, /* Async read from file. */
 #ifdef NTFS_RW
 	.write		= ntfs_file_write,	 /* Write to file. */
diff --git a/fs/qnx4/file.c b/fs/qnx4/file.c
index 62af4b1348bd..467e5ac7280e 100644
--- a/fs/qnx4/file.c
+++ b/fs/qnx4/file.c
@@ -22,11 +22,13 @@
 const struct file_operations qnx4_file_operations =
 {
 	.llseek		= generic_file_llseek,
-	.read		= generic_file_read,
+	.read		= do_sync_read,
+	.aio_read	= generic_file_aio_read,
 	.mmap		= generic_file_mmap,
 	.sendfile	= generic_file_sendfile,
 #ifdef CONFIG_QNX4FS_RW
-	.write		= generic_file_write,
+	.write		= do_sync_write,
+	.aio_write	= generic_file_aio_write,
 	.fsync		= qnx4_sync_file,
 #endif
 };
diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c
index 86f14cacf641..0947fb57dcf3 100644
--- a/fs/ramfs/file-mmu.c
+++ b/fs/ramfs/file-mmu.c
@@ -33,8 +33,10 @@ const struct address_space_operations ramfs_aops = {
 };
 
 const struct file_operations ramfs_file_operations = {
-	.read		= generic_file_read,
-	.write		= generic_file_write,
+	.read		= do_sync_read,
+	.aio_read	= generic_file_aio_read,
+	.write		= do_sync_write,
+	.aio_write	= generic_file_aio_write,
 	.mmap		= generic_file_mmap,
 	.fsync		= simple_sync_file,
 	.sendfile	= generic_file_sendfile,
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 677139b48e00..bfe5dbf1002e 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -36,8 +36,10 @@ const struct address_space_operations ramfs_aops = {
 const struct file_operations ramfs_file_operations = {
 	.mmap			= ramfs_nommu_mmap,
 	.get_unmapped_area	= ramfs_nommu_get_unmapped_area,
-	.read			= generic_file_read,
-	.write			= generic_file_write,
+	.read			= do_sync_read,
+	.aio_read		= generic_file_aio_read,
+	.write			= do_sync_write,
+	.aio_write		= generic_file_aio_write,
 	.fsync			= simple_sync_file,
 	.sendfile		= generic_file_sendfile,
 	.llseek			= generic_file_llseek,
diff --git a/fs/read_write.c b/fs/read_write.c
index 32d54cca9bd9..4ed839bcb91c 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -22,7 +22,8 @@
 
 const struct file_operations generic_ro_fops = {
 	.llseek		= generic_file_llseek,
-	.read		= generic_file_read,
+	.read		= do_sync_read,
+	.aio_read	= generic_file_aio_read,
 	.mmap		= generic_file_readonly_mmap,
 	.sendfile	= generic_file_sendfile,
 };
diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c
index dae67048baba..50784d13c87b 100644
--- a/fs/smbfs/file.c
+++ b/fs/smbfs/file.c
@@ -214,13 +214,15 @@ smb_updatepage(struct file *file, struct page *page, unsigned long offset,
 }
 
 static ssize_t
-smb_file_read(struct file * file, char __user * buf, size_t count, loff_t *ppos)
+smb_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
+			unsigned long nr_segs, loff_t pos)
 {
+	struct file * file = iocb->ki_filp;
 	struct dentry * dentry = file->f_dentry;
 	ssize_t	status;
 
 	VERBOSE("file %s/%s, count=%lu@%lu\n", DENTRY_PATH(dentry),
-		(unsigned long) count, (unsigned long) *ppos);
+		(unsigned long) iocb->ki_left, (unsigned long) pos);
 
 	status = smb_revalidate_inode(dentry);
 	if (status) {
@@ -233,7 +235,7 @@ smb_file_read(struct file * file, char __user * buf, size_t count, loff_t *ppos)
 		(long)dentry->d_inode->i_size,
 		dentry->d_inode->i_flags, dentry->d_inode->i_atime);
 
-	status = generic_file_read(file, buf, count, ppos);
+	status = generic_file_aio_read(iocb, iov, nr_segs, pos);
 out:
 	return status;
 }
@@ -317,14 +319,16 @@ const struct address_space_operations smb_file_aops = {
  * Write to a file (through the page cache).
  */
 static ssize_t
-smb_file_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
+smb_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
+			       unsigned long nr_segs, loff_t pos)
 {
+	struct file * file = iocb->ki_filp;
 	struct dentry * dentry = file->f_dentry;
 	ssize_t	result;
 
 	VERBOSE("file %s/%s, count=%lu@%lu\n",
 		DENTRY_PATH(dentry),
-		(unsigned long) count, (unsigned long) *ppos);
+		(unsigned long) iocb->ki_left, (unsigned long) pos);
 
 	result = smb_revalidate_inode(dentry);
 	if (result) {
@@ -337,8 +341,8 @@ smb_file_write(struct file *file, const char __user *buf, size_t count, loff_t *
 	if (result)
 		goto out;
 
-	if (count > 0) {
-		result = generic_file_write(file, buf, count, ppos);
+	if (iocb->ki_left > 0) {
+		result = generic_file_aio_write(iocb, iov, nr_segs, pos);
 		VERBOSE("pos=%ld, size=%ld, mtime=%ld, atime=%ld\n",
 			(long) file->f_pos, (long) dentry->d_inode->i_size,
 			dentry->d_inode->i_mtime, dentry->d_inode->i_atime);
@@ -402,8 +406,10 @@ smb_file_permission(struct inode *inode, int mask, struct nameidata *nd)
 const struct file_operations smb_file_operations =
 {
 	.llseek		= remote_llseek,
-	.read		= smb_file_read,
-	.write		= smb_file_write,
+	.read		= do_sync_read,
+	.aio_read	= smb_file_aio_read,
+	.write		= do_sync_write,
+	.aio_write	= smb_file_aio_write,
 	.ioctl		= smb_ioctl,
 	.mmap		= smb_file_mmap,
 	.open		= smb_file_open,
diff --git a/fs/sysv/file.c b/fs/sysv/file.c
index a59e303135fa..47a4b728f15b 100644
--- a/fs/sysv/file.c
+++ b/fs/sysv/file.c
@@ -21,8 +21,10 @@
  */
 const struct file_operations sysv_file_operations = {
 	.llseek		= generic_file_llseek,
-	.read		= generic_file_read,
-	.write		= generic_file_write,
+	.read		= do_sync_read,
+	.aio_read	= generic_file_aio_read,
+	.write		= do_sync_write,
+	.aio_write	= generic_file_aio_write,
 	.mmap		= generic_file_mmap,
 	.fsync		= sysv_sync_file,
 	.sendfile	= generic_file_sendfile,
diff --git a/fs/udf/file.c b/fs/udf/file.c
index a59e5f33daf6..7aedd552cba1 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -103,19 +103,21 @@ const struct address_space_operations udf_adinicb_aops = {
 	.commit_write		= udf_adinicb_commit_write,
 };
 
-static ssize_t udf_file_write(struct file * file, const char __user * buf,
-	size_t count, loff_t *ppos)
+static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
+			      unsigned long nr_segs, loff_t ppos)
 {
 	ssize_t retval;
+	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_dentry->d_inode;
 	int err, pos;
+	size_t count = iocb->ki_left;
 
 	if (UDF_I_ALLOCTYPE(inode) == ICBTAG_FLAG_AD_IN_ICB)
 	{
 		if (file->f_flags & O_APPEND)
 			pos = inode->i_size;
 		else
-			pos = *ppos;
+			pos = ppos;
 
 		if (inode->i_sb->s_blocksize < (udf_file_entry_alloc_offset(inode) +
 			pos + count))
@@ -136,7 +138,7 @@ static ssize_t udf_file_write(struct file * file, const char __user * buf,
 		}
 	}
 
-	retval = generic_file_write(file, buf, count, ppos);
+	retval = generic_file_aio_write(iocb, iov, nr_segs, ppos);
 
 	if (retval > 0)
 		mark_inode_dirty(inode);
@@ -249,11 +251,13 @@ static int udf_release_file(struct inode * inode, struct file * filp)
 }
 
 const struct file_operations udf_file_operations = {
-	.read			= generic_file_read,
+	.read			= do_sync_read,
+	.aio_read		= generic_file_aio_read,
 	.ioctl			= udf_ioctl,
 	.open			= generic_file_open,
 	.mmap			= generic_file_mmap,
-	.write			= udf_file_write,
+	.write			= do_sync_write,
+	.aio_write		= udf_file_aio_write,
 	.release		= udf_release_file,
 	.fsync			= udf_fsync_file,
 	.sendfile		= generic_file_sendfile,
diff --git a/fs/ufs/file.c b/fs/ufs/file.c
index a9c6e5f04fae..1e096323bad4 100644
--- a/fs/ufs/file.c
+++ b/fs/ufs/file.c
@@ -53,8 +53,10 @@ static int ufs_sync_file(struct file *file, struct dentry *dentry, int datasync)
  
 const struct file_operations ufs_file_operations = {
 	.llseek		= generic_file_llseek,
-	.read		= generic_file_read,
-	.write		= generic_file_write,
+	.read		= do_sync_read,
+	.aio_read	= generic_file_aio_read,
+	.write		= do_sync_write,
+	.aio_write	= generic_file_aio_write,
 	.mmap		= generic_file_mmap,
 	.open           = generic_file_open,
 	.fsync		= ufs_sync_file,
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 55992b40353c..fa842f1c9fa2 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -279,7 +279,9 @@ xfs_read(
 
 	xfs_rw_enter_trace(XFS_READ_ENTER, &ip->i_iocore,
 				(void *)iovp, segs, *offset, ioflags);
-	ret = __generic_file_aio_read(iocb, iovp, segs, offset);
+
+	iocb->ki_pos = *offset;
+	ret = generic_file_aio_read(iocb, iovp, segs, *offset);
 	if (ret == -EIOCBQUEUED && !(ioflags & IO_ISAIO))
 		ret = wait_on_sync_kiocb(iocb);
 	if (ret > 0)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index afb6e6f89a01..011129f8803e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1699,11 +1699,8 @@ extern int generic_file_mmap(struct file *, struct vm_area_struct *);
 extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *);
 extern int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size);
 extern int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size);
-extern ssize_t generic_file_read(struct file *, char __user *, size_t, loff_t *);
 int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk);
-extern ssize_t generic_file_write(struct file *, const char __user *, size_t, loff_t *);
 extern ssize_t generic_file_aio_read(struct kiocb *, const struct iovec *, unsigned long, loff_t);
-extern ssize_t __generic_file_aio_read(struct kiocb *, const struct iovec *, unsigned long, loff_t *);
 extern ssize_t generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long, loff_t);
 extern ssize_t generic_file_aio_write_nolock(struct kiocb *, const struct iovec *,
 		unsigned long, loff_t);
@@ -1713,8 +1710,6 @@ extern ssize_t generic_file_buffered_write(struct kiocb *, const struct iovec *,
 		unsigned long, loff_t, loff_t *, size_t, ssize_t);
 extern ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos);
 extern ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos);
-ssize_t generic_file_write_nolock(struct file *file, const struct iovec *iov,
-				unsigned long nr_segs, loff_t *ppos);
 extern ssize_t generic_file_sendfile(struct file *, loff_t *, size_t, read_actor_t, void *);
 extern void do_generic_mapping_read(struct address_space *mapping,
 				    struct file_ra_state *, struct file *,
diff --git a/mm/filemap.c b/mm/filemap.c
index 48497094ae83..ec469235985d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1149,13 +1149,14 @@ success:
  * that can use the page cache directly.
  */
 ssize_t
-__generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
-		unsigned long nr_segs, loff_t *ppos)
+generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
+		unsigned long nr_segs, loff_t pos)
 {
 	struct file *filp = iocb->ki_filp;
 	ssize_t retval;
 	unsigned long seg;
 	size_t count;
+	loff_t *ppos = &iocb->ki_pos;
 
 	count = 0;
 	for (seg = 0; seg < nr_segs; seg++) {
@@ -1179,7 +1180,7 @@ __generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 
 	/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
 	if (filp->f_flags & O_DIRECT) {
-		loff_t pos = *ppos, size;
+		loff_t size;
 		struct address_space *mapping;
 		struct inode *inode;
 
@@ -1223,32 +1224,8 @@ __generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 out:
 	return retval;
 }
-EXPORT_SYMBOL(__generic_file_aio_read);
-
-ssize_t
-generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
-		unsigned long nr_segs, loff_t pos)
-{
-	BUG_ON(iocb->ki_pos != pos);
-	return __generic_file_aio_read(iocb, iov, nr_segs, &iocb->ki_pos);
-}
 EXPORT_SYMBOL(generic_file_aio_read);
 
-ssize_t
-generic_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
-{
-	struct iovec local_iov = { .iov_base = buf, .iov_len = count };
-	struct kiocb kiocb;
-	ssize_t ret;
-
-	init_sync_kiocb(&kiocb, filp);
-	ret = __generic_file_aio_read(&kiocb, &local_iov, 1, ppos);
-	if (-EIOCBQUEUED == ret)
-		ret = wait_on_sync_kiocb(&kiocb);
-	return ret;
-}
-EXPORT_SYMBOL(generic_file_read);
-
 int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
 {
 	ssize_t written;
@@ -2339,38 +2316,6 @@ ssize_t generic_file_aio_write_nolock(struct kiocb *iocb,
 }
 EXPORT_SYMBOL(generic_file_aio_write_nolock);
 
-static ssize_t
-__generic_file_write_nolock(struct file *file, const struct iovec *iov,
-				unsigned long nr_segs, loff_t *ppos)
-{
-	struct kiocb kiocb;
-	ssize_t ret;
-
-	init_sync_kiocb(&kiocb, file);
-	kiocb.ki_pos = *ppos;
-	ret = __generic_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos);
-	if (-EIOCBQUEUED == ret)
-		ret = wait_on_sync_kiocb(&kiocb);
-	return ret;
-}
-
-ssize_t
-generic_file_write_nolock(struct file *file, const struct iovec *iov,
-				unsigned long nr_segs, loff_t *ppos)
-{
-	struct kiocb kiocb;
-	ssize_t ret;
-
-	init_sync_kiocb(&kiocb, file);
-	kiocb.ki_pos = *ppos;
-	ret = generic_file_aio_write_nolock(&kiocb, iov, nr_segs, *ppos);
-	if (-EIOCBQUEUED == ret)
-		ret = wait_on_sync_kiocb(&kiocb);
-	*ppos = kiocb.ki_pos;
-	return ret;
-}
-EXPORT_SYMBOL(generic_file_write_nolock);
-
 ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 		unsigned long nr_segs, loff_t pos)
 {
@@ -2397,30 +2342,6 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 }
 EXPORT_SYMBOL(generic_file_aio_write);
 
-ssize_t generic_file_write(struct file *file, const char __user *buf,
-			   size_t count, loff_t *ppos)
-{
-	struct address_space *mapping = file->f_mapping;
-	struct inode *inode = mapping->host;
-	ssize_t	ret;
-	struct iovec local_iov = { .iov_base = (void __user *)buf,
-					.iov_len = count };
-
-	mutex_lock(&inode->i_mutex);
-	ret = __generic_file_write_nolock(file, &local_iov, 1, ppos);
-	mutex_unlock(&inode->i_mutex);
-
-	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
-		ssize_t err;
-
-		err = sync_page_range(inode, mapping, *ppos - ret, ret);
-		if (err < 0)
-			ret = err;
-	}
-	return ret;
-}
-EXPORT_SYMBOL(generic_file_write);
-
 /*
  * Called under i_mutex for writes to S_ISREG files.   Returns -EIO if something
  * went wrong during pagecache shootdown.
-- 
cgit v1.2.3


From 9a53c3a783c2fa9b969628e65695c11c3e51e673 Mon Sep 17 00:00:00 2001
From: Dave Hansen <haveblue@us.ibm.com>
Date: Sat, 30 Sep 2006 23:29:03 -0700
Subject: [PATCH] r/o bind mounts: unlink: monitor i_nlink

When a filesystem decrements i_nlink to zero, it means that a write must be
performed in order to drop the inode from the filesystem.

We're shortly going to have keep filesystems from being remounted r/o between
the time that this i_nlink decrement and that write occurs.

So, add a little helper function to do the decrements.  We'll tie into it in a
bit to note when i_nlink hits zero.

Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Acked-by: Christoph Hellwig <hch@lst.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/usb/core/inode.c |  7 ++++---
 fs/autofs/root.c         |  2 +-
 fs/autofs4/root.c        |  2 +-
 fs/bfs/dir.c             |  9 +++------
 fs/cifs/inode.c          | 10 +++++-----
 fs/coda/dir.c            |  4 ++--
 fs/ext2/namei.c          |  2 +-
 fs/ext3/namei.c          | 14 +++++++-------
 fs/hfs/dir.c             |  2 +-
 fs/hfsplus/dir.c         |  2 +-
 fs/hpfs/namei.c          |  6 +++---
 fs/jffs/inode-v23.c      |  3 +--
 fs/jffs2/dir.c           |  6 +++---
 fs/jfs/namei.c           | 14 ++++++--------
 fs/libfs.c               | 10 +++++-----
 fs/minix/namei.c         |  2 +-
 fs/msdos/namei.c         |  9 ++++-----
 fs/nfs/dir.c             |  6 +++---
 fs/ocfs2/namei.c         |  4 ++--
 fs/qnx4/namei.c          |  6 ++----
 fs/reiserfs/namei.c      |  6 +++---
 fs/sysv/namei.c          |  2 +-
 fs/udf/namei.c           | 18 ++++++------------
 fs/ufs/namei.c           |  2 +-
 fs/vfat/namei.c          |  9 ++++-----
 include/linux/fs.h       |  7 ++++++-
 ipc/mqueue.c             |  2 +-
 mm/shmem.c               | 10 +++++-----
 28 files changed, 83 insertions(+), 93 deletions(-)

(limited to 'mm')

diff --git a/drivers/usb/core/inode.c b/drivers/usb/core/inode.c
index df3d152f0493..88002e45a6b4 100644
--- a/drivers/usb/core/inode.c
+++ b/drivers/usb/core/inode.c
@@ -332,7 +332,7 @@ static int usbfs_unlink (struct inode *dir, struct dentry *dentry)
 {
 	struct inode *inode = dentry->d_inode;
 	mutex_lock(&inode->i_mutex);
-	dentry->d_inode->i_nlink--;
+	drop_nlink(dentry->d_inode);
 	dput(dentry);
 	mutex_unlock(&inode->i_mutex);
 	d_delete(dentry);
@@ -347,10 +347,11 @@ static int usbfs_rmdir(struct inode *dir, struct dentry *dentry)
 	mutex_lock(&inode->i_mutex);
 	dentry_unhash(dentry);
 	if (usbfs_empty(dentry)) {
-		dentry->d_inode->i_nlink -= 2;
+		drop_nlink(dentry->d_inode);
+		drop_nlink(dentry->d_inode);
 		dput(dentry);
 		inode->i_flags |= S_DEAD;
-		dir->i_nlink--;
+		drop_nlink(dir);
 		error = 0;
 	}
 	mutex_unlock(&inode->i_mutex);
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index 9cac08d6a873..54ad70731927 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -414,7 +414,7 @@ static int autofs_root_rmdir(struct inode *dir, struct dentry *dentry)
 
 	dentry->d_time = (unsigned long)(struct autofs_dir_ent *)NULL;
 	autofs_hash_delete(ent);
-	dir->i_nlink--;
+	drop_nlink(dir);
 	d_drop(dentry);
 	unlock_kernel();
 
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 563ef9d7da9f..348bec0982b0 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -676,7 +676,7 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
 	dentry->d_inode->i_nlink = 0;
 
 	if (dir->i_nlink)
-		dir->i_nlink--;
+		drop_nlink(dir);
 
 	return 0;
 }
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index dcf04cb13283..ce05d1643dd1 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -117,8 +117,7 @@ static int bfs_create(struct inode * dir, struct dentry * dentry, int mode,
 
 	err = bfs_add_entry(dir, dentry->d_name.name, dentry->d_name.len, inode->i_ino);
 	if (err) {
-		inode->i_nlink--;
-		mark_inode_dirty(inode);
+		inode_dec_link_count(inode);
 		iput(inode);
 		unlock_kernel();
 		return err;
@@ -196,9 +195,8 @@ static int bfs_unlink(struct inode * dir, struct dentry * dentry)
 	mark_buffer_dirty(bh);
 	dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
 	mark_inode_dirty(dir);
-	inode->i_nlink--;
 	inode->i_ctime = dir->i_ctime;
-	mark_inode_dirty(inode);
+	inode_dec_link_count(inode);
 	error = 0;
 
 out_brelse:
@@ -249,9 +247,8 @@ static int bfs_rename(struct inode * old_dir, struct dentry * old_dentry,
 	old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC;
 	mark_inode_dirty(old_dir);
 	if (new_inode) {
-		new_inode->i_nlink--;
 		new_inode->i_ctime = CURRENT_TIME_SEC;
-		mark_inode_dirty(new_inode);
+		inode_dec_link_count(new_inode);
 	}
 	mark_buffer_dirty(old_bh);
 	error = 0;
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 05f874c7441b..74441a17e186 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -590,7 +590,7 @@ int cifs_unlink(struct inode *inode, struct dentry *direntry)
 
 	if (!rc) {
 		if (direntry->d_inode)
-			direntry->d_inode->i_nlink--;
+			drop_nlink(direntry->d_inode);
 	} else if (rc == -ENOENT) {
 		d_drop(direntry);
 	} else if (rc == -ETXTBSY) {
@@ -609,7 +609,7 @@ int cifs_unlink(struct inode *inode, struct dentry *direntry)
 						CIFS_MOUNT_MAP_SPECIAL_CHR);
 			CIFSSMBClose(xid, pTcon, netfid);
 			if (direntry->d_inode)
-				direntry->d_inode->i_nlink--;
+				drop_nlink(direntry->d_inode);
 		}
 	} else if (rc == -EACCES) {
 		/* try only if r/o attribute set in local lookup data? */
@@ -663,7 +663,7 @@ int cifs_unlink(struct inode *inode, struct dentry *direntry)
 						CIFS_MOUNT_MAP_SPECIAL_CHR);
 			if (!rc) {
 				if (direntry->d_inode)
-					direntry->d_inode->i_nlink--;
+					drop_nlink(direntry->d_inode);
 			} else if (rc == -ETXTBSY) {
 				int oplock = FALSE;
 				__u16 netfid;
@@ -684,7 +684,7 @@ int cifs_unlink(struct inode *inode, struct dentry *direntry)
 						    CIFS_MOUNT_MAP_SPECIAL_CHR);
 					CIFSSMBClose(xid, pTcon, netfid);
 					if (direntry->d_inode)
-			                        direntry->d_inode->i_nlink--;
+						drop_nlink(direntry->d_inode);
 				}
 			/* BB if rc = -ETXTBUSY goto the rename logic BB */
 			}
@@ -816,7 +816,7 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
 			  cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
 
 	if (!rc) {
-		inode->i_nlink--;
+		drop_nlink(inode);
 		i_size_write(direntry->d_inode,0);
 		direntry->d_inode->i_nlink = 0;
 	}
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 8651ea6a23b7..0a2fd8bb7579 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -367,7 +367,7 @@ int coda_unlink(struct inode *dir, struct dentry *de)
         }
 
 	coda_dir_changed(dir, 0);
-	de->d_inode->i_nlink--;
+	drop_nlink(de->d_inode);
 	unlock_kernel();
 
         return 0;
@@ -394,7 +394,7 @@ int coda_rmdir(struct inode *dir, struct dentry *de)
         }
 
 	coda_dir_changed(dir, -1);
-	de->d_inode->i_nlink--;
+	drop_nlink(de->d_inode);
 	d_delete(de);
 	unlock_kernel();
 
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 4ca824985321..e1af5b4cf80c 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -326,7 +326,7 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
 		ext2_set_link(new_dir, new_de, new_page, old_inode);
 		new_inode->i_ctime = CURRENT_TIME_SEC;
 		if (dir_de)
-			new_inode->i_nlink--;
+			drop_nlink(new_inode);
 		inode_dec_link_count(new_inode);
 	} else {
 		if (dir_de) {
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 235e77b52ea5..14c55adfae83 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1621,7 +1621,7 @@ static inline void ext3_inc_count(handle_t *handle, struct inode *inode)
 
 static inline void ext3_dec_count(handle_t *handle, struct inode *inode)
 {
-	inode->i_nlink--;
+	drop_nlink(inode);
 }
 
 static int ext3_add_nondir(handle_t *handle,
@@ -1743,7 +1743,7 @@ retry:
 	inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize;
 	dir_block = ext3_bread (handle, inode, 0, 1, &err);
 	if (!dir_block) {
-		inode->i_nlink--; /* is this nlink == 0? */
+		drop_nlink(inode); /* is this nlink == 0? */
 		ext3_mark_inode_dirty(handle, inode);
 		iput (inode);
 		goto out_stop;
@@ -2053,7 +2053,7 @@ static int ext3_rmdir (struct inode * dir, struct dentry *dentry)
 	ext3_orphan_add(handle, inode);
 	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
 	ext3_mark_inode_dirty(handle, inode);
-	dir->i_nlink--;
+	drop_nlink(dir);
 	ext3_update_dx_flag(dir);
 	ext3_mark_inode_dirty(handle, dir);
 
@@ -2104,7 +2104,7 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry)
 	dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
 	ext3_update_dx_flag(dir);
 	ext3_mark_inode_dirty(handle, dir);
-	inode->i_nlink--;
+	drop_nlink(inode);
 	if (!inode->i_nlink)
 		ext3_orphan_add(handle, inode);
 	inode->i_ctime = dir->i_ctime;
@@ -2326,7 +2326,7 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
 	}
 
 	if (new_inode) {
-		new_inode->i_nlink--;
+		drop_nlink(new_inode);
 		new_inode->i_ctime = CURRENT_TIME_SEC;
 	}
 	old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC;
@@ -2337,9 +2337,9 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
 		PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino);
 		BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata");
 		ext3_journal_dirty_metadata(handle, dir_bh);
-		old_dir->i_nlink--;
+		drop_nlink(old_dir);
 		if (new_inode) {
-			new_inode->i_nlink--;
+			drop_nlink(new_inode);
 		} else {
 			new_dir->i_nlink++;
 			ext3_update_dx_flag(new_dir);
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index 7cd8cc03aea7..cfef6ad529a7 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -246,7 +246,7 @@ static int hfs_unlink(struct inode *dir, struct dentry *dentry)
 	if (res)
 		return res;
 
-	inode->i_nlink--;
+	drop_nlink(inode);
 	hfs_delete_inode(inode);
 	inode->i_ctime = CURRENT_TIME_SEC;
 	mark_inode_dirty(inode);
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 1f9ece0de326..9ceb0dfaa1cc 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -338,7 +338,7 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
 		return res;
 
 	if (inode->i_nlink > 0)
-		inode->i_nlink--;
+		drop_nlink(inode);
 	hfsplus_delete_inode(inode);
 	if (inode->i_ino != cnid && !inode->i_nlink) {
 		if (!atomic_read(&HFSPLUS_I(inode).opencnt)) {
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index 59e7dc182a0c..4078b0becc5e 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -434,7 +434,7 @@ again:
 		unlock_kernel();
 		return -ENOSPC;
 	default:
-		inode->i_nlink--;
+		drop_nlink(inode);
 		err = 0;
 	}
 	goto out;
@@ -494,7 +494,7 @@ static int hpfs_rmdir(struct inode *dir, struct dentry *dentry)
 		err = -ENOSPC;
 		break;
 	default:
-		dir->i_nlink--;
+		drop_nlink(dir);
 		inode->i_nlink = 0;
 		err = 0;
 	}
@@ -636,7 +636,7 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	hpfs_i(i)->i_parent_dir = new_dir->i_ino;
 	if (S_ISDIR(i->i_mode)) {
 		new_dir->i_nlink++;
-		old_dir->i_nlink--;
+		drop_nlink(old_dir);
 	}
 	if ((fnode = hpfs_map_fnode(i->i_sb, i->i_ino, &bh))) {
 		fnode->up = new_dir->i_ino;
diff --git a/fs/jffs/inode-v23.c b/fs/jffs/inode-v23.c
index 068ef0de8de2..3f7899ea4cba 100644
--- a/fs/jffs/inode-v23.c
+++ b/fs/jffs/inode-v23.c
@@ -1052,9 +1052,8 @@ jffs_remove(struct inode *dir, struct dentry *dentry, int type)
 
 	dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
 	mark_inode_dirty(dir);
-	inode->i_nlink--;
 	inode->i_ctime = dir->i_ctime;
-	mark_inode_dirty(inode);
+	inode_dec_link_count(inode);
 
 	d_delete(dentry);	/* This also frees the inode */
 
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index edd8371fc6a5..a5e9f2205b33 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -615,7 +615,7 @@ static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry)
 	}
 	ret = jffs2_unlink(dir_i, dentry);
 	if (!ret)
-		dir_i->i_nlink--;
+		drop_nlink(dir_i);
 	return ret;
 }
 
@@ -823,7 +823,7 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
 
 	if (victim_f) {
 		/* There was a victim. Kill it off nicely */
-		new_dentry->d_inode->i_nlink--;
+		drop_nlink(new_dentry->d_inode);
 		/* Don't oops if the victim was a dirent pointing to an
 		   inode which didn't exist. */
 		if (victim_f->inocache) {
@@ -862,7 +862,7 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
 	}
 
 	if (S_ISDIR(old_dentry->d_inode->i_mode))
-		old_dir_i->i_nlink--;
+		drop_nlink(old_dir_i);
 
 	new_dir_i->i_mtime = new_dir_i->i_ctime = old_dir_i->i_mtime = old_dir_i->i_ctime = ITIME(now);
 
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 295268ad231b..088b85976ac0 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -393,9 +393,8 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry)
 	/* update parent directory's link count corresponding
 	 * to ".." entry of the target directory deleted
 	 */
-	dip->i_nlink--;
 	dip->i_ctime = dip->i_mtime = CURRENT_TIME;
-	mark_inode_dirty(dip);
+	inode_dec_link_count(dip);
 
 	/*
 	 * OS/2 could have created EA and/or ACL
@@ -515,8 +514,7 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry)
 	mark_inode_dirty(dip);
 
 	/* update target's inode */
-	ip->i_nlink--;
-	mark_inode_dirty(ip);
+	inode_dec_link_count(ip);
 
 	/*
 	 *      commit zero link count object
@@ -835,7 +833,7 @@ static int jfs_link(struct dentry *old_dentry,
 	rc = txCommit(tid, 2, &iplist[0], 0);
 
 	if (rc) {
-		ip->i_nlink--;
+		ip->i_nlink--; /* never instantiated */
 		iput(ip);
 	} else
 		d_instantiate(dentry, ip);
@@ -1155,9 +1153,9 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 			      old_ip->i_ino, JFS_RENAME);
 		if (rc)
 			goto out4;
-		new_ip->i_nlink--;
+		drop_nlink(new_ip);
 		if (S_ISDIR(new_ip->i_mode)) {
-			new_ip->i_nlink--;
+			drop_nlink(new_ip);
 			if (new_ip->i_nlink) {
 				mutex_unlock(&JFS_IP(new_ip)->commit_mutex);
 				if (old_dir != new_dir)
@@ -1223,7 +1221,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		goto out4;
 	}
 	if (S_ISDIR(old_ip->i_mode)) {
-		old_dir->i_nlink--;
+		drop_nlink(old_dir);
 		if (old_dir != new_dir) {
 			/*
 			 * Change inode number of parent for moved directory
diff --git a/fs/libfs.c b/fs/libfs.c
index 3793aaa14577..9204feba75ac 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -275,7 +275,7 @@ int simple_unlink(struct inode *dir, struct dentry *dentry)
 	struct inode *inode = dentry->d_inode;
 
 	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
-	inode->i_nlink--;
+	drop_nlink(inode);
 	dput(dentry);
 	return 0;
 }
@@ -285,9 +285,9 @@ int simple_rmdir(struct inode *dir, struct dentry *dentry)
 	if (!simple_empty(dentry))
 		return -ENOTEMPTY;
 
-	dentry->d_inode->i_nlink--;
+	drop_nlink(dentry->d_inode);
 	simple_unlink(dir, dentry);
-	dir->i_nlink--;
+	drop_nlink(dir);
 	return 0;
 }
 
@@ -303,9 +303,9 @@ int simple_rename(struct inode *old_dir, struct dentry *old_dentry,
 	if (new_dentry->d_inode) {
 		simple_unlink(new_dir, new_dentry);
 		if (they_are_dirs)
-			old_dir->i_nlink--;
+			drop_nlink(old_dir);
 	} else if (they_are_dirs) {
-		old_dir->i_nlink--;
+		drop_nlink(old_dir);
 		new_dir->i_nlink++;
 	}
 
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index 5b6a4540a05b..299bb66e3bde 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -249,7 +249,7 @@ static int minix_rename(struct inode * old_dir, struct dentry *old_dentry,
 		minix_set_link(new_de, new_page, old_inode);
 		new_inode->i_ctime = CURRENT_TIME_SEC;
 		if (dir_de)
-			new_inode->i_nlink--;
+			drop_nlink(new_inode);
 		inode_dec_link_count(new_inode);
 	} else {
 		if (dir_de) {
diff --git a/fs/msdos/namei.c b/fs/msdos/namei.c
index d220165d4918..635613f2f65a 100644
--- a/fs/msdos/namei.c
+++ b/fs/msdos/namei.c
@@ -343,7 +343,7 @@ static int msdos_rmdir(struct inode *dir, struct dentry *dentry)
 	err = fat_remove_entries(dir, &sinfo);	/* and releases bh */
 	if (err)
 		goto out;
-	dir->i_nlink--;
+	drop_nlink(dir);
 
 	inode->i_nlink = 0;
 	inode->i_ctime = CURRENT_TIME_SEC;
@@ -549,7 +549,7 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name,
 			if (err)
 				goto error_dotdot;
 		}
-		old_dir->i_nlink--;
+		drop_nlink(old_dir);
 		if (!new_inode)
 			new_dir->i_nlink++;
 	}
@@ -566,10 +566,9 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name,
 		mark_inode_dirty(old_dir);
 
 	if (new_inode) {
+		drop_nlink(new_inode);
 		if (is_dir)
-			new_inode->i_nlink -= 2;
-		else
-			new_inode->i_nlink--;
+			drop_nlink(new_inode);
 		new_inode->i_ctime = ts;
 	}
 out:
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 7432f1a43f3d..26eecb86f9b0 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -843,7 +843,7 @@ static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode)
 	nfs_inode_return_delegation(inode);
 	if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
 		lock_kernel();
-		inode->i_nlink--;
+		drop_nlink(inode);
 		nfs_complete_unlink(dentry);
 		unlock_kernel();
 	}
@@ -1401,7 +1401,7 @@ static int nfs_safe_remove(struct dentry *dentry)
 		error = NFS_PROTO(dir)->remove(dir, &dentry->d_name);
 		/* The VFS may want to delete this inode */
 		if (error == 0)
-			inode->i_nlink--;
+			drop_nlink(inode);
 		nfs_mark_for_revalidate(inode);
 		nfs_end_data_update(inode);
 	} else
@@ -1639,7 +1639,7 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 			goto out;
 		}
 	} else
-		new_inode->i_nlink--;
+		drop_nlink(new_inode);
 
 go_ahead:
 	/*
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 849c3b4bb94a..40f83f53053a 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -739,7 +739,7 @@ static int ocfs2_link(struct dentry *old_dentry,
 	err = ocfs2_journal_dirty(handle, fe_bh);
 	if (err < 0) {
 		le16_add_cpu(&fe->i_links_count, -1);
-		inode->i_nlink--;
+		drop_nlink(inode);
 		mlog_errno(err);
 		goto bail;
 	}
@@ -749,7 +749,7 @@ static int ocfs2_link(struct dentry *old_dentry,
 			      parent_fe_bh, de_bh);
 	if (err) {
 		le16_add_cpu(&fe->i_links_count, -1);
-		inode->i_nlink--;
+		drop_nlink(inode);
 		mlog_errno(err);
 		goto bail;
 	}
diff --git a/fs/qnx4/namei.c b/fs/qnx4/namei.c
index c3d83f67154a..ad5afa4ea8f3 100644
--- a/fs/qnx4/namei.c
+++ b/fs/qnx4/namei.c
@@ -189,8 +189,7 @@ int qnx4_rmdir(struct inode *dir, struct dentry *dentry)
 	inode->i_nlink = 0;
 	mark_inode_dirty(inode);
 	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
-	dir->i_nlink--;
-	mark_inode_dirty(dir);
+	inode_dec_link_count(dir);
 	retval = 0;
 
       end_rmdir:
@@ -234,9 +233,8 @@ int qnx4_unlink(struct inode *dir, struct dentry *dentry)
 	mark_buffer_dirty(bh);
 	dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
 	mark_inode_dirty(dir);
-	inode->i_nlink--;
 	inode->i_ctime = dir->i_ctime;
-	mark_inode_dirty(inode);
+	inode_dec_link_count(inode);
 	retval = 0;
 
 end_unlink:
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index c61710e49c62..c76d427e027b 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -20,7 +20,7 @@
 #include <linux/quotaops.h>
 
 #define INC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) { i->i_nlink++; if (i->i_nlink >= REISERFS_LINK_MAX) i->i_nlink=1; }
-#define DEC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) i->i_nlink--;
+#define DEC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) drop_nlink(i);
 
 // directory item contains array of entry headers. This performs
 // binary search through that array
@@ -994,7 +994,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
 		inode->i_nlink = 1;
 	}
 
-	inode->i_nlink--;
+	drop_nlink(inode);
 
 	/*
 	 * we schedule before doing the add_save_link call, save the link
@@ -1475,7 +1475,7 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		if (S_ISDIR(new_dentry_inode->i_mode)) {
 			new_dentry_inode->i_nlink = 0;
 		} else {
-			new_dentry_inode->i_nlink--;
+			drop_nlink(new_dentry_inode);
 		}
 		new_dentry_inode->i_ctime = ctime;
 		savelink = new_dentry_inode->i_nlink;
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index b8a73f716fbe..f7c08db8e34c 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -250,7 +250,7 @@ static int sysv_rename(struct inode * old_dir, struct dentry * old_dentry,
 		sysv_set_link(new_de, new_page, old_inode);
 		new_inode->i_ctime = CURRENT_TIME_SEC;
 		if (dir_de)
-			new_inode->i_nlink--;
+			drop_nlink(new_inode);
 		inode_dec_link_count(new_inode);
 	} else {
 		if (dir_de) {
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index ab9a7629d23e..d14d25534aa8 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -878,8 +878,7 @@ static int udf_rmdir(struct inode * dir, struct dentry * dentry)
 			inode->i_nlink);
 	inode->i_nlink = 0;
 	inode->i_size = 0;
-	mark_inode_dirty(inode);
-	dir->i_nlink --;
+	inode_dec_link_count(inode);
 	inode->i_ctime = dir->i_ctime = dir->i_mtime = current_fs_time(dir->i_sb);
 	mark_inode_dirty(dir);
 
@@ -923,8 +922,7 @@ static int udf_unlink(struct inode * dir, struct dentry * dentry)
 		goto end_unlink;
 	dir->i_ctime = dir->i_mtime = current_fs_time(dir->i_sb);
 	mark_inode_dirty(dir);
-	inode->i_nlink--;
-	mark_inode_dirty(inode);
+	inode_dec_link_count(inode);
 	inode->i_ctime = dir->i_ctime;
 	retval = 0;
 
@@ -1101,8 +1099,7 @@ out:
 	return err;
 
 out_no_entry:
-	inode->i_nlink--;
-	mark_inode_dirty(inode);
+	inode_dec_link_count(inode);
 	iput(inode);
 	goto out;
 }
@@ -1261,9 +1258,8 @@ static int udf_rename (struct inode * old_dir, struct dentry * old_dentry,
 
 	if (new_inode)
 	{
-		new_inode->i_nlink--;
 		new_inode->i_ctime = current_fs_time(new_inode->i_sb);
-		mark_inode_dirty(new_inode);
+		inode_dec_link_count(new_inode);
 	}
 	old_dir->i_ctime = old_dir->i_mtime = current_fs_time(old_dir->i_sb);
 	mark_inode_dirty(old_dir);
@@ -1279,12 +1275,10 @@ static int udf_rename (struct inode * old_dir, struct dentry * old_dentry,
 		}
 		else
 			mark_buffer_dirty_inode(dir_bh, old_inode);
-		old_dir->i_nlink --;
-		mark_inode_dirty(old_dir);
+		inode_dec_link_count(old_dir);
 		if (new_inode)
 		{
-			new_inode->i_nlink --;
-			mark_inode_dirty(new_inode);
+			inode_dec_link_count(new_inode);
 		}
 		else
 		{
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index d344b411e261..e84c0ecf0730 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -308,7 +308,7 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		ufs_set_link(new_dir, new_de, new_page, old_inode);
 		new_inode->i_ctime = CURRENT_TIME_SEC;
 		if (dir_de)
-			new_inode->i_nlink--;
+			drop_nlink(new_inode);
 		inode_dec_link_count(new_inode);
 	} else {
 		if (dir_de) {
diff --git a/fs/vfat/namei.c b/fs/vfat/namei.c
index 9a8f48bae956..090d74ffa061 100644
--- a/fs/vfat/namei.c
+++ b/fs/vfat/namei.c
@@ -782,7 +782,7 @@ static int vfat_rmdir(struct inode *dir, struct dentry *dentry)
 	err = fat_remove_entries(dir, &sinfo);	/* and releases bh */
 	if (err)
 		goto out;
-	dir->i_nlink--;
+	drop_nlink(dir);
 
 	inode->i_nlink = 0;
 	inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC;
@@ -930,7 +930,7 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
 			if (err)
 				goto error_dotdot;
 		}
-		old_dir->i_nlink--;
+		drop_nlink(old_dir);
 		if (!new_inode)
  			new_dir->i_nlink++;
 	}
@@ -947,10 +947,9 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
 		mark_inode_dirty(old_dir);
 
 	if (new_inode) {
+		drop_nlink(new_inode);
 		if (is_dir)
-			new_inode->i_nlink -= 2;
-		else
-			new_inode->i_nlink--;
+			drop_nlink(new_inode);
 		new_inode->i_ctime = ts;
 	}
 out:
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 4bb70871873f..26d3c61116c0 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1225,9 +1225,14 @@ static inline void inode_inc_link_count(struct inode *inode)
 	mark_inode_dirty(inode);
 }
 
-static inline void inode_dec_link_count(struct inode *inode)
+static inline void drop_nlink(struct inode *inode)
 {
 	inode->i_nlink--;
+}
+
+static inline void inode_dec_link_count(struct inode *inode)
+{
+	drop_nlink(inode);
 	mark_inode_dirty(inode);
 }
 
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 840f8a6fb85f..10aa8eeeb112 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -307,7 +307,7 @@ static int mqueue_unlink(struct inode *dir, struct dentry *dentry)
 
 	dir->i_ctime = dir->i_mtime = dir->i_atime = CURRENT_TIME;
 	dir->i_size -= DIRENT_SIZE;
-  	inode->i_nlink--;
+  	drop_nlink(inode);
   	dput(dentry);
   	return 0;
 }
diff --git a/mm/shmem.c b/mm/shmem.c
index b96de69f236b..908dd947b1ea 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1772,7 +1772,7 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry)
 
 	dir->i_size -= BOGO_DIRENT_SIZE;
 	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
-	inode->i_nlink--;
+	drop_nlink(inode);
 	dput(dentry);	/* Undo the count from "create" - this does all the work */
 	return 0;
 }
@@ -1782,8 +1782,8 @@ static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
 	if (!simple_empty(dentry))
 		return -ENOTEMPTY;
 
-	dentry->d_inode->i_nlink--;
-	dir->i_nlink--;
+	drop_nlink(dentry->d_inode);
+	drop_nlink(dir);
 	return shmem_unlink(dir, dentry);
 }
 
@@ -1804,9 +1804,9 @@ static int shmem_rename(struct inode *old_dir, struct dentry *old_dentry, struct
 	if (new_dentry->d_inode) {
 		(void) shmem_unlink(new_dir, new_dentry);
 		if (they_are_dirs)
-			old_dir->i_nlink--;
+			drop_nlink(old_dir);
 	} else if (they_are_dirs) {
-		old_dir->i_nlink--;
+		drop_nlink(old_dir);
 		new_dir->i_nlink++;
 	}
 
-- 
cgit v1.2.3


From d8c76e6f45c111c32a4b3e50a2adc9210737b0d8 Mon Sep 17 00:00:00 2001
From: Dave Hansen <haveblue@us.ibm.com>
Date: Sat, 30 Sep 2006 23:29:04 -0700
Subject: [PATCH] r/o bind mount prepwork: inc_nlink() helper

This is mostly included for parity with dec_nlink(), where we will have some
more hooks.  This one should stay pretty darn straightforward for now.

Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Acked-by: Christoph Hellwig <hch@lst.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/infiniband/hw/ipath/ipath_fs.c | 4 ++--
 drivers/usb/core/inode.c               | 4 ++--
 fs/9p/vfs_inode.c                      | 2 +-
 fs/autofs/root.c                       | 2 +-
 fs/autofs4/root.c                      | 2 +-
 fs/bfs/dir.c                           | 2 +-
 fs/cifs/inode.c                        | 2 +-
 fs/coda/dir.c                          | 2 +-
 fs/configfs/dir.c                      | 4 ++--
 fs/configfs/mount.c                    | 2 +-
 fs/debugfs/inode.c                     | 4 ++--
 fs/ext3/namei.c                        | 6 +++---
 fs/fuse/control.c                      | 2 +-
 fs/hfsplus/dir.c                       | 2 +-
 fs/hpfs/namei.c                        | 4 ++--
 fs/hugetlbfs/inode.c                   | 4 ++--
 fs/jffs2/dir.c                         | 6 +++---
 fs/jffs2/fs.c                          | 6 +++---
 fs/jfs/namei.c                         | 6 +++---
 fs/libfs.c                             | 4 ++--
 fs/msdos/namei.c                       | 4 ++--
 fs/ocfs2/dlm/dlmfs.c                   | 6 +++---
 fs/ocfs2/namei.c                       | 8 ++++----
 fs/ramfs/inode.c                       | 4 ++--
 fs/reiserfs/namei.c                    | 6 +++---
 fs/sysfs/dir.c                         | 4 ++--
 fs/sysfs/mount.c                       | 2 +-
 fs/udf/inode.c                         | 2 +-
 fs/udf/namei.c                         | 6 +++---
 fs/vfat/namei.c                        | 4 ++--
 include/linux/fs.h                     | 7 ++++++-
 ipc/mqueue.c                           | 2 +-
 kernel/cpuset.c                        | 8 ++++----
 mm/shmem.c                             | 8 ++++----
 net/sunrpc/rpc_pipe.c                  | 6 +++---
 security/inode.c                       | 4 ++--
 security/selinux/selinuxfs.c           | 4 ++--
 37 files changed, 80 insertions(+), 75 deletions(-)

(limited to 'mm')

diff --git a/drivers/infiniband/hw/ipath/ipath_fs.c b/drivers/infiniband/hw/ipath/ipath_fs.c
index a507d0b5be6c..d9ff283f725e 100644
--- a/drivers/infiniband/hw/ipath/ipath_fs.c
+++ b/drivers/infiniband/hw/ipath/ipath_fs.c
@@ -66,8 +66,8 @@ static int ipathfs_mknod(struct inode *dir, struct dentry *dentry,
 	inode->i_private = data;
 	if ((mode & S_IFMT) == S_IFDIR) {
 		inode->i_op = &simple_dir_inode_operations;
-		inode->i_nlink++;
-		dir->i_nlink++;
+		inc_nlink(inode);
+		inc_nlink(dir);
 	}
 
 	inode->i_fop = fops;
diff --git a/drivers/usb/core/inode.c b/drivers/usb/core/inode.c
index 88002e45a6b4..7c77c2d8d300 100644
--- a/drivers/usb/core/inode.c
+++ b/drivers/usb/core/inode.c
@@ -263,7 +263,7 @@ static struct inode *usbfs_get_inode (struct super_block *sb, int mode, dev_t de
 			inode->i_fop = &simple_dir_operations;
 
 			/* directory inodes start off with i_nlink == 2 (for "." entry) */
-			inode->i_nlink++;
+			inc_nlink(inode);
 			break;
 		}
 	}
@@ -295,7 +295,7 @@ static int usbfs_mkdir (struct inode *dir, struct dentry *dentry, int mode)
 	mode = (mode & (S_IRWXUGO | S_ISVTX)) | S_IFDIR;
 	res = usbfs_mknod (dir, dentry, mode, 0);
 	if (!res)
-		dir->i_nlink++;
+		inc_nlink(dir);
 	return res;
 }
 
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 7a7ec2d1d2f4..5241c600ce28 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -233,7 +233,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
 			inode->i_op = &v9fs_symlink_inode_operations;
 			break;
 		case S_IFDIR:
-			inode->i_nlink++;
+			inc_nlink(inode);
 			if(v9ses->extended)
 				inode->i_op = &v9fs_dir_inode_operations_ext;
 			else
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index 54ad70731927..368a1c33a3c8 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -466,7 +466,7 @@ static int autofs_root_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	ent->dentry = dentry;
 	autofs_hash_insert(dh,ent);
 
-	dir->i_nlink++;
+	inc_nlink(dir);
 	d_instantiate(dentry, iget(dir->i_sb,ino));
 	unlock_kernel();
 
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 348bec0982b0..e21bb4668201 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -713,7 +713,7 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	if (p_ino && dentry->d_parent != dentry)
 		atomic_inc(&p_ino->count);
 	ino->inode = inode;
-	dir->i_nlink++;
+	inc_nlink(dir);
 	dir->i_mtime = CURRENT_TIME;
 
 	return 0;
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index ce05d1643dd1..a650f1d0b85e 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -163,7 +163,7 @@ static int bfs_link(struct dentry * old, struct inode * dir, struct dentry * new
 		unlock_kernel();
 		return err;
 	}
-	inode->i_nlink++;
+	inc_nlink(inode);
 	inode->i_ctime = CURRENT_TIME_SEC;
 	mark_inode_dirty(inode);
 	atomic_inc(&inode->i_count);
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 74441a17e186..76b7fb34101a 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -735,7 +735,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 		cFYI(1, ("cifs_mkdir returned 0x%x", rc));
 		d_drop(direntry);
 	} else {
-		inode->i_nlink++;
+		inc_nlink(inode);
 		if (pTcon->ses->capabilities & CAP_UNIX)
 			rc = cifs_get_inode_info_unix(&newinode, full_path,
 						      inode->i_sb,xid);
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 0a2fd8bb7579..0102b28a15fb 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -304,7 +304,7 @@ static int coda_link(struct dentry *source_de, struct inode *dir_inode,
 	coda_dir_changed(dir_inode, 0);
 	atomic_inc(&inode->i_count);
 	d_instantiate(de, inode);
-	inode->i_nlink++;
+	inc_nlink(inode);
         
 out:
 	unlock_kernel();
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 816e8ef64560..8a3b6a1a6ad1 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -139,7 +139,7 @@ static int init_dir(struct inode * inode)
 	inode->i_fop = &configfs_dir_operations;
 
 	/* directory inodes start off with i_nlink == 2 (for "." entry) */
-	inode->i_nlink++;
+	inc_nlink(inode);
 	return 0;
 }
 
@@ -169,7 +169,7 @@ static int create_dir(struct config_item * k, struct dentry * p,
 	if (!error) {
 		error = configfs_create(d, mode, init_dir);
 		if (!error) {
-			p->d_inode->i_nlink++;
+			inc_nlink(p->d_inode);
 			(d)->d_op = &configfs_dentry_ops;
 		} else {
 			struct configfs_dirent *sd = d->d_fsdata;
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index 3e5fe843e1df..68bd5c93ca52 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -84,7 +84,7 @@ static int configfs_fill_super(struct super_block *sb, void *data, int silent)
 		inode->i_op = &configfs_dir_inode_operations;
 		inode->i_fop = &configfs_dir_operations;
 		/* directory inodes start off with i_nlink == 2 (for "." entry) */
-		inode->i_nlink++;
+		inc_nlink(inode);
 	} else {
 		pr_debug("configfs: could not get root inode\n");
 		return -ENOMEM;
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 269e649e6dc6..ecf3da9edf21 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -54,7 +54,7 @@ static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t d
 			inode->i_fop = &simple_dir_operations;
 
 			/* directory inodes start off with i_nlink == 2 (for "." entry) */
-			inode->i_nlink++;
+			inc_nlink(inode);
 			break;
 		}
 	}
@@ -87,7 +87,7 @@ static int debugfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	mode = (mode & (S_IRWXUGO | S_ISVTX)) | S_IFDIR;
 	res = debugfs_mknod(dir, dentry, mode, 0);
 	if (!res)
-		dir->i_nlink++;
+		inc_nlink(dir);
 	return res;
 }
 
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 14c55adfae83..b45c88bd5f73 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1616,7 +1616,7 @@ static int ext3_delete_entry (handle_t *handle,
  */
 static inline void ext3_inc_count(handle_t *handle, struct inode *inode)
 {
-	inode->i_nlink++;
+	inc_nlink(inode);
 }
 
 static inline void ext3_dec_count(handle_t *handle, struct inode *inode)
@@ -1775,7 +1775,7 @@ retry:
 		iput (inode);
 		goto out_stop;
 	}
-	dir->i_nlink++;
+	inc_nlink(dir);
 	ext3_update_dx_flag(dir);
 	ext3_mark_inode_dirty(handle, dir);
 	d_instantiate(dentry, inode);
@@ -2341,7 +2341,7 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
 		if (new_inode) {
 			drop_nlink(new_inode);
 		} else {
-			new_dir->i_nlink++;
+			inc_nlink(new_dir);
 			ext3_update_dx_flag(new_dir);
 			ext3_mark_inode_dirty(handle, new_dir);
 		}
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 79ec1f23d4d2..16b39c053d47 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -116,7 +116,7 @@ int fuse_ctl_add_conn(struct fuse_conn *fc)
 		return 0;
 
 	parent = fuse_control_sb->s_root;
-	parent->d_inode->i_nlink++;
+	inc_nlink(parent->d_inode);
 	sprintf(name, "%llu", (unsigned long long) fc->id);
 	parent = fuse_ctl_add_dentry(parent, fc, name, S_IFDIR | 0500, 2,
 				     &simple_dir_inode_operations,
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 9ceb0dfaa1cc..99b4ed1b87d2 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -298,7 +298,7 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir,
 	if (res)
 		return res;
 
-	inode->i_nlink++;
+	inc_nlink(inode);
 	hfsplus_instantiate(dst_dentry, inode, cnid);
 	atomic_inc(&inode->i_count);
 	inode->i_ctime = CURRENT_TIME_SEC;
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index 4078b0becc5e..25dd6f81eca7 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -89,7 +89,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	brelse(bh);
 	hpfs_mark_4buffers_dirty(&qbh0);
 	hpfs_brelse4(&qbh0);
-	dir->i_nlink++;
+	inc_nlink(dir);
 	insert_inode_hash(result);
 
 	if (result->i_uid != current->fsuid ||
@@ -635,7 +635,7 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	end:
 	hpfs_i(i)->i_parent_dir = new_dir->i_ino;
 	if (S_ISDIR(i->i_mode)) {
-		new_dir->i_nlink++;
+		inc_nlink(new_dir);
 		drop_nlink(old_dir);
 	}
 	if ((fnode = hpfs_map_fnode(i->i_sb, i->i_ino, &bh))) {
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index f5b8f329aca6..5e03b2f67b93 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -377,7 +377,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
 			inode->i_fop = &simple_dir_operations;
 
 			/* directory inodes start off with i_nlink == 2 (for "." entry) */
-			inode->i_nlink++;
+			inc_nlink(inode);
 			break;
 		case S_IFLNK:
 			inode->i_op = &page_symlink_inode_operations;
@@ -418,7 +418,7 @@ static int hugetlbfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 {
 	int retval = hugetlbfs_mknod(dir, dentry, mode | S_IFDIR, 0);
 	if (!retval)
-		dir->i_nlink++;
+		inc_nlink(dir);
 	return retval;
 }
 
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index a5e9f2205b33..9def6adf4a5d 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -588,7 +588,7 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
 	}
 
 	dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime));
-	dir_i->i_nlink++;
+	inc_nlink(dir_i);
 
 	jffs2_free_raw_dirent(rd);
 
@@ -836,7 +836,7 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
 	/* If it was a directory we moved, and there was no victim,
 	   increase i_nlink on its new parent */
 	if (S_ISDIR(old_dentry->d_inode->i_mode) && !victim_f)
-		new_dir_i->i_nlink++;
+		inc_nlink(new_dir_i);
 
 	/* Unlink the original */
 	ret = jffs2_do_unlink(c, JFFS2_INODE_INFO(old_dir_i),
@@ -848,7 +848,7 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
 		/* Oh shit. We really ought to make a single node which can do both atomically */
 		struct jffs2_inode_info *f = JFFS2_INODE_INFO(old_dentry->d_inode);
 		down(&f->sem);
-		old_dentry->d_inode->i_nlink++;
+		inc_nlink(old_dentry->d_inode);
 		if (f->inocache)
 			f->inocache->nlink++;
 		up(&f->sem);
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 72d9909d95ff..7bc1a4201c0c 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -277,13 +277,13 @@ void jffs2_read_inode (struct inode *inode)
 
 		for (fd=f->dents; fd; fd = fd->next) {
 			if (fd->type == DT_DIR && fd->ino)
-				inode->i_nlink++;
+				inc_nlink(inode);
 		}
 		/* and '..' */
-		inode->i_nlink++;
+		inc_nlink(inode);
 		/* Root dir gets i_nlink 3 for some reason */
 		if (inode->i_ino == 1)
-			inode->i_nlink++;
+			inc_nlink(inode);
 
 		inode->i_op = &jffs2_dir_inode_operations;
 		inode->i_fop = &jffs2_dir_operations;
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 088b85976ac0..8cef88170aa4 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -292,7 +292,7 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode)
 	mark_inode_dirty(ip);
 
 	/* update parent directory inode */
-	dip->i_nlink++;		/* for '..' from child directory */
+	inc_nlink(dip);		/* for '..' from child directory */
 	dip->i_ctime = dip->i_mtime = CURRENT_TIME;
 	mark_inode_dirty(dip);
 
@@ -822,7 +822,7 @@ static int jfs_link(struct dentry *old_dentry,
 		goto free_dname;
 
 	/* update object inode */
-	ip->i_nlink++;		/* for new link */
+	inc_nlink(ip);		/* for new link */
 	ip->i_ctime = CURRENT_TIME;
 	dir->i_ctime = dir->i_mtime = CURRENT_TIME;
 	mark_inode_dirty(dir);
@@ -1206,7 +1206,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 			goto out4;
 		}
 		if (S_ISDIR(old_ip->i_mode))
-			new_dir->i_nlink++;
+			inc_nlink(new_dir);
 	}
 	/*
 	 * Remove old directory entry
diff --git a/fs/libfs.c b/fs/libfs.c
index 9204feba75ac..bd08e0e64a8c 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -243,7 +243,7 @@ int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *den
 	struct inode *inode = old_dentry->d_inode;
 
 	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
-	inode->i_nlink++;
+	inc_nlink(inode);
 	atomic_inc(&inode->i_count);
 	dget(dentry);
 	d_instantiate(dentry, inode);
@@ -306,7 +306,7 @@ int simple_rename(struct inode *old_dir, struct dentry *old_dentry,
 			drop_nlink(old_dir);
 	} else if (they_are_dirs) {
 		drop_nlink(old_dir);
-		new_dir->i_nlink++;
+		inc_nlink(new_dir);
 	}
 
 	old_dir->i_ctime = old_dir->i_mtime = new_dir->i_ctime =
diff --git a/fs/msdos/namei.c b/fs/msdos/namei.c
index 635613f2f65a..fa868c755907 100644
--- a/fs/msdos/namei.c
+++ b/fs/msdos/namei.c
@@ -389,7 +389,7 @@ static int msdos_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	err = msdos_add_entry(dir, msdos_name, 1, is_hid, cluster, &ts, &sinfo);
 	if (err)
 		goto out_free;
-	dir->i_nlink++;
+	inc_nlink(dir);
 
 	inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
 	brelse(sinfo.bh);
@@ -551,7 +551,7 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name,
 		}
 		drop_nlink(old_dir);
 		if (!new_inode)
-			new_dir->i_nlink++;
+			inc_nlink(new_dir);
 	}
 
 	err = fat_remove_entries(old_dir, &old_sinfo);	/* and releases bh */
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c
index 0368c6402182..16b8d1ba7066 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlm/dlmfs.c
@@ -338,7 +338,7 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb)
 		inode->i_blocks = 0;
 		inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-		inode->i_nlink++;
+		inc_nlink(inode);
 
 		inode->i_fop = &simple_dir_operations;
 		inode->i_op = &dlmfs_root_inode_operations;
@@ -395,7 +395,7 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
 
 		/* directory inodes start off with i_nlink ==
 		 * 2 (for "." entry) */
-		inode->i_nlink++;
+		inc_nlink(inode);
 		break;
 	}
 
@@ -449,7 +449,7 @@ static int dlmfs_mkdir(struct inode * dir,
 	}
 	ip->ip_dlm = dlm;
 
-	dir->i_nlink++;
+	inc_nlink(dir);
 	d_instantiate(dentry, inode);
 	dget(dentry);	/* Extra count - pin the dentry in core */
 
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 40f83f53053a..8c370a39e0c1 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -429,7 +429,7 @@ static int ocfs2_mknod(struct inode *dir,
 			mlog_errno(status);
 			goto leave;
 		}
-		dir->i_nlink++;
+		inc_nlink(dir);
 	}
 
 	status = ocfs2_add_entry(handle, dentry, inode,
@@ -730,7 +730,7 @@ static int ocfs2_link(struct dentry *old_dentry,
 		goto bail;
 	}
 
-	inode->i_nlink++;
+	inc_nlink(inode);
 	inode->i_ctime = CURRENT_TIME;
 	fe->i_links_count = cpu_to_le16(inode->i_nlink);
 	fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
@@ -952,7 +952,7 @@ static int ocfs2_unlink(struct inode *dir,
 						parent_node_bh);
 		if (status < 0) {
 			mlog_errno(status);
-			dir->i_nlink++;
+			inc_nlink(dir);
 		}
 	}
 
@@ -1382,7 +1382,7 @@ static int ocfs2_rename(struct inode *old_dir,
 		if (new_inode) {
 			new_inode->i_nlink--;
 		} else {
-			new_dir->i_nlink++;
+			inc_nlink(new_dir);
 			mark_inode_dirty(new_dir);
 		}
 	}
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index bc0e51662424..2faf4cdf61b0 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -75,7 +75,7 @@ struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev)
 			inode->i_fop = &simple_dir_operations;
 
 			/* directory inodes start off with i_nlink == 2 (for "." entry) */
-			inode->i_nlink++;
+			inc_nlink(inode);
 			break;
 		case S_IFLNK:
 			inode->i_op = &page_symlink_inode_operations;
@@ -113,7 +113,7 @@ static int ramfs_mkdir(struct inode * dir, struct dentry * dentry, int mode)
 {
 	int retval = ramfs_mknod(dir, dentry, mode | S_IFDIR, 0);
 	if (!retval)
-		dir->i_nlink++;
+		inc_nlink(dir);
 	return retval;
 }
 
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index c76d427e027b..cf92e89515f2 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -19,7 +19,7 @@
 #include <linux/smp_lock.h>
 #include <linux/quotaops.h>
 
-#define INC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) { i->i_nlink++; if (i->i_nlink >= REISERFS_LINK_MAX) i->i_nlink=1; }
+#define INC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) { inc_nlink(i); if (i->i_nlink >= REISERFS_LINK_MAX) i->i_nlink=1; }
 #define DEC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) drop_nlink(i);
 
 // directory item contains array of entry headers. This performs
@@ -1006,7 +1006,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
 	    reiserfs_cut_from_item(&th, &path, &(de.de_entry_key), dir, NULL,
 				   0);
 	if (retval < 0) {
-		inode->i_nlink++;
+		inc_nlink(inode);
 		goto end_unlink;
 	}
 	inode->i_ctime = CURRENT_TIME_SEC;
@@ -1143,7 +1143,7 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir,
 	}
 
 	/* inc before scheduling so reiserfs_unlink knows we are here */
-	inode->i_nlink++;
+	inc_nlink(inode);
 
 	retval = journal_begin(&th, dir->i_sb, jbegin_count);
 	if (retval) {
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 5f3d725d1125..3aa3434621ca 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -103,7 +103,7 @@ static int init_dir(struct inode * inode)
 	inode->i_fop = &sysfs_dir_operations;
 
 	/* directory inodes start off with i_nlink == 2 (for "." entry) */
-	inode->i_nlink++;
+	inc_nlink(inode);
 	return 0;
 }
 
@@ -137,7 +137,7 @@ static int create_dir(struct kobject * k, struct dentry * p,
 		if (!error) {
 			error = sysfs_create(*d, mode, init_dir);
 			if (!error) {
-				p->d_inode->i_nlink++;
+				inc_nlink(p->d_inode);
 				(*d)->d_op = &sysfs_dentry_ops;
 				d_rehash(*d);
 			}
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 40190c489271..20551a1b8a56 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -49,7 +49,7 @@ static int sysfs_fill_super(struct super_block *sb, void *data, int silent)
 		inode->i_op = &sysfs_dir_inode_operations;
 		inode->i_fop = &sysfs_dir_operations;
 		/* directory inodes start off with i_nlink == 2 (for "." entry) */
-		inode->i_nlink++;	
+		inc_nlink(inode);
 	} else {
 		pr_debug("sysfs: could not get root inode\n");
 		return -ENOMEM;
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index b223b32db991..ae21a0e59e95 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -1165,7 +1165,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
 			inode->i_op = &udf_dir_inode_operations;
 			inode->i_fop = &udf_dir_operations;
 			inode->i_mode |= S_IFDIR;
-			inode->i_nlink ++;
+			inc_nlink(inode);
 			break;
 		}
 		case ICBTAG_FILE_TYPE_REALTIME:
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index d14d25534aa8..e40c95e65117 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -762,7 +762,7 @@ static int udf_mkdir(struct inode * dir, struct dentry * dentry, int mode)
 		cpu_to_le32(UDF_I_UNIQUE(inode) & 0x00000000FFFFFFFFUL);
 	cfi.fileCharacteristics |= FID_FILE_CHAR_DIRECTORY;
 	udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL);
-	dir->i_nlink++;
+	inc_nlink(dir);
 	mark_inode_dirty(dir);
 	d_instantiate(dentry, inode);
 	if (fibh.sbh != fibh.ebh)
@@ -1147,7 +1147,7 @@ static int udf_link(struct dentry * old_dentry, struct inode * dir,
 	if (fibh.sbh != fibh.ebh)
 		udf_release_data(fibh.ebh);
 	udf_release_data(fibh.sbh);
-	inode->i_nlink ++;
+	inc_nlink(inode);
 	inode->i_ctime = current_fs_time(inode->i_sb);
 	mark_inode_dirty(inode);
 	atomic_inc(&inode->i_count);
@@ -1282,7 +1282,7 @@ static int udf_rename (struct inode * old_dir, struct dentry * old_dentry,
 		}
 		else
 		{
-			new_dir->i_nlink ++;
+			inc_nlink(new_dir);
 			mark_inode_dirty(new_dir);
 		}
 	}
diff --git a/fs/vfat/namei.c b/fs/vfat/namei.c
index 090d74ffa061..5846ba2d5d9f 100644
--- a/fs/vfat/namei.c
+++ b/fs/vfat/namei.c
@@ -837,7 +837,7 @@ static int vfat_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	if (err)
 		goto out_free;
 	dir->i_version++;
-	dir->i_nlink++;
+	inc_nlink(dir);
 
 	inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
 	brelse(sinfo.bh);
@@ -932,7 +932,7 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
 		}
 		drop_nlink(old_dir);
 		if (!new_inode)
- 			new_dir->i_nlink++;
+ 			inc_nlink(new_dir);
 	}
 
 	err = fat_remove_entries(old_dir, &old_sinfo);	/* and releases bh */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 26d3c61116c0..6a5267da565f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1219,9 +1219,14 @@ static inline void mark_inode_dirty_sync(struct inode *inode)
 	__mark_inode_dirty(inode, I_DIRTY_SYNC);
 }
 
-static inline void inode_inc_link_count(struct inode *inode)
+static inline void inc_nlink(struct inode *inode)
 {
 	inode->i_nlink++;
+}
+
+static inline void inode_inc_link_count(struct inode *inode)
+{
+	inc_nlink(inode);
 	mark_inode_dirty(inode);
 }
 
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 10aa8eeeb112..d75d0ba83360 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -168,7 +168,7 @@ static struct inode *mqueue_get_inode(struct super_block *sb, int mode,
 			/* all is ok */
 			info->user = get_uid(u);
 		} else if (S_ISDIR(mode)) {
-			inode->i_nlink++;
+			inc_nlink(inode);
 			/* Some things misbehave if size == 0 on a directory */
 			inode->i_size = 2 * DIRENT_SIZE;
 			inode->i_op = &mqueue_dir_inode_operations;
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 8c3c400cce91..9d850ae13b1b 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -377,7 +377,7 @@ static int cpuset_fill_super(struct super_block *sb, void *unused_data,
 		inode->i_op = &simple_dir_inode_operations;
 		inode->i_fop = &simple_dir_operations;
 		/* directories start off with i_nlink == 2 (for "." entry) */
-		inode->i_nlink++;
+		inc_nlink(inode);
 	} else {
 		return -ENOMEM;
 	}
@@ -1565,7 +1565,7 @@ static int cpuset_create_file(struct dentry *dentry, int mode)
 		inode->i_fop = &simple_dir_operations;
 
 		/* start off with i_nlink == 2 (for "." entry) */
-		inode->i_nlink++;
+		inc_nlink(inode);
 	} else if (S_ISREG(mode)) {
 		inode->i_size = 0;
 		inode->i_fop = &cpuset_file_operations;
@@ -1598,7 +1598,7 @@ static int cpuset_create_dir(struct cpuset *cs, const char *name, int mode)
 	error = cpuset_create_file(dentry, S_IFDIR | mode);
 	if (!error) {
 		dentry->d_fsdata = cs;
-		parent->d_inode->i_nlink++;
+		inc_nlink(parent->d_inode);
 		cs->dentry = dentry;
 	}
 	dput(dentry);
@@ -2033,7 +2033,7 @@ int __init cpuset_init(void)
 	}
 	root = cpuset_mount->mnt_sb->s_root;
 	root->d_fsdata = &top_cpuset;
-	root->d_inode->i_nlink++;
+	inc_nlink(root->d_inode);
 	top_cpuset.dentry = root;
 	root->d_inode->i_op = &cpuset_dir_inode_operations;
 	number_of_cpusets = 1;
diff --git a/mm/shmem.c b/mm/shmem.c
index 908dd947b1ea..bb8ca7ef7094 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1379,7 +1379,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
 							&sbinfo->policy_nodes);
 			break;
 		case S_IFDIR:
-			inode->i_nlink++;
+			inc_nlink(inode);
 			/* Some things misbehave if size == 0 on a directory */
 			inode->i_size = 2 * BOGO_DIRENT_SIZE;
 			inode->i_op = &shmem_dir_inode_operations;
@@ -1715,7 +1715,7 @@ static int shmem_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 
 	if ((error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0)))
 		return error;
-	dir->i_nlink++;
+	inc_nlink(dir);
 	return 0;
 }
 
@@ -1750,7 +1750,7 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr
 
 	dir->i_size += BOGO_DIRENT_SIZE;
 	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
-	inode->i_nlink++;
+	inc_nlink(inode);
 	atomic_inc(&inode->i_count);	/* New dentry reference */
 	dget(dentry);		/* Extra pinning count for the created dentry */
 	d_instantiate(dentry, inode);
@@ -1807,7 +1807,7 @@ static int shmem_rename(struct inode *old_dir, struct dentry *old_dentry, struct
 			drop_nlink(old_dir);
 	} else if (they_are_dirs) {
 		drop_nlink(old_dir);
-		new_dir->i_nlink++;
+		inc_nlink(new_dir);
 	}
 
 	old_dir->i_size -= BOGO_DIRENT_SIZE;
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 700c6e061a04..9a0b41a97f90 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -494,7 +494,7 @@ rpc_get_inode(struct super_block *sb, int mode)
 		case S_IFDIR:
 			inode->i_fop = &simple_dir_operations;
 			inode->i_op = &simple_dir_inode_operations;
-			inode->i_nlink++;
+			inc_nlink(inode);
 		default:
 			break;
 	}
@@ -571,7 +571,7 @@ rpc_populate(struct dentry *parent,
 		if (private)
 			rpc_inode_setowner(inode, private);
 		if (S_ISDIR(mode))
-			dir->i_nlink++;
+			inc_nlink(dir);
 		d_add(dentry, inode);
 	}
 	mutex_unlock(&dir->i_mutex);
@@ -593,7 +593,7 @@ __rpc_mkdir(struct inode *dir, struct dentry *dentry)
 		goto out_err;
 	inode->i_ino = iunique(dir->i_sb, 100);
 	d_instantiate(dentry, inode);
-	dir->i_nlink++;
+	inc_nlink(dir);
 	inode_dir_notify(dir, DN_CREATE);
 	return 0;
 out_err:
diff --git a/security/inode.c b/security/inode.c
index 49ee51529396..9b16e14f3a80 100644
--- a/security/inode.c
+++ b/security/inode.c
@@ -78,7 +78,7 @@ static struct inode *get_inode(struct super_block *sb, int mode, dev_t dev)
 			inode->i_fop = &simple_dir_operations;
 
 			/* directory inodes start off with i_nlink == 2 (for "." entry) */
-			inode->i_nlink++;
+			inc_nlink(inode);
 			break;
 		}
 	}
@@ -111,7 +111,7 @@ static int mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	mode = (mode & (S_IRWXUGO | S_ISVTX)) | S_IFDIR;
 	res = mknod(dir, dentry, mode, 0);
 	if (!res)
-		dir->i_nlink++;
+		inc_nlink(dir);
 	return res;
 }
 
diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c
index bab7b386cb8d..cd244419c980 100644
--- a/security/selinux/selinuxfs.c
+++ b/security/selinux/selinuxfs.c
@@ -1253,10 +1253,10 @@ static int sel_make_dir(struct inode *dir, struct dentry *dentry)
 	inode->i_op = &simple_dir_inode_operations;
 	inode->i_fop = &simple_dir_operations;
 	/* directory inodes start off with i_nlink == 2 (for "." entry) */
-	inode->i_nlink++;
+	inc_nlink(inode);
 	d_add(dentry, inode);
 	/* bump link count on parent directory, too */
-	dir->i_nlink++;
+	inc_nlink(dir);
 out:
 	return ret;
 }
-- 
cgit v1.2.3


From bd4c8ce41a2e2f0c5bf54343ab54e8e09faec021 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sat, 30 Sep 2006 23:29:29 -0700
Subject: [PATCH] invalidate_inode_pages2(): ignore page refcounts

The recent fix to invalidate_inode_pages() (git commit 016eb4a) managed to
unfix invalidate_inode_pages2().

The problem is that various bits of code in the kernel can take transient refs
on pages: the page scanner will do this when inspecting a batch of pages, and
the lru_cache_add() batching pagevecs also hold a ref.

Net result is transient failures in invalidate_inode_pages2().  This affects
NFS directory invalidation (observed) and presumably also block-backed
direct-io (not yet reported).

Fix it by reverting invalidate_inode_pages2() back to the old version which
ignores the page refcounts.

We may come up with something more clever later, but for now we need a 2.6.18
fix for NFS.

Cc: Chuck Lever <cel@citi.umich.edu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/truncate.c | 34 ++++++++++++++++++++++++++++++++--
 1 file changed, 32 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/truncate.c b/mm/truncate.c
index 8fde6580657e..f4edbc179d14 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -287,9 +287,39 @@ unsigned long invalidate_inode_pages(struct address_space *mapping)
 {
 	return invalidate_mapping_pages(mapping, 0, ~0UL);
 }
-
 EXPORT_SYMBOL(invalidate_inode_pages);
 
+/*
+ * This is like invalidate_complete_page(), except it ignores the page's
+ * refcount.  We do this because invalidate_inode_pages2() needs stronger
+ * invalidation guarantees, and cannot afford to leave pages behind because
+ * shrink_list() has a temp ref on them, or because they're transiently sitting
+ * in the lru_cache_add() pagevecs.
+ */
+static int
+invalidate_complete_page2(struct address_space *mapping, struct page *page)
+{
+	if (page->mapping != mapping)
+		return 0;
+
+	if (PagePrivate(page) && !try_to_release_page(page, 0))
+		return 0;
+
+	write_lock_irq(&mapping->tree_lock);
+	if (PageDirty(page))
+		goto failed;
+
+	BUG_ON(PagePrivate(page));
+	__remove_from_page_cache(page);
+	write_unlock_irq(&mapping->tree_lock);
+	ClearPageUptodate(page);
+	page_cache_release(page);	/* pagecache ref */
+	return 1;
+failed:
+	write_unlock_irq(&mapping->tree_lock);
+	return 0;
+}
+
 /**
  * invalidate_inode_pages2_range - remove range of pages from an address_space
  * @mapping: the address_space
@@ -356,7 +386,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
 				}
 			}
 			was_dirty = test_clear_page_dirty(page);
-			if (!invalidate_complete_page(mapping, page)) {
+			if (!invalidate_complete_page2(mapping, page)) {
 				if (was_dirty)
 					set_page_dirty(page);
 				ret = -EIO;
-- 
cgit v1.2.3


From 3dc907951446b9317b1887223caa4e083390de9f Mon Sep 17 00:00:00 2001
From: Zachary Amsden <zach@vmware.com>
Date: Sat, 30 Sep 2006 23:29:30 -0700
Subject: [PATCH] paravirt: remove read hazard from cow

We don't want to read PTEs directly like this after they have been modified,
as a lazy MMU implementation of direct page tables may not have written the
updated PTE back to memory yet.

Signed-off-by: Zachary Amsden <zach@vmware.com>
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/memory.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/memory.c b/mm/memory.c
index 160f5b503ead..7707187484cc 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -467,7 +467,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	 */
 	if (is_cow_mapping(vm_flags)) {
 		ptep_set_wrprotect(src_mm, addr, src_pte);
-		pte = *src_pte;
+		pte = pte_wrprotect(pte);
 	}
 
 	/*
-- 
cgit v1.2.3


From 9888a1cae3f859db38b9604e3df1c02177161bb0 Mon Sep 17 00:00:00 2001
From: Zachary Amsden <zach@vmware.com>
Date: Sat, 30 Sep 2006 23:29:31 -0700
Subject: [PATCH] paravirt: pte clear not present

Change pte_clear_full to a more appropriately named pte_clear_not_present,
allowing optimizations when not-present mapping changes need not be reflected
in the hardware TLB for protected page table modes.  There is also another
case that can use it in the fremap code.

Signed-off-by: Zachary Amsden <zach@vmware.com>
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-generic/pgtable.h | 9 +++++++--
 mm/fremap.c                   | 2 +-
 mm/memory.c                   | 2 +-
 3 files changed, 9 insertions(+), 4 deletions(-)

(limited to 'mm')

diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 349260cd86ed..78740716c9e7 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -112,8 +112,13 @@ do {				  					  \
 })
 #endif
 
-#ifndef __HAVE_ARCH_PTE_CLEAR_FULL
-#define pte_clear_full(__mm, __address, __ptep, __full)			\
+/*
+ * Some architectures may be able to avoid expensive synchronization
+ * primitives when modifications are made to PTE's which are already
+ * not present, or in the process of an address space destruction.
+ */
+#ifndef __HAVE_ARCH_PTE_CLEAR_NOT_PRESENT_FULL
+#define pte_clear_not_present_full(__mm, __address, __ptep, __full)	\
 do {									\
 	pte_clear((__mm), (__address), (__ptep));			\
 } while (0)
diff --git a/mm/fremap.c b/mm/fremap.c
index aa30618ec6b2..7a9d0f5d246d 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -39,7 +39,7 @@ static int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
 	} else {
 		if (!pte_file(pte))
 			free_swap_and_cache(pte_to_swp_entry(pte));
-		pte_clear(mm, addr, ptep);
+		pte_clear_not_present_full(mm, addr, ptep, 0);
 	}
 	return !!page;
 }
diff --git a/mm/memory.c b/mm/memory.c
index 7707187484cc..2e754621d333 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -690,7 +690,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 			continue;
 		if (!pte_file(ptent))
 			free_swap_and_cache(pte_to_swp_entry(ptent));
-		pte_clear_full(mm, addr, pte, tlb->fullmm);
+		pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
 	} while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
 
 	add_mm_rss(mm, file_rss, anon_rss);
-- 
cgit v1.2.3


From 6606c3e0da5360799e07ae24b05080cc85c68e72 Mon Sep 17 00:00:00 2001
From: Zachary Amsden <zach@vmware.com>
Date: Sat, 30 Sep 2006 23:29:33 -0700
Subject: [PATCH] paravirt: lazy mmu mode hooks.patch

Implement lazy MMU update hooks which are SMP safe for both direct and shadow
page tables.  The idea is that PTE updates and page invalidations while in
lazy mode can be batched into a single hypercall.  We use this in VMI for
shadow page table synchronization, and it is a win.  It also can be used by
PPC and for direct page tables on Xen.

For SMP, the enter / leave must happen under protection of the page table
locks for page tables which are being modified.  This is because otherwise,
you end up with stale state in the batched hypercall, which other CPUs can
race ahead of.  Doing this under the protection of the locks guarantees the
synchronization is correct, and also means that spurious faults which are
generated during this window by remote CPUs are properly handled, as the page
fault handler must re-check the PTE under protection of the same lock.

Signed-off-by: Zachary Amsden <zach@vmware.com>
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-generic/pgtable.h | 20 ++++++++++++++++++++
 mm/memory.c                   |  8 ++++++++
 mm/mprotect.c                 |  2 ++
 mm/mremap.c                   |  2 ++
 4 files changed, 32 insertions(+)

(limited to 'mm')

diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 78740716c9e7..56627fa453a6 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -170,6 +170,26 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres
 #define move_pte(pte, prot, old_addr, new_addr)	(pte)
 #endif
 
+/*
+ * A facility to provide lazy MMU batching.  This allows PTE updates and
+ * page invalidations to be delayed until a call to leave lazy MMU mode
+ * is issued.  Some architectures may benefit from doing this, and it is
+ * beneficial for both shadow and direct mode hypervisors, which may batch
+ * the PTE updates which happen during this window.  Note that using this
+ * interface requires that read hazards be removed from the code.  A read
+ * hazard could result in the direct mode hypervisor case, since the actual
+ * write to the page tables may not yet have taken place, so reads though
+ * a raw PTE pointer after it has been modified are not guaranteed to be
+ * up to date.  This mode can only be entered and left under the protection of
+ * the page table locks for all page tables which may be modified.  In the UP
+ * case, this is required so that preemption is disabled, and in the SMP case,
+ * it must synchronize the delayed page table writes properly on other CPUs.
+ */
+#ifndef __HAVE_ARCH_ENTER_LAZY_MMU_MODE
+#define arch_enter_lazy_mmu_mode()	do {} while (0)
+#define arch_leave_lazy_mmu_mode()	do {} while (0)
+#endif
+
 /*
  * When walking page tables, get the address of the next boundary,
  * or the end address of the range if that comes earlier.  Although no
diff --git a/mm/memory.c b/mm/memory.c
index 2e754621d333..9cf3f341a28a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -506,6 +506,7 @@ again:
 	src_pte = pte_offset_map_nested(src_pmd, addr);
 	src_ptl = pte_lockptr(src_mm, src_pmd);
 	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
+	arch_enter_lazy_mmu_mode();
 
 	do {
 		/*
@@ -527,6 +528,7 @@ again:
 		progress += 8;
 	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
 
+	arch_leave_lazy_mmu_mode();
 	spin_unlock(src_ptl);
 	pte_unmap_nested(src_pte - 1);
 	add_mm_rss(dst_mm, rss[0], rss[1]);
@@ -628,6 +630,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 	int anon_rss = 0;
 
 	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+	arch_enter_lazy_mmu_mode();
 	do {
 		pte_t ptent = *pte;
 		if (pte_none(ptent)) {
@@ -694,6 +697,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 	} while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
 
 	add_mm_rss(mm, file_rss, anon_rss);
+	arch_leave_lazy_mmu_mode();
 	pte_unmap_unlock(pte - 1, ptl);
 
 	return addr;
@@ -1109,6 +1113,7 @@ static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
 	pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
 	if (!pte)
 		return -ENOMEM;
+	arch_enter_lazy_mmu_mode();
 	do {
 		struct page *page = ZERO_PAGE(addr);
 		pte_t zero_pte = pte_wrprotect(mk_pte(page, prot));
@@ -1118,6 +1123,7 @@ static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
 		BUG_ON(!pte_none(*pte));
 		set_pte_at(mm, addr, pte, zero_pte);
 	} while (pte++, addr += PAGE_SIZE, addr != end);
+	arch_leave_lazy_mmu_mode();
 	pte_unmap_unlock(pte - 1, ptl);
 	return 0;
 }
@@ -1275,11 +1281,13 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
 	pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
 	if (!pte)
 		return -ENOMEM;
+	arch_enter_lazy_mmu_mode();
 	do {
 		BUG_ON(!pte_none(*pte));
 		set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));
 		pfn++;
 	} while (pte++, addr += PAGE_SIZE, addr != end);
+	arch_leave_lazy_mmu_mode();
 	pte_unmap_unlock(pte - 1, ptl);
 	return 0;
 }
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 955f9d0e38aa..3b8f3c0c63f3 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -34,6 +34,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
 	spinlock_t *ptl;
 
 	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+	arch_enter_lazy_mmu_mode();
 	do {
 		oldpte = *pte;
 		if (pte_present(oldpte)) {
@@ -70,6 +71,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
 		}
 
 	} while (pte++, addr += PAGE_SIZE, addr != end);
+	arch_leave_lazy_mmu_mode();
 	pte_unmap_unlock(pte - 1, ptl);
 }
 
diff --git a/mm/mremap.c b/mm/mremap.c
index 7c15cf3373ad..9c769fa29f32 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -98,6 +98,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 	new_ptl = pte_lockptr(mm, new_pmd);
 	if (new_ptl != old_ptl)
 		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
+	arch_enter_lazy_mmu_mode();
 
 	for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
 				   new_pte++, new_addr += PAGE_SIZE) {
@@ -109,6 +110,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 		set_pte_at(mm, new_addr, new_pte, pte);
 	}
 
+	arch_leave_lazy_mmu_mode();
 	if (new_ptl != old_ptl)
 		spin_unlock(new_ptl);
 	pte_unmap_nested(new_pte - 1);
-- 
cgit v1.2.3