diff options
Diffstat (limited to 'freed-ora')
-rw-r--r-- | freed-ora/current/f15/hugetlb-fix-resv_map-leak-in-error-path.patch | 176 | ||||
-rw-r--r-- | freed-ora/current/f15/mm-pmd_read_atomic-fix-32bit-PAE-pmd-walk-vs-pmd_populate-SMP-race-condition.patch | 272 |
2 files changed, 448 insertions, 0 deletions
diff --git a/freed-ora/current/f15/hugetlb-fix-resv_map-leak-in-error-path.patch b/freed-ora/current/f15/hugetlb-fix-resv_map-leak-in-error-path.patch new file mode 100644 index 000000000..888d5ce70 --- /dev/null +++ b/freed-ora/current/f15/hugetlb-fix-resv_map-leak-in-error-path.patch @@ -0,0 +1,176 @@ +From c50ac050811d6485616a193eb0f37bfbd191cc89 Mon Sep 17 00:00:00 2001 +From: Dave Hansen <dave@linux.vnet.ibm.com> +Date: Tue, 29 May 2012 15:06:46 -0700 +Subject: [PATCH] hugetlb: fix resv_map leak in error path + +When called for anonymous (non-shared) mappings, hugetlb_reserve_pages() +does a resv_map_alloc(). It depends on code in hugetlbfs's +vm_ops->close() to release that allocation. + +However, in the mmap() failure path, we do a plain unmap_region() without +the remove_vma() which actually calls vm_ops->close(). + +This is a decent fix. This leak could get reintroduced if new code (say, +after hugetlb_reserve_pages() in hugetlbfs_file_mmap()) decides to return +an error. But, I think it would have to unroll the reservation anyway. + +Christoph's test case: + + http://marc.info/?l=linux-mm&m=133728900729735 + +This patch applies to 3.4 and later. A version for earlier kernels is at +https://lkml.org/lkml/2012/5/22/418. 
+ +Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com> +Acked-by: Mel Gorman <mel@csn.ul.ie> +Acked-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> +Reported-by: Christoph Lameter <cl@linux.com> +Tested-by: Christoph Lameter <cl@linux.com> +Cc: Andrea Arcangeli <aarcange@redhat.com> +Cc: <stable@vger.kernel.org> [2.6.32+] +Signed-off-by: Andrew Morton <akpm@linux-foundation.org> +Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> +--- + mm/hugetlb.c | 28 ++++++++++++++++++++++------ + 1 files changed, 22 insertions(+), 6 deletions(-) + +diff --git a/mm/hugetlb.c b/mm/hugetlb.c +index 41a647d..285a81e 100644 +--- a/mm/hugetlb.c ++++ b/mm/hugetlb.c +@@ -2157,6 +2157,15 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma) + kref_get(&reservations->refs); + } + ++static void resv_map_put(struct vm_area_struct *vma) ++{ ++ struct resv_map *reservations = vma_resv_map(vma); ++ ++ if (!reservations) ++ return; ++ kref_put(&reservations->refs, resv_map_release); ++} ++ + static void hugetlb_vm_op_close(struct vm_area_struct *vma) + { + struct hstate *h = hstate_vma(vma); +@@ -2173,7 +2182,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) + reserve = (end - start) - + region_count(&reservations->regions, start, end); + +- kref_put(&reservations->refs, resv_map_release); ++ resv_map_put(vma); + + if (reserve) { + hugetlb_acct_memory(h, -reserve); +@@ -2991,12 +3000,16 @@ int hugetlb_reserve_pages(struct inode *inode, + set_vma_resv_flags(vma, HPAGE_RESV_OWNER); + } + +- if (chg < 0) +- return chg; ++ if (chg < 0) { ++ ret = chg; ++ goto out_err; ++ } + + /* There must be enough pages in the subpool for the mapping */ +- if (hugepage_subpool_get_pages(spool, chg)) +- return -ENOSPC; ++ if (hugepage_subpool_get_pages(spool, chg)) { ++ ret = -ENOSPC; ++ goto out_err; ++ } + + /* + * Check enough hugepages are available for the reservation. 
+@@ -3005,7 +3018,7 @@ int hugetlb_reserve_pages(struct inode *inode, + ret = hugetlb_acct_memory(h, chg); + if (ret < 0) { + hugepage_subpool_put_pages(spool, chg); +- return ret; ++ goto out_err; + } + + /* +@@ -3022,6 +3035,9 @@ int hugetlb_reserve_pages(struct inode *inode, + if (!vma || vma->vm_flags & VM_MAYSHARE) + region_add(&inode->i_mapping->private_list, from, to); + return 0; ++out_err: ++ resv_map_put(vma); ++ return ret; + } + + void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) +-- +1.7.7.6 + +From 4523e1458566a0e8ecfaff90f380dd23acc44d27 Mon Sep 17 00:00:00 2001 +From: Dave Hansen <dave@linux.vnet.ibm.com> +Date: Wed, 30 May 2012 07:51:07 -0700 +Subject: [PATCH] mm: fix vma_resv_map() NULL pointer + +hugetlb_reserve_pages() can be used for either normal file-backed +hugetlbfs mappings, or MAP_HUGETLB. In the MAP_HUGETLB, semi-anonymous +mode, there is not a VMA around. The new call to resv_map_put() assumed +that there was, and resulted in a NULL pointer dereference: + + BUG: unable to handle kernel NULL pointer dereference at 0000000000000030 + IP: vma_resv_map+0x9/0x30 + PGD 141453067 PUD 1421e1067 PMD 0 + Oops: 0000 [#1] PREEMPT SMP + ... + Pid: 14006, comm: trinity-child6 Not tainted 3.4.0+ #36 + RIP: vma_resv_map+0x9/0x30 + ... + Process trinity-child6 (pid: 14006, threadinfo ffff8801414e0000, task ffff8801414f26b0) + Call Trace: + resv_map_put+0xe/0x40 + hugetlb_reserve_pages+0xa6/0x1d0 + hugetlb_file_setup+0x102/0x2c0 + newseg+0x115/0x360 + ipcget+0x1ce/0x310 + sys_shmget+0x5a/0x60 + system_call_fastpath+0x16/0x1b + +This was reported by Dave Jones, but was reproducible with the +libhugetlbfs test cases, so shame on me for not running them in the +first place. + +With this, the oops is gone, and the output of libhugetlbfs's +run_tests.py is identical to plain 3.4 again. 
+ +[ Marked for stable, since this was introduced by commit c50ac050811d + ("hugetlb: fix resv_map leak in error path") which was also marked for + stable ] + +Reported-by: Dave Jones <davej@redhat.com> +Cc: Mel Gorman <mel@csn.ul.ie> +Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> +Cc: Christoph Lameter <cl@linux.com> +Cc: Andrea Arcangeli <aarcange@redhat.com> +Cc: Andrew Morton <akpm@linux-foundation.org> +Cc: <stable@vger.kernel.org> [2.6.32+] +Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> +--- + mm/hugetlb.c | 3 ++- + 1 files changed, 2 insertions(+), 1 deletions(-) + +diff --git a/mm/hugetlb.c b/mm/hugetlb.c +index 285a81e..e198831 100644 +--- a/mm/hugetlb.c ++++ b/mm/hugetlb.c +@@ -3036,7 +3036,8 @@ int hugetlb_reserve_pages(struct inode *inode, + region_add(&inode->i_mapping->private_list, from, to); + return 0; + out_err: +- resv_map_put(vma); ++ if (vma) ++ resv_map_put(vma); + return ret; + } + +-- +1.7.7.6 + diff --git a/freed-ora/current/f15/mm-pmd_read_atomic-fix-32bit-PAE-pmd-walk-vs-pmd_populate-SMP-race-condition.patch b/freed-ora/current/f15/mm-pmd_read_atomic-fix-32bit-PAE-pmd-walk-vs-pmd_populate-SMP-race-condition.patch new file mode 100644 index 000000000..49ff98a62 --- /dev/null +++ b/freed-ora/current/f15/mm-pmd_read_atomic-fix-32bit-PAE-pmd-walk-vs-pmd_populate-SMP-race-condition.patch @@ -0,0 +1,272 @@ +Path: news.gmane.org!not-for-mail +From: Andrea Arcangeli <aarcange@redhat.com> +Newsgroups: gmane.linux.kernel.mm +Subject: [PATCH] mm: pmd_read_atomic: fix 32bit PAE pmd walk vs pmd_populate SMP race condition +Date: Thu, 24 May 2012 01:39:01 +0200 +Lines: 208 +Approved: news@gmane.org +Message-ID: <1337816341-30743-1-git-send-email-aarcange@redhat.com> +References: <20120518230028.GF32479@redhat.com> +NNTP-Posting-Host: plane.gmane.org +X-Trace: dough.gmane.org 1337816354 18906 80.91.229.3 (23 May 2012 23:39:14 GMT) +X-Complaints-To: usenet@dough.gmane.org +NNTP-Posting-Date: Wed, 23 May 2012 23:39:14 +0000 (UTC) 
+Cc: Andrew Morton <akpm@linux-foundation.org>, Mel Gorman <mgorman@suse.de>, + Hugh Dickins <hughd@google.com>, Larry Woodman <lwoodman@redhat.com>, + Petr Matousek <pmatouse@redhat.com>, + Ulrich Obergfell <uobergfe@redhat.com>, Rik van Riel <riel@redhat.com> +To: linux-mm@kvack.org +Original-X-From: owner-linux-mm@kvack.org Thu May 24 01:39:12 2012 +Return-path: <owner-linux-mm@kvack.org> +Envelope-to: glkm-linux-mm-2@m.gmane.org +Original-Received: from kanga.kvack.org ([205.233.56.17]) + by plane.gmane.org with esmtp (Exim 4.69) + (envelope-from <owner-linux-mm@kvack.org>) + id 1SXL94-0002ub-3P + for glkm-linux-mm-2@m.gmane.org; Thu, 24 May 2012 01:39:10 +0200 +Original-Received: by kanga.kvack.org (Postfix) + id 1684A6B0083; Wed, 23 May 2012 19:39:09 -0400 (EDT) +Delivered-To: linux-mm-outgoing@kvack.org +Original-Received: by kanga.kvack.org (Postfix, from userid 40) + id 080DD6B0092; Wed, 23 May 2012 19:39:08 -0400 (EDT) +X-Original-To: int-list-linux-mm@kvack.org +Delivered-To: int-list-linux-mm@kvack.org +Original-Received: by kanga.kvack.org (Postfix, from userid 63042) + id C84046B00E7; Wed, 23 May 2012 19:39:08 -0400 (EDT) +X-Original-To: linux-mm@kvack.org +Delivered-To: linux-mm@kvack.org +Original-Received: from psmtp.com (na3sys010amx119.postini.com [74.125.245.119]) + by kanga.kvack.org (Postfix) with SMTP id 0B2DC6B0083 + for <linux-mm@kvack.org>; Wed, 23 May 2012 19:39:07 -0400 (EDT) +Original-Received: from mx1.redhat.com ([209.132.183.28]) (using TLSv1) by na3sys010amx119.postini.com ([74.125.244.10]) with SMTP; + Wed, 23 May 2012 18:39:08 CDT +Original-Received: from int-mx12.intmail.prod.int.phx2.redhat.com (int-mx12.intmail.prod.int.phx2.redhat.com [10.5.11.25]) + by mx1.redhat.com (8.14.4/8.14.4) with ESMTP id q4NNd3dP002492 + (version=TLSv1/SSLv3 cipher=DHE-RSA-AES256-SHA bits=256 verify=OK); + Wed, 23 May 2012 19:39:03 -0400 +Original-Received: from random.random (ovpn-113-72.phx2.redhat.com [10.3.113.72]) + by 
int-mx12.intmail.prod.int.phx2.redhat.com (8.14.4/8.14.4) with ESMTP id q4NNd1P7012233; + Wed, 23 May 2012 19:39:02 -0400 +In-Reply-To: <20120518230028.GF32479@redhat.com> +X-Scanned-By: MIMEDefang 2.68 on 10.5.11.25 +X-pstn-neptune: 0/0/0.00/0 +X-pstn-levels: (S:99.90000/99.90000 CV:99.9000 FC:95.5390 LC:95.5390 R:95.9108 P:95.9108 M:97.0282 C:98.6951 ) +X-pstn-dkim: 0 skipped:not-enabled +X-pstn-settings: 3 (1.0000:1.0000) s cv gt3 gt2 gt1 r p m c +X-pstn-addresses: from <aarcange@redhat.com> [db-null] +X-Bogosity: Ham, tests=bogofilter, spamicity=0.000000, version=1.2.2 +Original-Sender: owner-linux-mm@kvack.org +Precedence: bulk +X-Loop: owner-majordomo@kvack.org +List-ID: <linux-mm.kvack.org> +Xref: news.gmane.org gmane.linux.kernel.mm:78936 +Archived-At: <http://permalink.gmane.org/gmane.linux.kernel.mm/78936> + +When holding the mmap_sem for reading, pmd_offset_map_lock should only +run on a pmd_t that has been read atomically from the pmdp +pointer, otherwise we may read only half of it leading to this crash. 
+ +PID: 11679 TASK: f06e8000 CPU: 3 COMMAND: "do_race_2_panic" + #0 [f06a9dd8] crash_kexec at c049b5ec + #1 [f06a9e2c] oops_end at c083d1c2 + #2 [f06a9e40] no_context at c0433ded + #3 [f06a9e64] bad_area_nosemaphore at c043401a + #4 [f06a9e6c] __do_page_fault at c0434493 + #5 [f06a9eec] do_page_fault at c083eb45 + #6 [f06a9f04] error_code (via page_fault) at c083c5d5 + EAX: 01fb470c EBX: fff35000 ECX: 00000003 EDX: 00000100 EBP: + 00000000 + DS: 007b ESI: 9e201000 ES: 007b EDI: 01fb4700 GS: 00e0 + CS: 0060 EIP: c083bc14 ERR: ffffffff EFLAGS: 00010246 + #7 [f06a9f38] _spin_lock at c083bc14 + #8 [f06a9f44] sys_mincore at c0507b7d + #9 [f06a9fb0] system_call at c083becd + start len + EAX: ffffffda EBX: 9e200000 ECX: 00001000 EDX: 6228537f + DS: 007b ESI: 00000000 ES: 007b EDI: 003d0f00 + SS: 007b ESP: 62285354 EBP: 62285388 GS: 0033 + CS: 0073 EIP: 00291416 ERR: 000000da EFLAGS: 00000286 + +This should be a longstanding bug affecting x86 32bit PAE without +THP. Only archs with 64bit large pmd_t and 32bit unsigned long should +be affected. + +With THP enabled the barrier() in +pmd_none_or_trans_huge_or_clear_bad() would partly hide the bug when +the pmd transitions from none to stable, by forcing a re-read of the +*pmd in pmd_offset_map_lock, but when THP is enabled a new set of +problems arises from the fact that the pmd could then transition freely +between any of the none, pmd_trans_huge or pmd_trans_stable states. So +making the barrier in pmd_none_or_trans_huge_or_clear_bad() +unconditional isn't a good idea and it would be a flaky solution. + +This should be fully fixed by introducing a pmd_read_atomic that reads +the pmd in order with THP disabled, or by reading the pmd atomically +with cmpxchg8b with THP enabled. + +Luckily this new race condition only triggers in the places that must +already be covered by pmd_none_or_trans_huge_or_clear_bad() so the fix +is localized there but this bug is not related to THP. 
+ +NOTE: this can trigger on x86 32bit systems with PAE enabled with more +than 4G of ram, otherwise the high part of the pmd will never risk +being truncated because it would be zero at all times, thereby hiding +the SMP race. + +This bug was discovered and fully debugged by Ulrich, quote: + +---- +[..] +pmd_none_or_trans_huge_or_clear_bad() loads the content of edx and +eax. + + 496 static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t + *pmd) + 497 { + 498 /* depend on compiler for an atomic pmd read */ + 499 pmd_t pmdval = *pmd; + + // edi = pmd pointer +0xc0507a74 <sys_mincore+548>: mov 0x8(%esp),%edi +... + // edx = PTE page table high address +0xc0507a84 <sys_mincore+564>: mov 0x4(%edi),%edx +... + // eax = PTE page table low address +0xc0507a8e <sys_mincore+574>: mov (%edi),%eax + +[..] + +Please note that the PMD is not read atomically. These are two "mov" +instructions where the high order bits of the PMD entry are fetched +first. Hence, the above machine code is prone to the following race. + +- The PMD entry {high|low} is 0x0000000000000000. + The "mov" at 0xc0507a84 loads 0x00000000 into edx. + +- A page fault (on another CPU) sneaks in between the two "mov" + instructions and instantiates the PMD. + +- The PMD entry {high|low} is now 0x00000003fda38067. + The "mov" at 0xc0507a8e loads 0xfda38067 into eax. 
+---- + +Reported-by: Ulrich Obergfell <uobergfe@redhat.com> +Signed-off-by: Andrea Arcangeli <aarcange@redhat.com> +--- + arch/x86/include/asm/pgtable-3level.h | 50 +++++++++++++++++++++++++++++++++ + include/asm-generic/pgtable.h | 22 +++++++++++++- + 2 files changed, 70 insertions(+), 2 deletions(-) + +diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h +index effff47..43876f1 100644 +--- a/arch/x86/include/asm/pgtable-3level.h ++++ b/arch/x86/include/asm/pgtable-3level.h +@@ -31,6 +31,56 @@ static inline void native_set_pte(pte_t *ptep, pte_t pte) + ptep->pte_low = pte.pte_low; + } + ++#define pmd_read_atomic pmd_read_atomic ++/* ++ * pte_offset_map_lock on 32bit PAE kernels was reading the pmd_t with ++ * a "*pmdp" dereference done by gcc. Problem is, in certain places ++ * where pte_offset_map_lock is called, concurrent page faults are ++ * allowed, if the mmap_sem is hold for reading. An example is mincore ++ * vs page faults vs MADV_DONTNEED. On the page fault side ++ * pmd_populate rightfully does a set_64bit, but if we're reading the ++ * pmd_t with a "*pmdp" on the mincore side, a SMP race can happen ++ * because gcc will not read the 64bit of the pmd atomically. To fix ++ * this all places running pmd_offset_map_lock() while holding the ++ * mmap_sem in read mode, shall read the pmdp pointer using this ++ * function to know if the pmd is null nor not, and in turn to know if ++ * they can run pmd_offset_map_lock or pmd_trans_huge or other pmd ++ * operations. ++ * ++ * Without THP if the mmap_sem is hold for reading, the ++ * pmd can only transition from null to not null while pmd_read_atomic runs. ++ * So there's no need of literally reading it atomically. ++ * ++ * With THP if the mmap_sem is hold for reading, the pmd can become ++ * THP or null or point to a pte (and in turn become "stable") at any ++ * time under pmd_read_atomic, so it's mandatory to read it atomically ++ * with cmpxchg8b. 
++ */ ++#ifndef CONFIG_TRANSPARENT_HUGEPAGE ++static inline pmd_t pmd_read_atomic(pmd_t *pmdp) ++{ ++ pmdval_t ret; ++ u32 *tmp = (u32 *)pmdp; ++ ++ ret = (pmdval_t) (*tmp); ++ if (ret) { ++ /* ++ * If the low part is null, we must not read the high part ++ * or we can end up with a partial pmd. ++ */ ++ smp_rmb(); ++ ret |= ((pmdval_t)*(tmp + 1)) << 32; ++ } ++ ++ return (pmd_t) { ret }; ++} ++#else /* CONFIG_TRANSPARENT_HUGEPAGE */ ++static inline pmd_t pmd_read_atomic(pmd_t *pmdp) ++{ ++ return (pmd_t) { atomic64_read((atomic64_t *)pmdp) }; ++} ++#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ ++ + static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte) + { + set_64bit((unsigned long long *)(ptep), native_pte_val(pte)); +diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h +index 125c54e..fa596d9 100644 +--- a/include/asm-generic/pgtable.h ++++ b/include/asm-generic/pgtable.h +@@ -446,6 +446,18 @@ static inline int pmd_write(pmd_t pmd) + #endif /* __HAVE_ARCH_PMD_WRITE */ + #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + ++#ifndef pmd_read_atomic ++static inline pmd_t pmd_read_atomic(pmd_t *pmdp) ++{ ++ /* ++ * Depend on compiler for an atomic pmd read. NOTE: this is ++ * only going to work, if the pmdval_t isn't larger than ++ * an unsigned long. ++ */ ++ return *pmdp; ++} ++#endif ++ + /* + * This function is meant to be used by sites walking pagetables with + * the mmap_sem hold in read mode to protect against MADV_DONTNEED and +@@ -459,11 +471,17 @@ static inline int pmd_write(pmd_t pmd) + * undefined so behaving like if the pmd was none is safe (because it + * can return none anyway). The compiler level barrier() is critically + * important to compute the two checks atomically on the same pmdval. 
++ * ++ * For 32bit kernels with a 64bit large pmd_t this automatically takes ++ * care of reading the pmd atomically to avoid SMP race conditions ++ * against pmd_populate() when the mmap_sem is hold for reading by the ++ * caller (a special atomic read not done by "gcc" as in the generic ++ * version above, is also needed when THP is disabled because the page ++ * fault can populate the pmd from under us). + */ + static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd) + { +- /* depend on compiler for an atomic pmd read */ +- pmd_t pmdval = *pmd; ++ pmd_t pmdval = pmd_read_atomic(pmd); + /* + * The barrier will stabilize the pmdval in a register or on + * the stack so that it will stop changing under the code. + +-- +To unsubscribe, send a message with 'unsubscribe linux-mm' in +the body to majordomo@kvack.org. For more info on Linux MM, +see: http://www.linux-mm.org/ . +Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/ +Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> + |