summaryrefslogtreecommitdiffstats
path: root/arch/x86/mm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/mm')
-rw-r--r--arch/x86/mm/fault.c64
-rw-r--r--arch/x86/mm/gup.c2
-rw-r--r--arch/x86/mm/init.c41
-rw-r--r--arch/x86/mm/pageattr.c22
4 files changed, 71 insertions, 58 deletions
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index d973e61e450d..38dcec403b46 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -844,11 +844,8 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
unsigned int fault)
{
struct task_struct *tsk = current;
- struct mm_struct *mm = tsk->mm;
int code = BUS_ADRERR;
- up_read(&mm->mmap_sem);
-
/* Kernel mode? Handle exceptions or die: */
if (!(error_code & PF_USER)) {
no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
@@ -879,7 +876,6 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
unsigned long address, unsigned int fault)
{
if (fatal_signal_pending(current) && !(error_code & PF_USER)) {
- up_read(&current->mm->mmap_sem);
no_context(regs, error_code, address, 0, 0);
return;
}
@@ -887,14 +883,11 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
if (fault & VM_FAULT_OOM) {
/* Kernel mode? Handle exceptions or die: */
if (!(error_code & PF_USER)) {
- up_read(&current->mm->mmap_sem);
no_context(regs, error_code, address,
SIGSEGV, SEGV_MAPERR);
return;
}
- up_read(&current->mm->mmap_sem);
-
/*
* We ran out of memory, call the OOM killer, and return the
* userspace (which will retry the fault, or kill us if we got
@@ -1062,7 +1055,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
struct vm_area_struct *vma;
struct task_struct *tsk;
struct mm_struct *mm;
- int fault;
+ int fault, major = 0;
unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
tsk = current;
@@ -1237,47 +1230,50 @@ good_area:
* we get VM_FAULT_RETRY back, the mmap_sem has been unlocked.
*/
fault = handle_mm_fault(mm, vma, address, flags);
+ major |= fault & VM_FAULT_MAJOR;
/*
- * If we need to retry but a fatal signal is pending, handle the
- * signal first. We do not need to release the mmap_sem because it
- * would already be released in __lock_page_or_retry in mm/filemap.c.
+ * If we need to retry the mmap_sem has already been released,
+ * and if there is a fatal signal pending there is no guarantee
+ * that we made any progress. Handle this case first.
*/
- if (unlikely((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)))
+ if (unlikely(fault & VM_FAULT_RETRY)) {
+ /* Retry at most once */
+ if (flags & FAULT_FLAG_ALLOW_RETRY) {
+ flags &= ~FAULT_FLAG_ALLOW_RETRY;
+ flags |= FAULT_FLAG_TRIED;
+ if (!fatal_signal_pending(tsk))
+ goto retry;
+ }
+
+ /* User mode? Just return to handle the fatal exception */
+ if (flags & FAULT_FLAG_USER)
+ return;
+
+ /* Not returning to user mode? Handle exceptions or die: */
+ no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
return;
+ }
+ up_read(&mm->mmap_sem);
if (unlikely(fault & VM_FAULT_ERROR)) {
mm_fault_error(regs, error_code, address, fault);
return;
}
/*
- * Major/minor page fault accounting is only done on the
- * initial attempt. If we go through a retry, it is extremely
- * likely that the page will be found in page cache at that point.
+ * Major/minor page fault accounting. If any of the events
+ * returned VM_FAULT_MAJOR, we account it as a major fault.
*/
- if (flags & FAULT_FLAG_ALLOW_RETRY) {
- if (fault & VM_FAULT_MAJOR) {
- tsk->maj_flt++;
- perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
- regs, address);
- } else {
- tsk->min_flt++;
- perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
- regs, address);
- }
- if (fault & VM_FAULT_RETRY) {
- /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
- * of starvation. */
- flags &= ~FAULT_FLAG_ALLOW_RETRY;
- flags |= FAULT_FLAG_TRIED;
- goto retry;
- }
+ if (major) {
+ tsk->maj_flt++;
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
+ } else {
+ tsk->min_flt++;
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
}
check_v8086_mode(regs, address, tsk);
-
- up_read(&mm->mmap_sem);
}
NOKPROBE_SYMBOL(__do_page_fault);
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 207d9aef662d..d7547824e763 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -15,7 +15,7 @@
static inline pte_t gup_get_pte(pte_t *ptep)
{
#ifndef CONFIG_X86_PAE
- return ACCESS_ONCE(*ptep);
+ return READ_ONCE(*ptep);
#else
/*
* With get_user_pages_fast, we walk down the pagetables without taking
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 82b41d56bb98..08a7d313538a 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -438,20 +438,20 @@ static unsigned long __init init_range_memory_mapping(
static unsigned long __init get_new_step_size(unsigned long step_size)
{
/*
- * Explain why we shift by 5 and why we don't have to worry about
- * 'step_size << 5' overflowing:
- *
- * initial mapped size is PMD_SIZE (2M).
+ * Initial mapped size is PMD_SIZE (2M).
* We can not set step_size to be PUD_SIZE (1G) yet.
* In worse case, when we cross the 1G boundary, and
* PG_LEVEL_2M is not set, we will need 1+1+512 pages (2M + 8k)
- * to map 1G range with PTE. Use 5 as shift for now.
+ * to map 1G range with PTE. Hence we use one less than the
+ * difference of page table level shifts.
*
- * Don't need to worry about overflow, on 32bit, when step_size
- * is 0, round_down() returns 0 for start, and that turns it
- * into 0x100000000ULL.
+ * Don't need to worry about overflow in the top-down case, on 32bit,
+ * when step_size is 0, round_down() returns 0 for start, and that
+ * turns it into 0x100000000ULL.
+ * In the bottom-up case, round_up(x, 0) returns 0 though too, which
+ * needs to be taken into consideration by the code below.
*/
- return step_size << 5;
+ return step_size << (PMD_SHIFT - PAGE_SHIFT - 1);
}
/**
@@ -471,7 +471,6 @@ static void __init memory_map_top_down(unsigned long map_start,
unsigned long step_size;
unsigned long addr;
unsigned long mapped_ram_size = 0;
- unsigned long new_mapped_ram_size;
/* xen has big range in reserved near end of ram, skip it at first.*/
addr = memblock_find_in_range(map_start, map_end, PMD_SIZE, PMD_SIZE);
@@ -496,14 +495,12 @@ static void __init memory_map_top_down(unsigned long map_start,
start = map_start;
} else
start = map_start;
- new_mapped_ram_size = init_range_memory_mapping(start,
+ mapped_ram_size += init_range_memory_mapping(start,
last_start);
last_start = start;
min_pfn_mapped = last_start >> PAGE_SHIFT;
- /* only increase step_size after big range get mapped */
- if (new_mapped_ram_size > mapped_ram_size)
+ if (mapped_ram_size >= step_size)
step_size = get_new_step_size(step_size);
- mapped_ram_size += new_mapped_ram_size;
}
if (real_end < map_end)
@@ -524,7 +521,7 @@ static void __init memory_map_top_down(unsigned long map_start,
static void __init memory_map_bottom_up(unsigned long map_start,
unsigned long map_end)
{
- unsigned long next, new_mapped_ram_size, start;
+ unsigned long next, start;
unsigned long mapped_ram_size = 0;
/* step_size need to be small so pgt_buf from BRK could cover it */
unsigned long step_size = PMD_SIZE;
@@ -539,19 +536,19 @@ static void __init memory_map_bottom_up(unsigned long map_start,
* for page table.
*/
while (start < map_end) {
- if (map_end - start > step_size) {
+ if (step_size && map_end - start > step_size) {
next = round_up(start + 1, step_size);
if (next > map_end)
next = map_end;
- } else
+ } else {
next = map_end;
+ }
- new_mapped_ram_size = init_range_memory_mapping(start, next);
+ mapped_ram_size += init_range_memory_mapping(start, next);
start = next;
- if (new_mapped_ram_size > mapped_ram_size)
+ if (mapped_ram_size >= step_size)
step_size = get_new_step_size(step_size);
- mapped_ram_size += new_mapped_ram_size;
}
}
@@ -703,10 +700,10 @@ void __init zone_sizes_init(void)
memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
#ifdef CONFIG_ZONE_DMA
- max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
+ max_zone_pfns[ZONE_DMA] = min(MAX_DMA_PFN, max_low_pfn);
#endif
#ifdef CONFIG_ZONE_DMA32
- max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
+ max_zone_pfns[ZONE_DMA32] = min(MAX_DMA32_PFN, max_low_pfn);
#endif
max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
#ifdef CONFIG_HIGHMEM
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index a3a5d46605d2..536ea2fb6e33 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -384,6 +384,26 @@ static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address,
}
/*
+ * Lookup the PMD entry for a virtual address. Return a pointer to the entry
+ * or NULL if not present.
+ */
+pmd_t *lookup_pmd_address(unsigned long address)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+
+ pgd = pgd_offset_k(address);
+ if (pgd_none(*pgd))
+ return NULL;
+
+ pud = pud_offset(pgd, address);
+ if (pud_none(*pud) || pud_large(*pud) || !pud_present(*pud))
+ return NULL;
+
+ return pmd_offset(pud, address);
+}
+
+/*
* This is necessary because __pa() does not work on some
* kinds of memory, like vmalloc() or the alloc_remap()
* areas on 32-bit NUMA systems. The percpu areas can
@@ -1817,7 +1837,7 @@ static int __set_pages_np(struct page *page, int numpages)
return __change_page_attr_set_clr(&cpa, 0);
}
-void kernel_map_pages(struct page *page, int numpages, int enable)
+void __kernel_map_pages(struct page *page, int numpages, int enable)
{
if (PageHighMem(page))
return;
OpenPOWER on IntegriCloud