diff options
Diffstat (limited to 'arch/x86/mm')
-rw-r--r-- | arch/x86/mm/fault.c | 47 | ||||
-rw-r--r-- | arch/x86/mm/init_32.c | 4 | ||||
-rw-r--r-- | arch/x86/mm/init_64.c | 49 | ||||
-rw-r--r-- | arch/x86/mm/k8topology_64.c | 8 | ||||
-rw-r--r-- | arch/x86/mm/kmemcheck/kmemcheck.c | 2 | ||||
-rw-r--r-- | arch/x86/mm/kmemcheck/opcode.c | 2 | ||||
-rw-r--r-- | arch/x86/mm/numa_64.c | 2 | ||||
-rw-r--r-- | arch/x86/mm/pgtable.c | 24 | ||||
-rw-r--r-- | arch/x86/mm/srat_64.c | 8 | ||||
-rw-r--r-- | arch/x86/mm/tlb.c | 48 |
10 files changed, 148 insertions, 46 deletions
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 4c4508e8a204..79b0b372d2d0 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -229,7 +229,16 @@ void vmalloc_sync_all(void) spin_lock_irqsave(&pgd_lock, flags); list_for_each_entry(page, &pgd_list, lru) { - if (!vmalloc_sync_one(page_address(page), address)) + spinlock_t *pgt_lock; + pmd_t *ret; + + pgt_lock = &pgd_page_get_mm(page)->page_table_lock; + + spin_lock(pgt_lock); + ret = vmalloc_sync_one(page_address(page), address); + spin_unlock(pgt_lock); + + if (!ret) break; } spin_unlock_irqrestore(&pgd_lock, flags); @@ -251,6 +260,8 @@ static noinline __kprobes int vmalloc_fault(unsigned long address) if (!(address >= VMALLOC_START && address < VMALLOC_END)) return -1; + WARN_ON_ONCE(in_nmi()); + /* * Synchronize this task's top level page-table * with the 'reference' page table. @@ -326,29 +337,7 @@ out: void vmalloc_sync_all(void) { - unsigned long address; - - for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END; - address += PGDIR_SIZE) { - - const pgd_t *pgd_ref = pgd_offset_k(address); - unsigned long flags; - struct page *page; - - if (pgd_none(*pgd_ref)) - continue; - - spin_lock_irqsave(&pgd_lock, flags); - list_for_each_entry(page, &pgd_list, lru) { - pgd_t *pgd; - pgd = (pgd_t *)page_address(page) + pgd_index(address); - if (pgd_none(*pgd)) - set_pgd(pgd, *pgd_ref); - else - BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); - } - spin_unlock_irqrestore(&pgd_lock, flags); - } + sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END); } /* @@ -369,6 +358,8 @@ static noinline __kprobes int vmalloc_fault(unsigned long address) if (!(address >= VMALLOC_START && address < VMALLOC_END)) return -1; + WARN_ON_ONCE(in_nmi()); + /* * Copy kernel mappings over when needed. This can also * happen within a race in page table update. In the later @@ -894,8 +885,14 @@ spurious_fault(unsigned long error_code, unsigned long address) if (pmd_large(*pmd)) return spurious_fault_check(error_code, (pte_t *) pmd); + /* + * Note: don't use pte_present() here, since it returns true + * if the _PAGE_PROTNONE bit is set. However, this aliases the + * _PAGE_GLOBAL bit, which for kernel pages give false positives + * when CONFIG_DEBUG_PAGEALLOC is used. + */ pte = pte_offset_kernel(pmd, address); - if (!pte_present(*pte)) + if (!(pte_flags(*pte) & _PAGE_PRESENT)) return 0; ret = spurious_fault_check(error_code, pte); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index bca79091b9d6..558f2d332076 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -67,7 +67,7 @@ static __init void *alloc_low_page(void) panic("alloc_low_page: ran out of memory"); adr = __va(pfn * PAGE_SIZE); - memset(adr, 0, PAGE_SIZE); + clear_page(adr); return adr; } @@ -558,7 +558,7 @@ char swsusp_pg_dir[PAGE_SIZE] static inline void save_pg_dir(void) { - memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE); + copy_page(swsusp_pg_dir, swapper_pg_dir); } #else /* !CONFIG_ACPI_SLEEP */ static inline void save_pg_dir(void) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 9a6674689a20..c55f900fbf89 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -98,6 +98,43 @@ static int __init nonx32_setup(char *str) __setup("noexec32=", nonx32_setup); /* + * When memory was added/removed make sure all the processes MM have + * suitable PGD entries in the local PGD level page. + */ +void sync_global_pgds(unsigned long start, unsigned long end) +{ + unsigned long address; + + for (address = start; address <= end; address += PGDIR_SIZE) { + const pgd_t *pgd_ref = pgd_offset_k(address); + unsigned long flags; + struct page *page; + + if (pgd_none(*pgd_ref)) + continue; + + spin_lock_irqsave(&pgd_lock, flags); + list_for_each_entry(page, &pgd_list, lru) { + pgd_t *pgd; + spinlock_t *pgt_lock; + + pgd = (pgd_t *)page_address(page) + pgd_index(address); + pgt_lock = &pgd_page_get_mm(page)->page_table_lock; + spin_lock(pgt_lock); + + if (pgd_none(*pgd)) + set_pgd(pgd, *pgd_ref); + else + BUG_ON(pgd_page_vaddr(*pgd) + != pgd_page_vaddr(*pgd_ref)); + + spin_unlock(pgt_lock); + } + spin_unlock_irqrestore(&pgd_lock, flags); + } +} + +/* * NOTE: This function is marked __ref because it calls __init function * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0. */ @@ -293,7 +330,7 @@ static __ref void *alloc_low_page(unsigned long *phys) panic("alloc_low_page: ran out of memory"); adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE); - memset(adr, 0, PAGE_SIZE); + clear_page(adr); *phys = pfn * PAGE_SIZE; return adr; } @@ -534,11 +571,13 @@ kernel_physical_mapping_init(unsigned long start, unsigned long end, unsigned long page_size_mask) { - + bool pgd_changed = false; unsigned long next, last_map_addr = end; + unsigned long addr; start = (unsigned long)__va(start); end = (unsigned long)__va(end); + addr = start; for (; start < end; start = next) { pgd_t *pgd = pgd_offset_k(start); @@ -563,7 +602,12 @@ kernel_physical_mapping_init(unsigned long start, spin_lock(&init_mm.page_table_lock); pgd_populate(&init_mm, pgd, __va(pud_phys)); spin_unlock(&init_mm.page_table_lock); + pgd_changed = true; } + + if (pgd_changed) + sync_global_pgds(addr, end); + __flush_tlb_all(); return last_map_addr; @@ -1003,6 +1047,7 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node) } } + sync_global_pgds((unsigned long)start_page, end); return 0; } diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c index 970ed579d4e4..52d54bfc1ebb 100644 --- a/arch/x86/mm/k8topology_64.c +++ b/arch/x86/mm/k8topology_64.c @@ -22,7 +22,7 @@ #include <asm/numa.h> #include <asm/mpspec.h> #include <asm/apic.h> -#include <asm/k8.h> +#include <asm/amd_nb.h> static struct bootnode __initdata nodes[8]; static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE; @@ -54,8 +54,8 @@ static __init int find_northbridge(void) static __init void early_get_boot_cpu_id(void) { /* - * need to get boot_cpu_id so can use that to create apicid_to_node - * in k8_scan_nodes() + * need to get the APIC ID of the BSP so can use that to + * create apicid_to_node in k8_scan_nodes() */ #ifdef CONFIG_X86_MPPARSE /* @@ -212,7 +212,7 @@ int __init k8_scan_nodes(void) bits = boot_cpu_data.x86_coreid_bits; cores = (1<<bits); apicid_base = 0; - /* need to get boot_cpu_id early for system with apicid lifting */ + /* get the APIC ID of the BSP early for systems with apicid lifting */ early_get_boot_cpu_id(); if (boot_cpu_physical_apicid > 0) { pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid); diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c index b3b531a4f8e5..d87dd6d042d6 100644 --- a/arch/x86/mm/kmemcheck/kmemcheck.c +++ b/arch/x86/mm/kmemcheck/kmemcheck.c @@ -631,6 +631,8 @@ bool kmemcheck_fault(struct pt_regs *regs, unsigned long address, if (!pte) return false; + WARN_ON_ONCE(in_nmi()); + if (error_code & 2) kmemcheck_access(regs, address, KMEMCHECK_WRITE); else diff --git a/arch/x86/mm/kmemcheck/opcode.c b/arch/x86/mm/kmemcheck/opcode.c index 63c19e27aa6f..324aa3f07237 100644 --- a/arch/x86/mm/kmemcheck/opcode.c +++ b/arch/x86/mm/kmemcheck/opcode.c @@ -9,7 +9,7 @@ static bool opcode_is_prefix(uint8_t b) b == 0xf0 || b == 0xf2 || b == 0xf3 /* Group 2 */ || b == 0x2e || b == 0x36 || b == 0x3e || b == 0x26 - || b == 0x64 || b == 0x65 || b == 0x2e || b == 0x3e + || b == 0x64 || b == 0x65 /* Group 3 */ || b == 0x66 /* Group 4 */ diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index a7bcc23ef96c..4962f1aeda6f 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -18,7 +18,7 @@ #include <asm/dma.h> #include <asm/numa.h> #include <asm/acpi.h> -#include <asm/k8.h> +#include <asm/amd_nb.h> struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; EXPORT_SYMBOL(node_data); diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 5c4ee422590e..8be8c7d7bc89 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -87,7 +87,19 @@ static inline void pgd_list_del(pgd_t *pgd) #define UNSHARED_PTRS_PER_PGD \ (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD) -static void pgd_ctor(pgd_t *pgd) + +static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm) +{ + BUILD_BUG_ON(sizeof(virt_to_page(pgd)->index) < sizeof(mm)); + virt_to_page(pgd)->index = (pgoff_t)mm; +} + +struct mm_struct *pgd_page_get_mm(struct page *page) +{ + return (struct mm_struct *)page->index; +} + +static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd) { /* If the pgd points to a shared pagetable level (either the ptes in non-PAE, or shared PMD in PAE), then just copy the @@ -98,15 +110,13 @@ static void pgd_ctor(pgd_t *pgd) clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY, swapper_pg_dir + KERNEL_PGD_BOUNDARY, KERNEL_PGD_PTRS); - paravirt_alloc_pmd_clone(__pa(pgd) >> PAGE_SHIFT, - __pa(swapper_pg_dir) >> PAGE_SHIFT, - KERNEL_PGD_BOUNDARY, - KERNEL_PGD_PTRS); } /* list required to sync kernel mapping updates */ - if (!SHARED_KERNEL_PMD) + if (!SHARED_KERNEL_PMD) { + pgd_set_mm(pgd, mm); pgd_list_add(pgd); + } } static void pgd_dtor(pgd_t *pgd) @@ -272,7 +282,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) */ spin_lock_irqsave(&pgd_lock, flags); - pgd_ctor(pgd); + pgd_ctor(mm, pgd); pgd_prepopulate_pmd(mm, pgd, pmds); spin_unlock_irqrestore(&pgd_lock, flags); diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index f9897f7a9ef1..9c0d0d399c30 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -420,9 +420,11 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) return -1; } - for_each_node_mask(i, nodes_parsed) - e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, - nodes[i].end >> PAGE_SHIFT); + for (i = 0; i < num_node_memblks; i++) + e820_register_active_regions(memblk_nodeid[i], + node_memblk_range[i].start >> PAGE_SHIFT, + node_memblk_range[i].end >> PAGE_SHIFT); + /* for out of order entries in SRAT */ sort_node_map(); if (!nodes_cover_memory(nodes)) { diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index c03f14ab6667..49358481c733 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -5,6 +5,7 @@ #include <linux/smp.h> #include <linux/interrupt.h> #include <linux/module.h> +#include <linux/cpu.h> #include <asm/tlbflush.h> #include <asm/mmu_context.h> @@ -52,6 +53,8 @@ union smp_flush_state { want false sharing in the per cpu data segment. */ static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS]; +static DEFINE_PER_CPU_READ_MOSTLY(int, tlb_vector_offset); + /* * We cannot call mmdrop() because we are in interrupt context, * instead update mm->cpu_vm_mask. @@ -173,7 +176,7 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask, union smp_flush_state *f; /* Caller has disabled preemption */ - sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS; + sender = this_cpu_read(tlb_vector_offset); f = &flush_state[sender]; /* @@ -218,6 +221,47 @@ void native_flush_tlb_others(const struct cpumask *cpumask, flush_tlb_others_ipi(cpumask, mm, va); } +static void __cpuinit calculate_tlb_offset(void) +{ + int cpu, node, nr_node_vecs; + /* + * we are changing tlb_vector_offset for each CPU in runtime, but this + * will not cause inconsistency, as the write is atomic under X86. we + * might see more lock contentions in a short time, but after all CPU's + * tlb_vector_offset are changed, everything should go normal + * + * Note: if NUM_INVALIDATE_TLB_VECTORS % nr_online_nodes !=0, we might + * waste some vectors. + **/ + if (nr_online_nodes > NUM_INVALIDATE_TLB_VECTORS) + nr_node_vecs = 1; + else + nr_node_vecs = NUM_INVALIDATE_TLB_VECTORS/nr_online_nodes; + + for_each_online_node(node) { + int node_offset = (node % NUM_INVALIDATE_TLB_VECTORS) * + nr_node_vecs; + int cpu_offset = 0; + for_each_cpu(cpu, cpumask_of_node(node)) { + per_cpu(tlb_vector_offset, cpu) = node_offset + + cpu_offset; + cpu_offset++; + cpu_offset = cpu_offset % nr_node_vecs; + } + } +} + +static int tlb_cpuhp_notify(struct notifier_block *n, + unsigned long action, void *hcpu) +{ + switch (action & 0xf) { + case CPU_ONLINE: + case CPU_DEAD: + calculate_tlb_offset(); + } + return NOTIFY_OK; +} + static int __cpuinit init_smp_flush(void) { int i; @@ -225,6 +269,8 @@ static int __cpuinit init_smp_flush(void) for (i = 0; i < ARRAY_SIZE(flush_state); i++) raw_spin_lock_init(&flush_state[i].tlbstate_lock); + calculate_tlb_offset(); + hotcpu_notifier(tlb_cpuhp_notify, 0); return 0; } core_initcall(init_smp_flush); |