diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2010-10-21 13:47:29 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2010-10-21 13:47:29 -0700 |
commit | c3b86a29429dac1033e3f602f51fa8d00006a8eb (patch) | |
tree | bcedd0a553ca2396eeb58318ef6ee6b426e83652 /arch/x86/mm | |
parent | 8d8d2e9ccd331a1345c88b292ebee9d256fd8749 (diff) | |
parent | 2aeb66d3036dbafc297ac553a257a40283dadb3e (diff) | |
download | talos-obmc-linux-c3b86a29429dac1033e3f602f51fa8d00006a8eb.tar.gz talos-obmc-linux-c3b86a29429dac1033e3f602f51fa8d00006a8eb.zip |
Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
x86-32, percpu: Correct the ordering of the percpu readmostly section
x86, mm: Enable ARCH_DMA_ADDR_T_64BIT with X86_64 || HIGHMEM64G
x86: Spread tlb flush vector between nodes
percpu: Introduce a read-mostly percpu API
x86, mm: Fix incorrect data type in vmalloc_sync_all()
x86, mm: Hold mm->page_table_lock while doing vmalloc_sync
x86, mm: Fix bogus whitespace in sync_global_pgds()
x86-32: Fix sparse warning for the __PHYSICAL_MASK calculation
x86, mm: Add RESERVE_BRK_ARRAY() helper
mm, x86: Saving vmcore with non-lazy freeing of vmas
x86, kdump: Change copy_oldmem_page() to use cached addressing
x86, mm: fix uninitialized addr in kernel_physical_mapping_init()
x86, kmemcheck: Remove double test
x86, mm: Make spurious_fault check explicitly check the PRESENT bit
x86-64, mem: Update all PGDs for direct mapping and vmemmap mapping changes
x86, mm: Separate x86_64 vmalloc_sync_all() into separate functions
x86, mm: Avoid unnecessary TLB flush
Diffstat (limited to 'arch/x86/mm')
-rw-r--r-- | arch/x86/mm/fault.c | 43 | ||||
-rw-r--r-- | arch/x86/mm/init_64.c | 47 | ||||
-rw-r--r-- | arch/x86/mm/kmemcheck/opcode.c | 2 | ||||
-rw-r--r-- | arch/x86/mm/pgtable.c | 20 | ||||
-rw-r--r-- | arch/x86/mm/tlb.c | 48 |
5 files changed, 129 insertions, 31 deletions
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index a24c6cfdccc4..79b0b372d2d0 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -229,7 +229,16 @@ void vmalloc_sync_all(void) spin_lock_irqsave(&pgd_lock, flags); list_for_each_entry(page, &pgd_list, lru) { - if (!vmalloc_sync_one(page_address(page), address)) + spinlock_t *pgt_lock; + pmd_t *ret; + + pgt_lock = &pgd_page_get_mm(page)->page_table_lock; + + spin_lock(pgt_lock); + ret = vmalloc_sync_one(page_address(page), address); + spin_unlock(pgt_lock); + + if (!ret) break; } spin_unlock_irqrestore(&pgd_lock, flags); @@ -328,29 +337,7 @@ out: void vmalloc_sync_all(void) { - unsigned long address; - - for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END; - address += PGDIR_SIZE) { - - const pgd_t *pgd_ref = pgd_offset_k(address); - unsigned long flags; - struct page *page; - - if (pgd_none(*pgd_ref)) - continue; - - spin_lock_irqsave(&pgd_lock, flags); - list_for_each_entry(page, &pgd_list, lru) { - pgd_t *pgd; - pgd = (pgd_t *)page_address(page) + pgd_index(address); - if (pgd_none(*pgd)) - set_pgd(pgd, *pgd_ref); - else - BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); - } - spin_unlock_irqrestore(&pgd_lock, flags); - } + sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END); } /* @@ -898,8 +885,14 @@ spurious_fault(unsigned long error_code, unsigned long address) if (pmd_large(*pmd)) return spurious_fault_check(error_code, (pte_t *) pmd); + /* + * Note: don't use pte_present() here, since it returns true + * if the _PAGE_PROTNONE bit is set. However, this aliases the + * _PAGE_GLOBAL bit, which for kernel pages give false positives + * when CONFIG_DEBUG_PAGEALLOC is used. + */ pte = pte_offset_kernel(pmd, address); - if (!pte_present(*pte)) + if (!(pte_flags(*pte) & _PAGE_PRESENT)) return 0; ret = spurious_fault_check(error_code, pte); diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 7c48ad4faca3..c55f900fbf89 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -98,6 +98,43 @@ static int __init nonx32_setup(char *str) __setup("noexec32=", nonx32_setup); /* + * When memory was added/removed make sure all the processes MM have + * suitable PGD entries in the local PGD level page. + */ +void sync_global_pgds(unsigned long start, unsigned long end) +{ + unsigned long address; + + for (address = start; address <= end; address += PGDIR_SIZE) { + const pgd_t *pgd_ref = pgd_offset_k(address); + unsigned long flags; + struct page *page; + + if (pgd_none(*pgd_ref)) + continue; + + spin_lock_irqsave(&pgd_lock, flags); + list_for_each_entry(page, &pgd_list, lru) { + pgd_t *pgd; + spinlock_t *pgt_lock; + + pgd = (pgd_t *)page_address(page) + pgd_index(address); + pgt_lock = &pgd_page_get_mm(page)->page_table_lock; + spin_lock(pgt_lock); + + if (pgd_none(*pgd)) + set_pgd(pgd, *pgd_ref); + else + BUG_ON(pgd_page_vaddr(*pgd) + != pgd_page_vaddr(*pgd_ref)); + + spin_unlock(pgt_lock); + } + spin_unlock_irqrestore(&pgd_lock, flags); + } +} + +/* * NOTE: This function is marked __ref because it calls __init function * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0. */ @@ -534,11 +571,13 @@ kernel_physical_mapping_init(unsigned long start, unsigned long end, unsigned long page_size_mask) { - + bool pgd_changed = false; unsigned long next, last_map_addr = end; + unsigned long addr; start = (unsigned long)__va(start); end = (unsigned long)__va(end); + addr = start; for (; start < end; start = next) { pgd_t *pgd = pgd_offset_k(start); @@ -563,7 +602,12 @@ kernel_physical_mapping_init(unsigned long start, spin_lock(&init_mm.page_table_lock); pgd_populate(&init_mm, pgd, __va(pud_phys)); spin_unlock(&init_mm.page_table_lock); + pgd_changed = true; } + + if (pgd_changed) + sync_global_pgds(addr, end); + __flush_tlb_all(); return last_map_addr; @@ -1003,6 +1047,7 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node) } } + sync_global_pgds((unsigned long)start_page, end); return 0; } diff --git a/arch/x86/mm/kmemcheck/opcode.c b/arch/x86/mm/kmemcheck/opcode.c index 63c19e27aa6f..324aa3f07237 100644 --- a/arch/x86/mm/kmemcheck/opcode.c +++ b/arch/x86/mm/kmemcheck/opcode.c @@ -9,7 +9,7 @@ static bool opcode_is_prefix(uint8_t b) b == 0xf0 || b == 0xf2 || b == 0xf3 /* Group 2 */ || b == 0x2e || b == 0x36 || b == 0x3e || b == 0x26 - || b == 0x64 || b == 0x65 || b == 0x2e || b == 0x3e + || b == 0x64 || b == 0x65 /* Group 3 */ || b == 0x66 /* Group 4 */ diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 5c4ee422590e..c70e57dbb491 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -87,7 +87,19 @@ static inline void pgd_list_del(pgd_t *pgd) #define UNSHARED_PTRS_PER_PGD \ (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD) -static void pgd_ctor(pgd_t *pgd) + +static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm) +{ + BUILD_BUG_ON(sizeof(virt_to_page(pgd)->index) < sizeof(mm)); + virt_to_page(pgd)->index = (pgoff_t)mm; +} + +struct mm_struct *pgd_page_get_mm(struct page *page) +{ + return (struct mm_struct *)page->index; +} + +static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd) { /* If the pgd points to a shared pagetable level (either the ptes in non-PAE, or shared PMD in PAE), then just copy the @@ -105,8 +117,10 @@ static void pgd_ctor(pgd_t *pgd) } /* list required to sync kernel mapping updates */ - if (!SHARED_KERNEL_PMD) + if (!SHARED_KERNEL_PMD) { + pgd_set_mm(pgd, mm); pgd_list_add(pgd); + } } static void pgd_dtor(pgd_t *pgd) @@ -272,7 +286,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) */ spin_lock_irqsave(&pgd_lock, flags); - pgd_ctor(pgd); + pgd_ctor(mm, pgd); pgd_prepopulate_pmd(mm, pgd, pmds); spin_unlock_irqrestore(&pgd_lock, flags); diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index c03f14ab6667..49358481c733 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -5,6 +5,7 @@ #include <linux/smp.h> #include <linux/interrupt.h> #include <linux/module.h> +#include <linux/cpu.h> #include <asm/tlbflush.h> #include <asm/mmu_context.h> @@ -52,6 +53,8 @@ union smp_flush_state { want false sharing in the per cpu data segment. */ static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS]; +static DEFINE_PER_CPU_READ_MOSTLY(int, tlb_vector_offset); + /* * We cannot call mmdrop() because we are in interrupt context, * instead update mm->cpu_vm_mask. @@ -173,7 +176,7 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask, union smp_flush_state *f; /* Caller has disabled preemption */ - sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS; + sender = this_cpu_read(tlb_vector_offset); f = &flush_state[sender]; /* @@ -218,6 +221,47 @@ void native_flush_tlb_others(const struct cpumask *cpumask, flush_tlb_others_ipi(cpumask, mm, va); } +static void __cpuinit calculate_tlb_offset(void) +{ + int cpu, node, nr_node_vecs; + /* + * we are changing tlb_vector_offset for each CPU in runtime, but this + * will not cause inconsistency, as the write is atomic under X86. we + * might see more lock contentions in a short time, but after all CPU's + * tlb_vector_offset are changed, everything should go normal + * + * Note: if NUM_INVALIDATE_TLB_VECTORS % nr_online_nodes !=0, we might + * waste some vectors. + **/ + if (nr_online_nodes > NUM_INVALIDATE_TLB_VECTORS) + nr_node_vecs = 1; + else + nr_node_vecs = NUM_INVALIDATE_TLB_VECTORS/nr_online_nodes; + + for_each_online_node(node) { + int node_offset = (node % NUM_INVALIDATE_TLB_VECTORS) * + nr_node_vecs; + int cpu_offset = 0; + for_each_cpu(cpu, cpumask_of_node(node)) { + per_cpu(tlb_vector_offset, cpu) = node_offset + + cpu_offset; + cpu_offset++; + cpu_offset = cpu_offset % nr_node_vecs; + } + } +} + +static int tlb_cpuhp_notify(struct notifier_block *n, + unsigned long action, void *hcpu) +{ + switch (action & 0xf) { + case CPU_ONLINE: + case CPU_DEAD: + calculate_tlb_offset(); + } + return NOTIFY_OK; +} + static int __cpuinit init_smp_flush(void) { int i; @@ -225,6 +269,8 @@ static int __cpuinit init_smp_flush(void) for (i = 0; i < ARRAY_SIZE(flush_state); i++) raw_spin_lock_init(&flush_state[i].tlbstate_lock); + calculate_tlb_offset(); + hotcpu_notifier(tlb_cpuhp_notify, 0); return 0; } core_initcall(init_smp_flush); |