diff options
Diffstat (limited to 'arch/x86/mm')
-rw-r--r-- | arch/x86/mm/cpu_entry_area.c | 33 | ||||
-rw-r--r-- | arch/x86/mm/init.c | 21 | ||||
-rw-r--r-- | arch/x86/mm/mmap.c | 2 | ||||
-rw-r--r-- | arch/x86/mm/pat.c | 16 | ||||
-rw-r--r-- | arch/x86/mm/pgtable.c | 8 | ||||
-rw-r--r-- | arch/x86/mm/tlb.c | 205 |
6 files changed, 121 insertions, 164 deletions
diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c index b45f5aaefd74..076ebdce9bd4 100644 --- a/arch/x86/mm/cpu_entry_area.c +++ b/arch/x86/mm/cpu_entry_area.c @@ -2,6 +2,8 @@ #include <linux/spinlock.h> #include <linux/percpu.h> +#include <linux/kallsyms.h> +#include <linux/kcore.h> #include <asm/cpu_entry_area.h> #include <asm/pgtable.h> @@ -13,6 +15,7 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page, entry_stack_storage) #ifdef CONFIG_X86_64 static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); +static DEFINE_PER_CPU(struct kcore_list, kcore_entry_trampoline); #endif struct cpu_entry_area *get_cpu_entry_area(int cpu) @@ -146,10 +149,40 @@ static void __init setup_cpu_entry_area(int cpu) cea_set_pte(&get_cpu_entry_area(cpu)->entry_trampoline, __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX); + /* + * The cpu_entry_area alias addresses are not in the kernel binary + * so they do not show up in /proc/kcore normally. This adds entries + * for them manually. + */ + kclist_add_remap(&per_cpu(kcore_entry_trampoline, cpu), + _entry_trampoline, + &get_cpu_entry_area(cpu)->entry_trampoline, PAGE_SIZE); #endif percpu_setup_debug_store(cpu); } +#ifdef CONFIG_X86_64 +int arch_get_kallsym(unsigned int symnum, unsigned long *value, char *type, + char *name) +{ + unsigned int cpu, ncpu = 0; + + if (symnum >= num_possible_cpus()) + return -EINVAL; + + for_each_possible_cpu(cpu) { + if (ncpu++ >= symnum) + break; + } + + *value = (unsigned long)&get_cpu_entry_area(cpu)->entry_trampoline; + *type = 't'; + strlcpy(name, "__entry_SYSCALL_64_trampoline", KSYM_NAME_LEN); + + return 0; +} +#endif + static __init void setup_cpu_entry_area_ptes(void) { #ifdef CONFIG_X86_32 diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index acfab322fbe0..7a8fc26c1115 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -99,15 +99,22 @@ __ref void *alloc_low_pages(unsigned int num) } if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) { - unsigned long ret; - if (min_pfn_mapped >= max_pfn_mapped) - panic("alloc_low_pages: ran out of memory"); - ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT, + unsigned long ret = 0; + + if (min_pfn_mapped < max_pfn_mapped) { + ret = memblock_find_in_range( + min_pfn_mapped << PAGE_SHIFT, max_pfn_mapped << PAGE_SHIFT, PAGE_SIZE * num , PAGE_SIZE); + } + if (ret) + memblock_reserve(ret, PAGE_SIZE * num); + else if (can_use_brk_pgt) + ret = __pa(extend_brk(PAGE_SIZE * num, PAGE_SIZE)); + if (!ret) panic("alloc_low_pages: can not alloc memory"); - memblock_reserve(ret, PAGE_SIZE * num); + pfn = ret >> PAGE_SHIFT; } else { pfn = pgt_buf_end; @@ -923,7 +930,7 @@ unsigned long max_swapfile_size(void) if (boot_cpu_has_bug(X86_BUG_L1TF)) { /* Limit the swap file size to MAX_PA/2 for L1TF workaround */ - unsigned long l1tf_limit = l1tf_pfn_limit() + 1; + unsigned long long l1tf_limit = l1tf_pfn_limit(); /* * We encode swap offsets also with 3 bits below those for pfn * which makes the usable limit higher. @@ -931,7 +938,7 @@ unsigned long max_swapfile_size(void) #if CONFIG_PGTABLE_LEVELS > 2 l1tf_limit <<= PAGE_SHIFT - SWP_OFFSET_FIRST_BIT; #endif - pages = min_t(unsigned long, l1tf_limit, pages); + pages = min_t(unsigned long long, l1tf_limit, pages); } return pages; } diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index f40ab8185d94..1e95d57760cf 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -257,7 +257,7 @@ bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot) /* If it's real memory always allow */ if (pfn_valid(pfn)) return true; - if (pfn > l1tf_pfn_limit() && !capable(CAP_SYS_ADMIN)) + if (pfn >= l1tf_pfn_limit() && !capable(CAP_SYS_ADMIN)) return false; return true; } diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 1555bd7d3449..3d0c83ef6aab 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -512,6 +512,17 @@ static int free_ram_pages_type(u64 start, u64 end) return 0; } +static u64 sanitize_phys(u64 address) +{ + /* + * When changing the memtype for pages containing poison allow + * for a "decoy" virtual address (bit 63 clear) passed to + * set_memory_X(). __pa() on a "decoy" address results in a + * physical address with bit 63 set. + */ + return address & __PHYSICAL_MASK; +} + /* * req_type typically has one of the: * - _PAGE_CACHE_MODE_WB @@ -533,6 +544,8 @@ int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_type, int is_range_ram; int err = 0; + start = sanitize_phys(start); + end = sanitize_phys(end); BUG_ON(start >= end); /* end is exclusive */ if (!pat_enabled()) { @@ -609,6 +622,9 @@ int free_memtype(u64 start, u64 end) if (!pat_enabled()) return 0; + start = sanitize_phys(start); + end = sanitize_phys(end); + /* Low ISA region is always mapped WB. No need to track */ if (x86_platform.is_untracked_pat_range(start, end)) return 0; diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 3ef095c70ae3..e848a4811785 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -63,7 +63,7 @@ void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) { pgtable_page_dtor(pte); paravirt_release_pte(page_to_pfn(pte)); - tlb_remove_table(tlb, pte); + paravirt_tlb_remove_table(tlb, pte); } #if CONFIG_PGTABLE_LEVELS > 2 @@ -79,21 +79,21 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) tlb->need_flush_all = 1; #endif pgtable_pmd_page_dtor(page); - tlb_remove_table(tlb, page); + paravirt_tlb_remove_table(tlb, page); } #if CONFIG_PGTABLE_LEVELS > 3 void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) { paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); - tlb_remove_table(tlb, virt_to_page(pud)); + paravirt_tlb_remove_table(tlb, virt_to_page(pud)); } #if CONFIG_PGTABLE_LEVELS > 4 void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d) { paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT); - tlb_remove_table(tlb, virt_to_page(p4d)); + paravirt_tlb_remove_table(tlb, virt_to_page(p4d)); } #endif /* CONFIG_PGTABLE_LEVELS > 4 */ #endif /* CONFIG_PGTABLE_LEVELS > 3 */ diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 752dbf4e0e50..9517d1b2a281 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -7,7 +7,6 @@ #include <linux/export.h> #include <linux/cpu.h> #include <linux/debugfs.h> -#include <linux/gfp.h> #include <asm/tlbflush.h> #include <asm/mmu_context.h> @@ -186,11 +185,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, { struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm); u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); - bool was_lazy = this_cpu_read(cpu_tlbstate.is_lazy); unsigned cpu = smp_processor_id(); u64 next_tlb_gen; - bool need_flush; - u16 new_asid; /* * NB: The scheduler will call us with prev == next when switching @@ -244,41 +240,20 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, next->context.ctx_id); /* - * Even in lazy TLB mode, the CPU should stay set in the - * mm_cpumask. The TLB shootdown code can figure out from - * from cpu_tlbstate.is_lazy whether or not to send an IPI. + * We don't currently support having a real mm loaded without + * our cpu set in mm_cpumask(). We have all the bookkeeping + * in place to figure out whether we would need to flush + * if our cpu were cleared in mm_cpumask(), but we don't + * currently use it. */ if (WARN_ON_ONCE(real_prev != &init_mm && !cpumask_test_cpu(cpu, mm_cpumask(next)))) cpumask_set_cpu(cpu, mm_cpumask(next)); - /* - * If the CPU is not in lazy TLB mode, we are just switching - * from one thread in a process to another thread in the same - * process. No TLB flush required. - */ - if (!was_lazy) - return; - - /* - * Read the tlb_gen to check whether a flush is needed. - * If the TLB is up to date, just use it. - * The barrier synchronizes with the tlb_gen increment in - * the TLB shootdown code. - */ - smp_mb(); - next_tlb_gen = atomic64_read(&next->context.tlb_gen); - if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) == - next_tlb_gen) - return; - - /* - * TLB contents went out of date while we were in lazy - * mode. Fall through to the TLB switching code below. - */ - new_asid = prev_asid; - need_flush = true; + return; } else { + u16 new_asid; + bool need_flush; u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id); /* @@ -329,41 +304,41 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, next_tlb_gen = atomic64_read(&next->context.tlb_gen); choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush); - } - if (need_flush) { - this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); - this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); - load_new_mm_cr3(next->pgd, new_asid, true); + if (need_flush) { + this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); + this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); + load_new_mm_cr3(next->pgd, new_asid, true); + + /* + * NB: This gets called via leave_mm() in the idle path + * where RCU functions differently. Tracing normally + * uses RCU, so we need to use the _rcuidle variant. + * + * (There is no good reason for this. The idle code should + * be rearranged to call this before rcu_idle_enter().) + */ + trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); + } else { + /* The new ASID is already up to date. */ + load_new_mm_cr3(next->pgd, new_asid, false); + + /* See above wrt _rcuidle. */ + trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0); + } /* - * NB: This gets called via leave_mm() in the idle path - * where RCU functions differently. Tracing normally - * uses RCU, so we need to use the _rcuidle variant. - * - * (There is no good reason for this. The idle code should - * be rearranged to call this before rcu_idle_enter().) + * Record last user mm's context id, so we can avoid + * flushing branch buffer with IBPB if we switch back + * to the same user. */ - trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); - } else { - /* The new ASID is already up to date. */ - load_new_mm_cr3(next->pgd, new_asid, false); + if (next != &init_mm) + this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id); - /* See above wrt _rcuidle. */ - trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0); + this_cpu_write(cpu_tlbstate.loaded_mm, next); + this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid); } - /* - * Record last user mm's context id, so we can avoid - * flushing branch buffer with IBPB if we switch back - * to the same user. - */ - if (next != &init_mm) - this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id); - - this_cpu_write(cpu_tlbstate.loaded_mm, next); - this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid); - load_mm_cr4(next); switch_ldt(real_prev, next); } @@ -386,7 +361,20 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm) return; - this_cpu_write(cpu_tlbstate.is_lazy, true); + if (tlb_defer_switch_to_init_mm()) { + /* + * There's a significant optimization that may be possible + * here. We have accurate enough TLB flush tracking that we + * don't need to maintain coherence of TLB per se when we're + * lazy. We do, however, need to maintain coherence of + * paging-structure caches. We could, in principle, leave our + * old mm loaded and only switch to init_mm when + * tlb_remove_page() happens. + */ + this_cpu_write(cpu_tlbstate.is_lazy, true); + } else { + switch_mm(NULL, &init_mm, NULL); + } } /* @@ -473,9 +461,6 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, * paging-structure cache to avoid speculatively reading * garbage into our TLB. Since switching to init_mm is barely * slower than a minimal flush, just switch to init_mm. - * - * This should be rare, with native_flush_tlb_others skipping - * IPIs to lazy TLB mode CPUs. */ switch_mm_irqs_off(NULL, &init_mm, NULL); return; @@ -582,9 +567,6 @@ static void flush_tlb_func_remote(void *info) void native_flush_tlb_others(const struct cpumask *cpumask, const struct flush_tlb_info *info) { - cpumask_var_t lazymask; - unsigned int cpu; - count_vm_tlb_event(NR_TLB_REMOTE_FLUSH); if (info->end == TLB_FLUSH_ALL) trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL); @@ -608,6 +590,8 @@ void native_flush_tlb_others(const struct cpumask *cpumask, * that UV should be updated so that smp_call_function_many(), * etc, are optimal on UV. */ + unsigned int cpu; + cpu = smp_processor_id(); cpumask = uv_flush_tlb_others(cpumask, info); if (cpumask) @@ -615,29 +599,8 @@ void native_flush_tlb_others(const struct cpumask *cpumask, (void *)info, 1); return; } - - /* - * A temporary cpumask is used in order to skip sending IPIs - * to CPUs in lazy TLB state, while keeping them in mm_cpumask(mm). - * If the allocation fails, simply IPI every CPU in mm_cpumask. - */ - if (!alloc_cpumask_var(&lazymask, GFP_ATOMIC)) { - smp_call_function_many(cpumask, flush_tlb_func_remote, - (void *)info, 1); - return; - } - - cpumask_copy(lazymask, cpumask); - - for_each_cpu(cpu, lazymask) { - if (per_cpu(cpu_tlbstate.is_lazy, cpu)) - cpumask_clear_cpu(cpu, lazymask); - } - - smp_call_function_many(lazymask, flush_tlb_func_remote, + smp_call_function_many(cpumask, flush_tlb_func_remote, (void *)info, 1); - - free_cpumask_var(lazymask); } /* @@ -690,68 +653,6 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, put_cpu(); } -void tlb_flush_remove_tables_local(void *arg) -{ - struct mm_struct *mm = arg; - - if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm && - this_cpu_read(cpu_tlbstate.is_lazy)) { - /* - * We're in lazy mode. We need to at least flush our - * paging-structure cache to avoid speculatively reading - * garbage into our TLB. Since switching to init_mm is barely - * slower than a minimal flush, just switch to init_mm. - */ - switch_mm_irqs_off(NULL, &init_mm, NULL); - } -} - -static void mm_fill_lazy_tlb_cpu_mask(struct mm_struct *mm, - struct cpumask *lazy_cpus) -{ - int cpu; - - for_each_cpu(cpu, mm_cpumask(mm)) { - if (!per_cpu(cpu_tlbstate.is_lazy, cpu)) - cpumask_set_cpu(cpu, lazy_cpus); - } -} - -void tlb_flush_remove_tables(struct mm_struct *mm) -{ - int cpu = get_cpu(); - cpumask_var_t lazy_cpus; - - if (cpumask_any_but(mm_cpumask(mm), cpu) >= nr_cpu_ids) { - put_cpu(); - return; - } - - if (!zalloc_cpumask_var(&lazy_cpus, GFP_ATOMIC)) { - /* - * If the cpumask allocation fails, do a brute force flush - * on all the CPUs that have this mm loaded. - */ - smp_call_function_many(mm_cpumask(mm), - tlb_flush_remove_tables_local, (void *)mm, 1); - put_cpu(); - return; - } - - /* - * CPUs with !is_lazy either received a TLB flush IPI while the user - * pages in this address range were unmapped, or have context switched - * and reloaded %CR3 since then. - * - * Shootdown IPIs at page table freeing time only need to be sent to - * CPUs that may have out of date TLB contents. - */ - mm_fill_lazy_tlb_cpu_mask(mm, lazy_cpus); - smp_call_function_many(lazy_cpus, - tlb_flush_remove_tables_local, (void *)mm, 1); - free_cpumask_var(lazy_cpus); - put_cpu(); -} static void do_flush_tlb_all(void *info) { |