diff options
Diffstat (limited to 'arch/x86')
250 files changed, 2829 insertions, 2528 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 2fa55851d2a9..2a1f0ce7c59a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -24,7 +24,6 @@ config X86 select ARCH_DISCARD_MEMBLOCK select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE - select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FAST_MULTIPLIER @@ -80,6 +79,7 @@ config X86 select HAVE_ALIGNED_STRUCT_PAGE if SLUB select HAVE_AOUT if X86_32 select HAVE_ARCH_AUDITSYSCALL + select HAVE_ARCH_HARDENED_USERCOPY select HAVE_ARCH_HUGE_VMAP if X86_64 || X86_PAE select HAVE_ARCH_JUMP_LABEL select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP @@ -91,6 +91,7 @@ config X86 select HAVE_ARCH_SOFT_DIRTY if X86_64 select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_TRANSPARENT_HUGEPAGE + select HAVE_ARCH_WITHIN_STACK_FRAMES select HAVE_EBPF_JIT if X86_64 select HAVE_CC_STACKPROTECTOR select HAVE_CMPXCHG_DOUBLE @@ -111,6 +112,7 @@ config X86 select HAVE_FUNCTION_GRAPH_FP_TEST select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACER + select HAVE_GCC_PLUGINS select HAVE_GENERIC_DMA_COHERENT if X86_32 select HAVE_HW_BREAKPOINT select HAVE_IDE @@ -151,6 +153,7 @@ config X86 select OLD_SIGSUSPEND3 if X86_32 || IA32_EMULATION select PERF_EVENTS select RTC_LIB + select RTC_MC146818_LIB select SPARSE_IRQ select SRCU select SYSCTL_EXCEPTION_TRACE diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index be8e688fa0d4..12ea8f8384f4 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -96,7 +96,7 @@ $(obj)/zoffset.h: $(obj)/compressed/vmlinux FORCE $(call if_changed,zoffset) -AFLAGS_header.o += -I$(obj) +AFLAGS_header.o += -I$(objtree)/$(obj) $(obj)/header.o: $(obj)/zoffset.h LDFLAGS_setup.elf := -T diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index ff574dad95cc..94dd4a31f5b3 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -1004,79 +1004,87 @@ static efi_status_t alloc_e820ext(u32 nr_desc, struct setup_data **e820ext, return status; } -static efi_status_t exit_boot(struct boot_params *boot_params, - void *handle, bool is64) -{ - struct efi_info *efi = &boot_params->efi_info; - unsigned long map_sz, key, desc_size; - efi_memory_desc_t *mem_map; +struct exit_boot_struct { + struct boot_params *boot_params; + struct efi_info *efi; struct setup_data *e820ext; - const char *signature; __u32 e820ext_size; - __u32 nr_desc, prev_nr_desc; - efi_status_t status; - __u32 desc_version; - bool called_exit = false; - u8 nr_entries; - int i; - - nr_desc = 0; - e820ext = NULL; - e820ext_size = 0; - -get_map: - status = efi_get_memory_map(sys_table, &mem_map, &map_sz, &desc_size, - &desc_version, &key); - - if (status != EFI_SUCCESS) - return status; - - prev_nr_desc = nr_desc; - nr_desc = map_sz / desc_size; - if (nr_desc > prev_nr_desc && - nr_desc > ARRAY_SIZE(boot_params->e820_map)) { - u32 nr_e820ext = nr_desc - ARRAY_SIZE(boot_params->e820_map); - - status = alloc_e820ext(nr_e820ext, &e820ext, &e820ext_size); - if (status != EFI_SUCCESS) - goto free_mem_map; + bool is64; +}; - efi_call_early(free_pool, mem_map); - goto get_map; /* Allocated memory, get map again */ +static efi_status_t exit_boot_func(efi_system_table_t *sys_table_arg, + struct efi_boot_memmap *map, + void *priv) +{ + static bool first = true; + const char *signature; + __u32 nr_desc; + efi_status_t status; + struct exit_boot_struct *p = priv; + + if (first) { + nr_desc = *map->buff_size / *map->desc_size; + if (nr_desc > ARRAY_SIZE(p->boot_params->e820_map)) { + u32 nr_e820ext = nr_desc - + ARRAY_SIZE(p->boot_params->e820_map); + + status = alloc_e820ext(nr_e820ext, &p->e820ext, + &p->e820ext_size); + if (status != EFI_SUCCESS) + return status; + } + first = false; } - signature = is64 ? EFI64_LOADER_SIGNATURE : EFI32_LOADER_SIGNATURE; - memcpy(&efi->efi_loader_signature, signature, sizeof(__u32)); + signature = p->is64 ? EFI64_LOADER_SIGNATURE : EFI32_LOADER_SIGNATURE; + memcpy(&p->efi->efi_loader_signature, signature, sizeof(__u32)); - efi->efi_systab = (unsigned long)sys_table; - efi->efi_memdesc_size = desc_size; - efi->efi_memdesc_version = desc_version; - efi->efi_memmap = (unsigned long)mem_map; - efi->efi_memmap_size = map_sz; + p->efi->efi_systab = (unsigned long)sys_table_arg; + p->efi->efi_memdesc_size = *map->desc_size; + p->efi->efi_memdesc_version = *map->desc_ver; + p->efi->efi_memmap = (unsigned long)*map->map; + p->efi->efi_memmap_size = *map->map_size; #ifdef CONFIG_X86_64 - efi->efi_systab_hi = (unsigned long)sys_table >> 32; - efi->efi_memmap_hi = (unsigned long)mem_map >> 32; + p->efi->efi_systab_hi = (unsigned long)sys_table_arg >> 32; + p->efi->efi_memmap_hi = (unsigned long)*map->map >> 32; #endif + return EFI_SUCCESS; +} + +static efi_status_t exit_boot(struct boot_params *boot_params, + void *handle, bool is64) +{ + unsigned long map_sz, key, desc_size, buff_size; + efi_memory_desc_t *mem_map; + struct setup_data *e820ext; + __u32 e820ext_size; + efi_status_t status; + __u32 desc_version; + struct efi_boot_memmap map; + struct exit_boot_struct priv; + + map.map = &mem_map; + map.map_size = &map_sz; + map.desc_size = &desc_size; + map.desc_ver = &desc_version; + map.key_ptr = &key; + map.buff_size = &buff_size; + priv.boot_params = boot_params; + priv.efi = &boot_params->efi_info; + priv.e820ext = NULL; + priv.e820ext_size = 0; + priv.is64 = is64; + /* Might as well exit boot services now */ - status = efi_call_early(exit_boot_services, handle, key); - if (status != EFI_SUCCESS) { - /* - * ExitBootServices() will fail if any of the event - * handlers change the memory map. In which case, we - * must be prepared to retry, but only once so that - * we're guaranteed to exit on repeated failures instead - * of spinning forever. - */ - if (called_exit) - goto free_mem_map; - - called_exit = true; - efi_call_early(free_pool, mem_map); - goto get_map; - } + status = efi_exit_boot_services(sys_table, handle, &map, &priv, + exit_boot_func); + if (status != EFI_SUCCESS) + return status; + e820ext = priv.e820ext; + e820ext_size = priv.e820ext_size; /* Historic? */ boot_params->alt_mem_k = 32 * 1024; @@ -1085,10 +1093,6 @@ get_map: return status; return EFI_SUCCESS; - -free_mem_map: - efi_call_early(free_pool, mem_map); - return status; } /* diff --git a/arch/x86/configs/tiny.config b/arch/x86/configs/tiny.config index 4e2ecfa23c15..4b429df40d7a 100644 --- a/arch/x86/configs/tiny.config +++ b/arch/x86/configs/tiny.config @@ -1 +1,3 @@ CONFIG_NOHIGHMEM=y +# CONFIG_HIGHMEM4G is not set +# CONFIG_HIGHMEM64G is not set diff --git a/arch/x86/crypto/sha256-mb/sha256_mb.c b/arch/x86/crypto/sha256-mb/sha256_mb.c index 89fa85e8b10c..6f97fb33ae21 100644 --- a/arch/x86/crypto/sha256-mb/sha256_mb.c +++ b/arch/x86/crypto/sha256-mb/sha256_mb.c @@ -485,10 +485,10 @@ static int sha_complete_job(struct mcryptd_hash_request_ctx *rctx, req = cast_mcryptd_ctx_to_req(req_ctx); if (irqs_disabled()) - rctx->complete(&req->base, ret); + req_ctx->complete(&req->base, ret); else { local_bh_disable(); - rctx->complete(&req->base, ret); + req_ctx->complete(&req->base, ret); local_bh_enable(); } } diff --git a/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S index b691da981cd9..a78a0694ddef 100644 --- a/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S +++ b/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S @@ -265,13 +265,14 @@ ENTRY(sha256_mb_mgr_get_comp_job_avx2) vpinsrd $1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0 vpinsrd $2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0 vpinsrd $3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0 - movl _args_digest+4*32(state, idx, 4), tmp2_w + vmovd _args_digest(state , idx, 4) , %xmm0 vpinsrd $1, _args_digest+5*32(state, idx, 4), %xmm1, %xmm1 vpinsrd $2, _args_digest+6*32(state, idx, 4), %xmm1, %xmm1 vpinsrd $3, _args_digest+7*32(state, idx, 4), %xmm1, %xmm1 - vmovdqu %xmm0, _result_digest(job_rax) - movl tmp2_w, _result_digest+1*16(job_rax) + vmovdqu %xmm0, _result_digest(job_rax) + offset = (_result_digest + 1*16) + vmovdqu %xmm1, offset(job_rax) pop %rbx diff --git a/arch/x86/crypto/sha512-mb/sha512_mb.c b/arch/x86/crypto/sha512-mb/sha512_mb.c index f4cf5b78fd36..d210174a52b0 100644 --- a/arch/x86/crypto/sha512-mb/sha512_mb.c +++ b/arch/x86/crypto/sha512-mb/sha512_mb.c @@ -497,10 +497,10 @@ static int sha_complete_job(struct mcryptd_hash_request_ctx *rctx, req = cast_mcryptd_ctx_to_req(req_ctx); if (irqs_disabled()) - rctx->complete(&req->base, ret); + req_ctx->complete(&req->base, ret); else { local_bh_disable(); - rctx->complete(&req->base, ret); + req_ctx->complete(&req->base, ret); local_bh_enable(); } } diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile index fe91c25092da..77f28ce9c646 100644 --- a/arch/x86/entry/Makefile +++ b/arch/x86/entry/Makefile @@ -5,6 +5,8 @@ OBJECT_FILES_NON_STANDARD_entry_$(BITS).o := y OBJECT_FILES_NON_STANDARD_entry_64_compat.o := y +CFLAGS_syscall_64.o += -Wno-override-init +CFLAGS_syscall_32.o += -Wno-override-init obj-y := entry_$(BITS).o thunk_$(BITS).o syscall_$(BITS).o obj-y += common.o diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 9e1e27d31c6d..1433f6b4607d 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -64,22 +64,16 @@ static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch) } /* - * We can return 0 to resume the syscall or anything else to go to phase - * 2. If we resume the syscall, we need to put something appropriate in - * regs->orig_ax. - * - * NB: We don't have full pt_regs here, but regs->orig_ax and regs->ax - * are fully functional. - * - * For phase 2's benefit, our return value is: - * 0: resume the syscall - * 1: go to phase 2; no seccomp phase 2 needed - * anything else: go to phase 2; pass return value to seccomp + * Returns the syscall nr to run (which should match regs->orig_ax) or -1 + * to skip the syscall. */ -unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch) +static long syscall_trace_enter(struct pt_regs *regs) { + u32 arch = in_ia32_syscall() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64; + struct thread_info *ti = pt_regs_to_thread_info(regs); unsigned long ret = 0; + bool emulated = false; u32 work; if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) @@ -87,11 +81,19 @@ unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch) work = ACCESS_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY; + if (unlikely(work & _TIF_SYSCALL_EMU)) + emulated = true; + + if ((emulated || (work & _TIF_SYSCALL_TRACE)) && + tracehook_report_syscall_entry(regs)) + return -1L; + + if (emulated) + return -1L; + #ifdef CONFIG_SECCOMP /* - * Do seccomp first -- it should minimize exposure of other - * code, and keeping seccomp fast is probably more valuable - * than the rest of this. + * Do seccomp after ptrace, to catch any tracer changes. */ if (work & _TIF_SECCOMP) { struct seccomp_data sd; @@ -118,69 +120,12 @@ unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch) sd.args[5] = regs->bp; } - BUILD_BUG_ON(SECCOMP_PHASE1_OK != 0); - BUILD_BUG_ON(SECCOMP_PHASE1_SKIP != 1); - - ret = seccomp_phase1(&sd); - if (ret == SECCOMP_PHASE1_SKIP) { - regs->orig_ax = -1; - ret = 0; - } else if (ret != SECCOMP_PHASE1_OK) { - return ret; /* Go directly to phase 2 */ - } - - work &= ~_TIF_SECCOMP; - } -#endif - - /* Do our best to finish without phase 2. */ - if (work == 0) - return ret; /* seccomp and/or nohz only (ret == 0 here) */ - -#ifdef CONFIG_AUDITSYSCALL - if (work == _TIF_SYSCALL_AUDIT) { - /* - * If there is no more work to be done except auditing, - * then audit in phase 1. Phase 2 always audits, so, if - * we audit here, then we can't go on to phase 2. - */ - do_audit_syscall_entry(regs, arch); - return 0; - } -#endif - - return 1; /* Something is enabled that we can't handle in phase 1 */ -} - -/* Returns the syscall nr to run (which should match regs->orig_ax). */ -long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch, - unsigned long phase1_result) -{ - struct thread_info *ti = pt_regs_to_thread_info(regs); - long ret = 0; - u32 work = ACCESS_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY; - - if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) - BUG_ON(regs != task_pt_regs(current)); - -#ifdef CONFIG_SECCOMP - /* - * Call seccomp_phase2 before running the other hooks so that - * they can see any changes made by a seccomp tracer. - */ - if (phase1_result > 1 && seccomp_phase2(phase1_result)) { - /* seccomp failures shouldn't expose any additional code. */ - return -1; + ret = __secure_computing(&sd); + if (ret == -1) + return ret; } #endif - if (unlikely(work & _TIF_SYSCALL_EMU)) - ret = -1L; - - if ((ret || test_thread_flag(TIF_SYSCALL_TRACE)) && - tracehook_report_syscall_entry(regs)) - ret = -1L; - if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_enter(regs, regs->orig_ax); @@ -189,17 +134,6 @@ long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch, return ret ?: regs->orig_ax; } -long syscall_trace_enter(struct pt_regs *regs) -{ - u32 arch = in_ia32_syscall() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64; - unsigned long phase1_result = syscall_trace_enter_phase1(regs, arch); - - if (phase1_result == 0) - return regs->orig_ax; - else - return syscall_trace_enter_phase2(regs, arch, phase1_result); -} - #define EXIT_TO_USERMODE_LOOP_FLAGS \ (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ _TIF_NEED_RESCHED | _TIF_USER_RETURN_NOTIFY) @@ -270,8 +204,12 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs) * handling, because syscall restart has a fixup for compat * syscalls. The fixup is exercised by the ptrace_syscall_32 * selftest. + * + * We also need to clear TS_REGS_POKED_I386: the 32-bit tracer + * special case only applies after poking regs and before the + * very next return to user mode. */ - ti->status &= ~TS_COMPAT; + ti->status &= ~(TS_COMPAT|TS_I386_REGS_POKED); #endif user_enter_irqoff(); diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index b846875aeea6..d172c619c449 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -288,11 +288,15 @@ return_from_SYSCALL_64: jne opportunistic_sysret_failed /* - * SYSRET can't restore RF. SYSRET can restore TF, but unlike IRET, - * restoring TF results in a trap from userspace immediately after - * SYSRET. This would cause an infinite loop whenever #DB happens - * with register state that satisfies the opportunistic SYSRET - * conditions. For example, single-stepping this user code: + * SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot + * restore RF properly. If the slowpath sets it for whatever reason, we + * need to restore it correctly. + * + * SYSRET can restore TF, but unlike IRET, restoring TF results in a + * trap from userspace immediately after SYSRET. This would cause an + * infinite loop whenever #DB happens with register state that satisfies + * the opportunistic SYSRET conditions. For example, single-stepping + * this user code: * * movq $stuck_here, %rcx * pushfq @@ -601,9 +605,20 @@ apicinterrupt3 \num trace(\sym) smp_trace(\sym) .endm #endif +/* Make sure APIC interrupt handlers end up in the irqentry section: */ +#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN) +# define PUSH_SECTION_IRQENTRY .pushsection .irqentry.text, "ax" +# define POP_SECTION_IRQENTRY .popsection +#else +# define PUSH_SECTION_IRQENTRY +# define POP_SECTION_IRQENTRY +#endif + .macro apicinterrupt num sym do_sym +PUSH_SECTION_IRQENTRY apicinterrupt3 \num \sym \do_sym trace_apicinterrupt \num \sym +POP_SECTION_IRQENTRY .endm #ifdef CONFIG_SMP diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 4cddd17153fb..f848572169ea 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -294,7 +294,7 @@ # 285 sys_setaltroot 286 i386 add_key sys_add_key 287 i386 request_key sys_request_key -288 i386 keyctl sys_keyctl +288 i386 keyctl sys_keyctl compat_sys_keyctl 289 i386 ioprio_set sys_ioprio_set 290 i386 ioprio_get sys_ioprio_get 291 i386 inotify_init sys_inotify_init diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index 6ba89a1ab0e5..d5409660f5de 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -75,7 +75,7 @@ CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \ -fno-omit-frame-pointer -foptimize-sibling-calls \ -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO -$(vobjs): KBUILD_CFLAGS += $(CFL) +$(vobjs): KBUILD_CFLAGS := $(filter-out $(GCC_PLUGINS_CFLAGS),$(KBUILD_CFLAGS)) $(CFL) # # vDSO code runs in userspace and -pg doesn't help with profiling anyway. @@ -145,6 +145,7 @@ KBUILD_CFLAGS_32 := $(filter-out -m64,$(KBUILD_CFLAGS)) KBUILD_CFLAGS_32 := $(filter-out -mcmodel=kernel,$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 := $(filter-out -fno-pic,$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 := $(filter-out -mfentry,$(KBUILD_CFLAGS_32)) +KBUILD_CFLAGS_32 := $(filter-out $(GCC_PLUGINS_CFLAGS),$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 += -m32 -msoft-float -mregparm=0 -fpic KBUILD_CFLAGS_32 += $(call cc-option, -fno-stack-protector) KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls) diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c index 2f02d23a05ef..94d54d0defa7 100644 --- a/arch/x86/entry/vdso/vclock_gettime.c +++ b/arch/x86/entry/vdso/vclock_gettime.c @@ -96,9 +96,8 @@ static notrace cycle_t vread_pvclock(int *mode) { const struct pvclock_vcpu_time_info *pvti = &get_pvti0()->pvti; cycle_t ret; - u64 tsc, pvti_tsc; - u64 last, delta, pvti_system_time; - u32 version, pvti_tsc_to_system_mul, pvti_tsc_shift; + u64 last; + u32 version; /* * Note: The kernel and hypervisor must guarantee that cpu ID @@ -123,29 +122,15 @@ static notrace cycle_t vread_pvclock(int *mode) */ do { - version = pvti->version; - - smp_rmb(); + version = pvclock_read_begin(pvti); if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT))) { *mode = VCLOCK_NONE; return 0; } - tsc = rdtsc_ordered(); - pvti_tsc_to_system_mul = pvti->tsc_to_system_mul; - pvti_tsc_shift = pvti->tsc_shift; - pvti_system_time = pvti->system_time; - pvti_tsc = pvti->tsc_timestamp; - - /* Make sure that the version double-check is last. */ - smp_rmb(); - } while (unlikely((version & 1) || version != pvti->version)); - - delta = tsc - pvti_tsc; - ret = pvti_system_time + - pvclock_scale_delta(delta, pvti_tsc_to_system_mul, - pvti_tsc_shift); + ret = __pvclock_read_cycles(pvti); + } while (pvclock_read_retry(pvti, version)); /* refer to vread_tsc() comment for rationale */ last = gtod->cycle_last; diff --git a/arch/x86/entry/vdso/vdso2c.h b/arch/x86/entry/vdso/vdso2c.h index 63a03bb91497..4f741192846d 100644 --- a/arch/x86/entry/vdso/vdso2c.h +++ b/arch/x86/entry/vdso/vdso2c.h @@ -22,6 +22,9 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len, ELF(Phdr) *pt = (ELF(Phdr) *)(raw_addr + GET_LE(&hdr->e_phoff)); + if (hdr->e_type != ET_DYN) + fail("input is not a shared object\n"); + /* Walk the segment table. */ for (i = 0; i < GET_LE(&hdr->e_phnum); i++) { if (GET_LE(&pt[i].p_type) == PT_LOAD) { @@ -49,6 +52,9 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len, if (stripped_len < load_size) fail("stripped input is too short\n"); + if (!dyn) + fail("input has no PT_DYNAMIC section -- your toolchain is buggy\n"); + /* Walk the dynamic table */ for (i = 0; dyn + i < dyn_end && GET_LE(&dyn[i].d_tag) != DT_NULL; i++) { diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 3329844e3c43..f840766659a8 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -331,15 +331,9 @@ static void vgetcpu_cpu_init(void *arg) write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S); } -static int -vgetcpu_cpu_notifier(struct notifier_block *n, unsigned long action, void *arg) +static int vgetcpu_online(unsigned int cpu) { - long cpu = (long)arg; - - if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) - smp_call_function_single(cpu, vgetcpu_cpu_init, NULL, 1); - - return NOTIFY_DONE; + return smp_call_function_single(cpu, vgetcpu_cpu_init, NULL, 1); } static int __init init_vdso(void) @@ -350,15 +344,9 @@ static int __init init_vdso(void) init_vdso_image(&vdso_image_x32); #endif - cpu_notifier_register_begin(); - - on_each_cpu(vgetcpu_cpu_init, NULL, 1); /* notifier priority > KVM */ - __hotcpu_notifier(vgetcpu_cpu_notifier, 30); - - cpu_notifier_register_done(); - - return 0; + return cpuhp_setup_state(CPUHP_AP_X86_VDSO_VMA_ONLINE, + "AP_X86_VDSO_VMA_ONLINE", vgetcpu_online, NULL); } subsys_initcall(init_vdso); #endif /* CONFIG_X86_64 */ diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index 75fc719b7f31..636c4b341f36 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -207,7 +207,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) */ regs->orig_ax = syscall_nr; regs->ax = -ENOSYS; - tmp = secure_computing(); + tmp = secure_computing(NULL); if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) { warn_bad_vsyscall(KERN_DEBUG, regs, "seccomp tried to change syscall nr or ip"); diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index bd3e8421b57c..f5f4b3fbbbc2 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -119,8 +119,8 @@ static const u64 amd_perfmon_event_map[PERF_COUNT_HW_MAX] = { [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080, - [PERF_COUNT_HW_CACHE_MISSES] = 0x0081, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x077d, + [PERF_COUNT_HW_CACHE_MISSES] = 0x077e, [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2, [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3, [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00d0, /* "Decoder empty" event */ @@ -370,13 +370,13 @@ static int amd_pmu_cpu_prepare(int cpu) WARN_ON_ONCE(cpuc->amd_nb); if (!x86_pmu.amd_nb_constraints) - return NOTIFY_OK; + return 0; cpuc->amd_nb = amd_alloc_nb(cpu); if (!cpuc->amd_nb) - return NOTIFY_BAD; + return -ENOMEM; - return NOTIFY_OK; + return 0; } static void amd_pmu_cpu_starting(int cpu) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 72dea2f40fc4..b26ee32f73e8 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -7,7 +7,8 @@ */ #include <linux/perf_event.h> -#include <linux/module.h> +#include <linux/init.h> +#include <linux/export.h> #include <linux/pci.h> #include <linux/ptrace.h> #include <linux/syscore_ops.h> @@ -725,13 +726,10 @@ static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name) return ret; } -static __init int perf_event_ibs_init(void) +static __init void perf_event_ibs_init(void) { struct attribute **attr = ibs_op_format_attrs; - if (!ibs_caps) - return -ENODEV; /* ibs not supported by the cpu */ - perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch"); if (ibs_caps & IBS_CAPS_OPCNT) { @@ -742,13 +740,11 @@ static __init int perf_event_ibs_init(void) register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs"); pr_info("perf: AMD IBS detected (0x%08x)\n", ibs_caps); - - return 0; } #else /* defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) */ -static __init int perf_event_ibs_init(void) { return 0; } +static __init void perf_event_ibs_init(void) { } #endif @@ -925,7 +921,7 @@ static inline int get_ibs_lvt_offset(void) return val & IBSCTL_LVT_OFFSET_MASK; } -static void setup_APIC_ibs(void *dummy) +static void setup_APIC_ibs(void) { int offset; @@ -940,7 +936,7 @@ failed: smp_processor_id()); } -static void clear_APIC_ibs(void *dummy) +static void clear_APIC_ibs(void) { int offset; @@ -949,18 +945,24 @@ static void clear_APIC_ibs(void *dummy) setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1); } +static int x86_pmu_amd_ibs_starting_cpu(unsigned int cpu) +{ + setup_APIC_ibs(); + return 0; +} + #ifdef CONFIG_PM static int perf_ibs_suspend(void) { - clear_APIC_ibs(NULL); + clear_APIC_ibs(); return 0; } static void perf_ibs_resume(void) { ibs_eilvt_setup(); - setup_APIC_ibs(NULL); + setup_APIC_ibs(); } static struct syscore_ops perf_ibs_syscore_ops = { @@ -979,27 +981,15 @@ static inline void perf_ibs_pm_init(void) { } #endif -static int -perf_ibs_cpu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) +static int x86_pmu_amd_ibs_dying_cpu(unsigned int cpu) { - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_STARTING: - setup_APIC_ibs(NULL); - break; - case CPU_DYING: - clear_APIC_ibs(NULL); - break; - default: - break; - } - - return NOTIFY_OK; + clear_APIC_ibs(); + return 0; } static __init int amd_ibs_init(void) { u32 caps; - int ret = -EINVAL; caps = __get_ibs_caps(); if (!caps) @@ -1008,22 +998,25 @@ static __init int amd_ibs_init(void) ibs_eilvt_setup(); if (!ibs_eilvt_valid()) - goto out; + return -EINVAL; perf_ibs_pm_init(); - cpu_notifier_register_begin(); + ibs_caps = caps; /* make ibs_caps visible to other cpus: */ smp_mb(); - smp_call_function(setup_APIC_ibs, NULL, 1); - __perf_cpu_notifier(perf_ibs_cpu_notifier); - cpu_notifier_register_done(); + /* + * x86_pmu_amd_ibs_starting_cpu will be called from core on + * all online cpus. + */ + cpuhp_setup_state(CPUHP_AP_PERF_X86_AMD_IBS_STARTING, + "AP_PERF_X86_AMD_IBS_STARTING", + x86_pmu_amd_ibs_starting_cpu, + x86_pmu_amd_ibs_dying_cpu); - ret = perf_event_ibs_init(); -out: - if (ret) - pr_err("Failed to setup IBS, %d\n", ret); - return ret; + perf_event_ibs_init(); + + return 0; } /* Since we need the pci subsystem to init ibs we can't do this earlier: */ diff --git a/arch/x86/events/amd/iommu.c b/arch/x86/events/amd/iommu.c index 6011a573dd64..b28200dea715 100644 --- a/arch/x86/events/amd/iommu.c +++ b/arch/x86/events/amd/iommu.c @@ -12,7 +12,7 @@ */ #include <linux/perf_event.h> -#include <linux/module.h> +#include <linux/init.h> #include <linux/cpumask.h> #include <linux/slab.h> diff --git a/arch/x86/events/amd/power.c b/arch/x86/events/amd/power.c index 55a3529dbf12..9842270ed2f2 100644 --- a/arch/x86/events/amd/power.c +++ b/arch/x86/events/amd/power.c @@ -228,12 +228,12 @@ static struct pmu pmu_class = { .read = pmu_event_read, }; -static void power_cpu_exit(int cpu) +static int power_cpu_exit(unsigned int cpu) { int target; if (!cpumask_test_and_clear_cpu(cpu, &cpu_mask)) - return; + return 0; /* * Find a new CPU on the same compute unit, if was set in cpumask @@ -245,9 +245,10 @@ static void power_cpu_exit(int cpu) cpumask_set_cpu(target, &cpu_mask); perf_pmu_migrate_context(&pmu_class, cpu, target); } + return 0; } -static void power_cpu_init(int cpu) +static int power_cpu_init(unsigned int cpu) { int target; @@ -255,7 +256,7 @@ static void power_cpu_init(int cpu) * 1) If any CPU is set at cpu_mask in the same compute unit, do * nothing. * 2) If no CPU is set at cpu_mask in the same compute unit, - * set current STARTING CPU. + * set current ONLINE CPU. * * Note: if there is a CPU aside of the new one already in the * sibling mask, then it is also in cpu_mask. @@ -263,33 +264,9 @@ static void power_cpu_init(int cpu) target = cpumask_any_but(topology_sibling_cpumask(cpu), cpu); if (target >= nr_cpumask_bits) cpumask_set_cpu(cpu, &cpu_mask); + return 0; } -static int -power_cpu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) -{ - unsigned int cpu = (long)hcpu; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_DOWN_FAILED: - case CPU_STARTING: - power_cpu_init(cpu); - break; - case CPU_DOWN_PREPARE: - power_cpu_exit(cpu); - break; - default: - break; - } - - return NOTIFY_OK; -} - -static struct notifier_block power_cpu_notifier_nb = { - .notifier_call = power_cpu_notifier, - .priority = CPU_PRI_PERF, -}; - static const struct x86_cpu_id cpu_match[] = { { .vendor = X86_VENDOR_AMD, .family = 0x15 }, {}, @@ -297,7 +274,7 @@ static const struct x86_cpu_id cpu_match[] = { static int __init amd_power_pmu_init(void) { - int cpu, target, ret; + int ret; if (!x86_match_cpu(cpu_match)) return 0; @@ -312,38 +289,25 @@ static int __init amd_power_pmu_init(void) return -ENODEV; } - cpu_notifier_register_begin(); - /* Choose one online core of each compute unit. */ - for_each_online_cpu(cpu) { - target = cpumask_first(topology_sibling_cpumask(cpu)); - if (!cpumask_test_cpu(target, &cpu_mask)) - cpumask_set_cpu(target, &cpu_mask); - } + cpuhp_setup_state(CPUHP_AP_PERF_X86_AMD_POWER_ONLINE, + "AP_PERF_X86_AMD_POWER_ONLINE", + power_cpu_init, power_cpu_exit); ret = perf_pmu_register(&pmu_class, "power", -1); if (WARN_ON(ret)) { pr_warn("AMD Power PMU registration failed\n"); - goto out; + return ret; } - __register_cpu_notifier(&power_cpu_notifier_nb); - pr_info("AMD Power PMU detected\n"); - -out: - cpu_notifier_register_done(); - return ret; } module_init(amd_power_pmu_init); static void __exit amd_power_pmu_exit(void) { - cpu_notifier_register_begin(); - __unregister_cpu_notifier(&power_cpu_notifier_nb); - cpu_notifier_register_done(); - + cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_AMD_POWER_ONLINE); perf_pmu_unregister(&pmu_class); } module_exit(amd_power_pmu_exit); diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c index 98ac57381bf9..65577f081d07 100644 --- a/arch/x86/events/amd/uncore.c +++ b/arch/x86/events/amd/uncore.c @@ -29,6 +29,8 @@ #define COUNTER_SHIFT 16 +static HLIST_HEAD(uncore_unused_list); + struct amd_uncore { int id; int refcnt; @@ -39,7 +41,7 @@ struct amd_uncore { cpumask_t *active_mask; struct pmu *pmu; struct perf_event *events[MAX_COUNTERS]; - struct amd_uncore *free_when_cpu_online; + struct hlist_node node; }; static struct amd_uncore * __percpu *amd_uncore_nb; @@ -306,6 +308,7 @@ static int amd_uncore_cpu_up_prepare(unsigned int cpu) uncore_nb->msr_base = MSR_F15H_NB_PERF_CTL; uncore_nb->active_mask = &amd_nb_active_mask; uncore_nb->pmu = &amd_nb_pmu; + uncore_nb->id = -1; *per_cpu_ptr(amd_uncore_nb, cpu) = uncore_nb; } @@ -319,6 +322,7 @@ static int amd_uncore_cpu_up_prepare(unsigned int cpu) uncore_l2->msr_base = MSR_F16H_L2I_PERF_CTL; uncore_l2->active_mask = &amd_l2_active_mask; uncore_l2->pmu = &amd_l2_pmu; + uncore_l2->id = -1; *per_cpu_ptr(amd_uncore_l2, cpu) = uncore_l2; } @@ -348,7 +352,7 @@ amd_uncore_find_online_sibling(struct amd_uncore *this, continue; if (this->id == that->id) { - that->free_when_cpu_online = this; + hlist_add_head(&this->node, &uncore_unused_list); this = that; break; } @@ -358,7 +362,7 @@ amd_uncore_find_online_sibling(struct amd_uncore *this, return this; } -static void amd_uncore_cpu_starting(unsigned int cpu) +static int amd_uncore_cpu_starting(unsigned int cpu) { unsigned int eax, ebx, ecx, edx; struct amd_uncore *uncore; @@ -384,6 +388,19 @@ static void amd_uncore_cpu_starting(unsigned int cpu) uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_l2); *per_cpu_ptr(amd_uncore_l2, cpu) = uncore; } + + return 0; +} + +static void uncore_clean_online(void) +{ + struct amd_uncore *uncore; + struct hlist_node *n; + + hlist_for_each_entry_safe(uncore, n, &uncore_unused_list, node) { + hlist_del(&uncore->node); + kfree(uncore); + } } static void uncore_online(unsigned int cpu, @@ -391,20 +408,21 @@ static void uncore_online(unsigned int cpu, { struct amd_uncore *uncore = *per_cpu_ptr(uncores, cpu); - kfree(uncore->free_when_cpu_online); - uncore->free_when_cpu_online = NULL; + uncore_clean_online(); if (cpu == uncore->cpu) cpumask_set_cpu(cpu, uncore->active_mask); } -static void amd_uncore_cpu_online(unsigned int cpu) +static int amd_uncore_cpu_online(unsigned int cpu) { if (amd_uncore_nb) uncore_online(cpu, amd_uncore_nb); if (amd_uncore_l2) uncore_online(cpu, amd_uncore_l2); + + return 0; } static void uncore_down_prepare(unsigned int cpu, @@ -433,13 +451,15 @@ static void uncore_down_prepare(unsigned int cpu, } } -static void amd_uncore_cpu_down_prepare(unsigned int cpu) +static int amd_uncore_cpu_down_prepare(unsigned int cpu) { if (amd_uncore_nb) uncore_down_prepare(cpu, amd_uncore_nb); if (amd_uncore_l2) uncore_down_prepare(cpu, amd_uncore_l2); + + return 0; } static void uncore_dead(unsigned int cpu, struct amd_uncore * __percpu *uncores) @@ -454,74 +474,19 @@ static void uncore_dead(unsigned int cpu, struct amd_uncore * __percpu *uncores) *per_cpu_ptr(uncores, cpu) = NULL; } -static void amd_uncore_cpu_dead(unsigned int cpu) +static int amd_uncore_cpu_dead(unsigned int cpu) { if (amd_uncore_nb) uncore_dead(cpu, amd_uncore_nb); if (amd_uncore_l2) uncore_dead(cpu, amd_uncore_l2); -} - -static int -amd_uncore_cpu_notifier(struct notifier_block *self, unsigned long action, - void *hcpu) -{ - unsigned int cpu = (long)hcpu; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_UP_PREPARE: - if (amd_uncore_cpu_up_prepare(cpu)) - return notifier_from_errno(-ENOMEM); - break; - - case CPU_STARTING: - amd_uncore_cpu_starting(cpu); - break; - - case CPU_ONLINE: - amd_uncore_cpu_online(cpu); - break; - - case CPU_DOWN_PREPARE: - amd_uncore_cpu_down_prepare(cpu); - break; - - case CPU_UP_CANCELED: - case CPU_DEAD: - amd_uncore_cpu_dead(cpu); - break; - - default: - break; - } - - return NOTIFY_OK; -} - -static struct notifier_block amd_uncore_cpu_notifier_block = { - .notifier_call = amd_uncore_cpu_notifier, - .priority = CPU_PRI_PERF + 1, -}; - -static void __init init_cpu_already_online(void *dummy) -{ - unsigned int cpu = smp_processor_id(); - - amd_uncore_cpu_starting(cpu); - amd_uncore_cpu_online(cpu); -} -static void cleanup_cpu_online(void *dummy) -{ - unsigned int cpu = smp_processor_id(); - - amd_uncore_cpu_dead(cpu); + return 0; } static int __init amd_uncore_init(void) { - unsigned int cpu, cpu2; int ret = -ENODEV; if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) @@ -558,38 +523,29 @@ static int __init amd_uncore_init(void) ret = 0; } - if (ret) - goto fail_nodev; - - cpu_notifier_register_begin(); - - /* init cpus already online before registering for hotplug notifier */ - for_each_online_cpu(cpu) { - ret = amd_uncore_cpu_up_prepare(cpu); - if (ret) - goto fail_online; - smp_call_function_single(cpu, init_cpu_already_online, NULL, 1); - } - - __register_cpu_notifier(&amd_uncore_cpu_notifier_block); - cpu_notifier_register_done(); - + /* + * Install callbacks. Core will call them for each online cpu. + */ + if (cpuhp_setup_state(CPUHP_PERF_X86_AMD_UNCORE_PREP, + "PERF_X86_AMD_UNCORE_PREP", + amd_uncore_cpu_up_prepare, amd_uncore_cpu_dead)) + goto fail_l2; + + if (cpuhp_setup_state(CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING, + "AP_PERF_X86_AMD_UNCORE_STARTING", + amd_uncore_cpu_starting, NULL)) + goto fail_prep; + if (cpuhp_setup_state(CPUHP_AP_PERF_X86_AMD_UNCORE_ONLINE, + "AP_PERF_X86_AMD_UNCORE_ONLINE", + amd_uncore_cpu_online, + amd_uncore_cpu_down_prepare)) + goto fail_start; return 0; - -fail_online: - for_each_online_cpu(cpu2) { - if (cpu2 == cpu) - break; - smp_call_function_single(cpu, cleanup_cpu_online, NULL, 1); - } - cpu_notifier_register_done(); - - /* amd_uncore_nb/l2 should have been freed by cleanup_cpu_online */ - amd_uncore_nb = amd_uncore_l2 = NULL; - - if (boot_cpu_has(X86_FEATURE_PERFCTR_L2)) - perf_pmu_unregister(&amd_l2_pmu); +fail_start: + cpuhp_remove_state(CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING); +fail_prep: + cpuhp_remove_state(CPUHP_PERF_X86_AMD_UNCORE_PREP); fail_l2: if (boot_cpu_has(X86_FEATURE_PERFCTR_NB)) perf_pmu_unregister(&amd_nb_pmu); diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index dfebbde2a4cc..d0efb5cb1b00 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -17,7 +17,8 @@ #include <linux/notifier.h> #include <linux/hardirq.h> #include <linux/kprobes.h> -#include <linux/module.h> +#include <linux/export.h> +#include <linux/init.h> #include <linux/kdebug.h> #include <linux/sched.h> #include <linux/uaccess.h> @@ -262,10 +263,13 @@ static bool check_hw_exists(void) return true; msr_fail: - pr_cont("Broken PMU hardware detected, using software events only.\n"); - printk("%sFailed to access perfctr msr (MSR %x is %Lx)\n", - boot_cpu_has(X86_FEATURE_HYPERVISOR) ? KERN_INFO : KERN_ERR, - reg, val_new); + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { + pr_cont("PMU not available due to virtualization, using software events only.\n"); + } else { + pr_cont("Broken PMU hardware detected, using software events only.\n"); + pr_err("Failed to access perfctr msr (MSR %x is %Lx)\n", + reg, val_new); + } return false; } @@ -1477,49 +1481,49 @@ NOKPROBE_SYMBOL(perf_event_nmi_handler); struct event_constraint emptyconstraint; struct event_constraint unconstrained; -static int -x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) +static int x86_pmu_prepare_cpu(unsigned int cpu) { - unsigned int cpu = (long)hcpu; struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); - int i, ret = NOTIFY_OK; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_UP_PREPARE: - for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) - cpuc->kfree_on_online[i] = NULL; - if (x86_pmu.cpu_prepare) - ret = x86_pmu.cpu_prepare(cpu); - break; - - case CPU_STARTING: - if (x86_pmu.cpu_starting) - x86_pmu.cpu_starting(cpu); - break; + int i; - case CPU_ONLINE: - for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) { - kfree(cpuc->kfree_on_online[i]); - cpuc->kfree_on_online[i] = NULL; - } - break; + for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) + cpuc->kfree_on_online[i] = NULL; + if (x86_pmu.cpu_prepare) + return x86_pmu.cpu_prepare(cpu); + return 0; +} - case CPU_DYING: - if (x86_pmu.cpu_dying) - x86_pmu.cpu_dying(cpu); - break; +static int x86_pmu_dead_cpu(unsigned int cpu) +{ + if (x86_pmu.cpu_dead) + x86_pmu.cpu_dead(cpu); + return 0; +} - case CPU_UP_CANCELED: - case CPU_DEAD: - if (x86_pmu.cpu_dead) - x86_pmu.cpu_dead(cpu); - break; +static int x86_pmu_online_cpu(unsigned int cpu) +{ + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + int i; - default: - break; + for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) { + kfree(cpuc->kfree_on_online[i]); + cpuc->kfree_on_online[i] = NULL; } + return 0; +} - return ret; +static int x86_pmu_starting_cpu(unsigned int cpu) +{ + if (x86_pmu.cpu_starting) + x86_pmu.cpu_starting(cpu); + return 0; +} + +static int x86_pmu_dying_cpu(unsigned int cpu) +{ + if (x86_pmu.cpu_dying) + x86_pmu.cpu_dying(cpu); + return 0; } static void __init pmu_check_apic(void) @@ -1787,10 +1791,39 @@ static int __init init_hw_perf_events(void) pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed); pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl); - perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW); - perf_cpu_notifier(x86_pmu_notifier); + /* + * Install callbacks. Core will call them for each online + * cpu. + */ + err = cpuhp_setup_state(CPUHP_PERF_X86_PREPARE, "PERF_X86_PREPARE", + x86_pmu_prepare_cpu, x86_pmu_dead_cpu); + if (err) + return err; + + err = cpuhp_setup_state(CPUHP_AP_PERF_X86_STARTING, + "AP_PERF_X86_STARTING", x86_pmu_starting_cpu, + x86_pmu_dying_cpu); + if (err) + goto out; + + err = cpuhp_setup_state(CPUHP_AP_PERF_X86_ONLINE, "AP_PERF_X86_ONLINE", + x86_pmu_online_cpu, NULL); + if (err) + goto out1; + + err = perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW); + if (err) + goto out2; return 0; + +out2: + cpuhp_remove_state(CPUHP_AP_PERF_X86_ONLINE); +out1: + cpuhp_remove_state(CPUHP_AP_PERF_X86_STARTING); +out: + cpuhp_remove_state(CPUHP_PERF_X86_PREPARE); + return err; } early_initcall(init_hw_perf_events); diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c index 0a6e393a2e62..982c9e31daca 100644 --- a/arch/x86/events/intel/bts.c +++ b/arch/x86/events/intel/bts.c @@ -31,7 +31,17 @@ struct bts_ctx { struct perf_output_handle handle; struct debug_store ds_back; - int started; + int state; +}; + +/* BTS context states: */ +enum { + /* no ongoing AUX transactions */ + BTS_STATE_STOPPED = 0, + /* AUX transaction is on, BTS tracing is disabled */ + BTS_STATE_INACTIVE, + /* AUX transaction is on, BTS tracing is running */ + BTS_STATE_ACTIVE, }; static DEFINE_PER_CPU(struct bts_ctx, bts_ctx); @@ -204,6 +214,15 @@ static void bts_update(struct bts_ctx *bts) static int bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle); +/* + * Ordering PMU callbacks wrt themselves and the PMI is done by means + * of bts::state, which: + * - is set when bts::handle::event is valid, that is, between + * perf_aux_output_begin() and perf_aux_output_end(); + * - is zero otherwise; + * - is ordered against bts::handle::event with a compiler barrier. + */ + static void __bts_event_start(struct perf_event *event) { struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); @@ -221,10 +240,13 @@ static void __bts_event_start(struct perf_event *event) /* * local barrier to make sure that ds configuration made it - * before we enable BTS + * before we enable BTS and bts::state goes ACTIVE */ wmb(); + /* INACTIVE/STOPPED -> ACTIVE */ + WRITE_ONCE(bts->state, BTS_STATE_ACTIVE); + intel_pmu_enable_bts(config); } @@ -251,9 +273,6 @@ static void bts_event_start(struct perf_event *event, int flags) __bts_event_start(event); - /* PMI handler: this counter is running and likely generating PMIs */ - ACCESS_ONCE(bts->started) = 1; - return; fail_end_stop: @@ -263,30 +282,34 @@ fail_stop: event->hw.state = PERF_HES_STOPPED; } -static void __bts_event_stop(struct perf_event *event) +static void __bts_event_stop(struct perf_event *event, int state) { + struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); + + /* ACTIVE -> INACTIVE(PMI)/STOPPED(->stop()) */ + WRITE_ONCE(bts->state, state); + /* * No extra synchronization is mandated by the documentation to have * BTS data stores globally visible. */ intel_pmu_disable_bts(); - - if (event->hw.state & PERF_HES_STOPPED) - return; - - ACCESS_ONCE(event->hw.state) |= PERF_HES_STOPPED; } static void bts_event_stop(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); - struct bts_buffer *buf = perf_get_aux(&bts->handle); + struct bts_buffer *buf = NULL; + int state = READ_ONCE(bts->state); - /* PMI handler: don't restart this counter */ - ACCESS_ONCE(bts->started) = 0; + if (state == BTS_STATE_ACTIVE) + __bts_event_stop(event, BTS_STATE_STOPPED); - __bts_event_stop(event); + if (state != BTS_STATE_STOPPED) + buf = perf_get_aux(&bts->handle); + + event->hw.state |= PERF_HES_STOPPED; if (flags & PERF_EF_UPDATE) { bts_update(bts); @@ -296,6 +319,7 @@ static void bts_event_stop(struct perf_event *event, int flags) bts->handle.head = local_xchg(&buf->data_size, buf->nr_pages << PAGE_SHIFT); + perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0), !!local_xchg(&buf->lost, 0)); } @@ -310,8 +334,20 @@ static void bts_event_stop(struct perf_event *event, int flags) void intel_bts_enable_local(void) { struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); + int state = READ_ONCE(bts->state); + + /* + * Here we transition from INACTIVE to ACTIVE; + * if we instead are STOPPED from the interrupt handler, + * stay that way. Can't be ACTIVE here though. + */ + if (WARN_ON_ONCE(state == BTS_STATE_ACTIVE)) + return; + + if (state == BTS_STATE_STOPPED) + return; - if (bts->handle.event && bts->started) + if (bts->handle.event) __bts_event_start(bts->handle.event); } @@ -319,8 +355,15 @@ void intel_bts_disable_local(void) { struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); + /* + * Here we transition from ACTIVE to INACTIVE; + * do nothing for STOPPED or INACTIVE. + */ + if (READ_ONCE(bts->state) != BTS_STATE_ACTIVE) + return; + if (bts->handle.event) - __bts_event_stop(bts->handle.event); + __bts_event_stop(bts->handle.event, BTS_STATE_INACTIVE); } static int @@ -335,8 +378,6 @@ bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle) return 0; head = handle->head & ((buf->nr_pages << PAGE_SHIFT) - 1); - if (WARN_ON_ONCE(head != local_read(&buf->head))) - return -EINVAL; phys = &buf->buf[buf->cur_buf]; space = phys->offset + phys->displacement + phys->size - head; @@ -403,22 +444,37 @@ bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle) int intel_bts_interrupt(void) { + struct debug_store *ds = this_cpu_ptr(&cpu_hw_events)->ds; struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); struct perf_event *event = bts->handle.event; struct bts_buffer *buf; s64 old_head; - int err; + int err = -ENOSPC, handled = 0; - if (!event || !bts->started) - return 0; + /* + * The only surefire way of knowing if this NMI is ours is by checking + * the write ptr against the PMI threshold. + */ + if (ds && (ds->bts_index >= ds->bts_interrupt_threshold)) + handled = 1; + + /* + * this is wrapped in intel_bts_enable_local/intel_bts_disable_local, + * so we can only be INACTIVE or STOPPED + */ + if (READ_ONCE(bts->state) == BTS_STATE_STOPPED) + return handled; buf = perf_get_aux(&bts->handle); + if (!buf) + return handled; + /* * Skip snapshot counters: they don't use the interrupt, but * there's no other way of telling, because the pointer will * keep moving */ - if (!buf || buf->snapshot) + if (buf->snapshot) return 0; old_head = local_read(&buf->head); @@ -426,18 +482,27 @@ int intel_bts_interrupt(void) /* no new data */ if (old_head == local_read(&buf->head)) - return 0; + return handled; perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0), !!local_xchg(&buf->lost, 0)); buf = perf_aux_output_begin(&bts->handle, event); - if (!buf) - return 1; + if (buf) + err = bts_buffer_reset(buf, &bts->handle); + + if (err) { + WRITE_ONCE(bts->state, BTS_STATE_STOPPED); - err = bts_buffer_reset(buf, &bts->handle); - if (err) - perf_aux_output_end(&bts->handle, 0, false); + if (buf) { + /* + * BTS_STATE_STOPPED should be visible before + * cleared handle::event + */ + barrier(); + perf_aux_output_end(&bts->handle, 0, false); + } + } return 1; } @@ -519,7 +584,8 @@ static __init int bts_init(void) if (!boot_cpu_has(X86_FEATURE_DTES64) || !x86_pmu.bts) return -ENODEV; - bts_pmu.capabilities = PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_ITRACE; + bts_pmu.capabilities = PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_ITRACE | + PERF_PMU_CAP_EXCLUSIVE; bts_pmu.task_ctx_nr = perf_sw_context; bts_pmu.event_init = bts_event_init; bts_pmu.add = bts_event_add; diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 0974ba11e954..4c9a79b9cd69 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -1730,9 +1730,11 @@ static __initconst const u64 knl_hw_cache_extra_regs * disabled state if called consecutively. * * During consecutive calls, the same disable value will be written to related - * registers, so the PMU state remains unchanged. hw.state in - * intel_bts_disable_local will remain PERF_HES_STOPPED too in consecutive - * calls. + * registers, so the PMU state remains unchanged. + * + * intel_bts events don't coexist with intel PMU's BTS events because of + * x86_add_exclusive(x86_lbr_exclusive_lbr); there's no need to keep them + * disabled around intel PMU's event batching etc, only inside the PMI handler. */ static void __intel_pmu_disable_all(void) { @@ -1742,8 +1744,6 @@ static void __intel_pmu_disable_all(void) if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) intel_pmu_disable_bts(); - else - intel_bts_disable_local(); intel_pmu_pebs_disable_all(); } @@ -1771,8 +1771,7 @@ static void __intel_pmu_enable_all(int added, bool pmi) return; intel_pmu_enable_bts(event->hw.config); - } else - intel_bts_enable_local(); + } } static void intel_pmu_enable_all(int added) @@ -2073,6 +2072,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) */ if (!x86_pmu.late_ack) apic_write(APIC_LVTPC, APIC_DM_NMI); + intel_bts_disable_local(); __intel_pmu_disable_all(); handled = intel_pmu_drain_bts_buffer(); handled += intel_bts_interrupt(); @@ -2172,6 +2172,7 @@ done: /* Only restore PMU state when it's active. See x86_pmu_disable(). */ if (cpuc->enabled) __intel_pmu_enable_all(0, true); + intel_bts_enable_local(); /* * Only unmask the NMI after the overflow counters @@ -3109,7 +3110,7 @@ static int intel_pmu_cpu_prepare(int cpu) cpuc->excl_thread_id = 0; } - return NOTIFY_OK; + return 0; err_constraint_list: kfree(cpuc->constraint_list); @@ -3120,7 +3121,7 @@ err_shared_regs: cpuc->shared_regs = NULL; err: - return NOTIFY_BAD; + return -ENOMEM; } static void intel_pmu_cpu_starting(int cpu) diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c index 7b5fd811ef45..8f82b02934fa 100644 --- a/arch/x86/events/intel/cqm.c +++ b/arch/x86/events/intel/cqm.c @@ -458,6 +458,11 @@ static void __intel_cqm_event_count(void *info); static void init_mbm_sample(u32 rmid, u32 evt_type); static void __intel_mbm_event_count(void *info); +static bool is_cqm_event(int e) +{ + return (e == QOS_L3_OCCUP_EVENT_ID); +} + static bool is_mbm_event(int e) { return (e >= QOS_MBM_TOTAL_EVENT_ID && e <= QOS_MBM_LOCAL_EVENT_ID); @@ -1366,6 +1371,10 @@ static int intel_cqm_event_init(struct perf_event *event) (event->attr.config > QOS_MBM_LOCAL_EVENT_ID)) return -EINVAL; + if ((is_cqm_event(event->attr.config) && !cqm_enabled) || + (is_mbm_event(event->attr.config) && !mbm_enabled)) + return -EINVAL; + /* unsupported modes and filters */ if (event->attr.exclude_user || event->attr.exclude_kernel || @@ -1577,7 +1586,7 @@ static inline void cqm_pick_event_reader(int cpu) cpumask_set_cpu(cpu, &cqm_cpumask); } -static void intel_cqm_cpu_starting(unsigned int cpu) +static int intel_cqm_cpu_starting(unsigned int cpu) { struct intel_pqr_state *state = &per_cpu(pqr_state, cpu); struct cpuinfo_x86 *c = &cpu_data(cpu); @@ -1588,39 +1597,26 @@ static void intel_cqm_cpu_starting(unsigned int cpu) WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid); WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale); + + cqm_pick_event_reader(cpu); + return 0; } -static void intel_cqm_cpu_exit(unsigned int cpu) +static int intel_cqm_cpu_exit(unsigned int cpu) { int target; /* Is @cpu the current cqm reader for this package ? */ if (!cpumask_test_and_clear_cpu(cpu, &cqm_cpumask)) - return; + return 0; /* Find another online reader in this package */ target = cpumask_any_but(topology_core_cpumask(cpu), cpu); if (target < nr_cpu_ids) cpumask_set_cpu(target, &cqm_cpumask); -} -static int intel_cqm_cpu_notifier(struct notifier_block *nb, - unsigned long action, void *hcpu) -{ - unsigned int cpu = (unsigned long)hcpu; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_DOWN_PREPARE: - intel_cqm_cpu_exit(cpu); - break; - case CPU_STARTING: - intel_cqm_cpu_starting(cpu); - cqm_pick_event_reader(cpu); - break; - } - - return NOTIFY_OK; + return 0; } static const struct x86_cpu_id intel_cqm_match[] = { @@ -1682,7 +1678,7 @@ out: static int __init intel_cqm_init(void) { char *str = NULL, scale[20]; - int i, cpu, ret; + int cpu, ret; if (x86_match_cpu(intel_cqm_match)) cqm_enabled = true; @@ -1705,8 +1701,7 @@ static int __init intel_cqm_init(void) * * Also, check that the scales match on all cpus. */ - cpu_notifier_register_begin(); - + get_online_cpus(); for_each_online_cpu(cpu) { struct cpuinfo_x86 *c = &cpu_data(cpu); @@ -1743,11 +1738,6 @@ static int __init intel_cqm_init(void) if (ret) goto out; - for_each_online_cpu(i) { - intel_cqm_cpu_starting(i); - cqm_pick_event_reader(i); - } - if (mbm_enabled) ret = intel_mbm_init(); if (ret && !cqm_enabled) @@ -1772,12 +1762,18 @@ static int __init intel_cqm_init(void) pr_info("Intel MBM enabled\n"); /* - * Register the hot cpu notifier once we are sure cqm + * Setup the hot cpu notifier once we are sure cqm * is enabled to avoid notifier leak. */ - __perf_cpu_notifier(intel_cqm_cpu_notifier); + cpuhp_setup_state(CPUHP_AP_PERF_X86_CQM_STARTING, + "AP_PERF_X86_CQM_STARTING", + intel_cqm_cpu_starting, NULL); + cpuhp_setup_state(CPUHP_AP_PERF_X86_CQM_ONLINE, "AP_PERF_X86_CQM_ONLINE", + NULL, intel_cqm_cpu_exit); + out: - cpu_notifier_register_done(); + put_online_cpus(); + if (ret) { kfree(str); cqm_cleanup(); diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index 4c7638b91fa5..3ca87b5a8677 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -366,7 +366,7 @@ static int cstate_pmu_event_add(struct perf_event *event, int mode) * Check if exiting cpu is the designated reader. If so migrate the * events when there is a valid target available */ -static void cstate_cpu_exit(int cpu) +static int cstate_cpu_exit(unsigned int cpu) { unsigned int target; @@ -391,9 +391,10 @@ static void cstate_cpu_exit(int cpu) perf_pmu_migrate_context(&cstate_pkg_pmu, cpu, target); } } + return 0; } -static void cstate_cpu_init(int cpu) +static int cstate_cpu_init(unsigned int cpu) { unsigned int target; @@ -415,31 +416,10 @@ static void cstate_cpu_init(int cpu) topology_core_cpumask(cpu)); if (has_cstate_pkg && target >= nr_cpu_ids) cpumask_set_cpu(cpu, &cstate_pkg_cpu_mask); -} -static int cstate_cpu_notifier(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - unsigned int cpu = (long)hcpu; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_STARTING: - cstate_cpu_init(cpu); - break; - case CPU_DOWN_PREPARE: - cstate_cpu_exit(cpu); - break; - default: - break; - } - return NOTIFY_OK; + return 0; } -static struct notifier_block cstate_cpu_nb = { - .notifier_call = cstate_cpu_notifier, - .priority = CPU_PRI_PERF + 1, -}; - static struct pmu cstate_core_pmu = { .attr_groups = core_attr_groups, .name = "cstate_core", @@ -600,18 +580,20 @@ static inline void cstate_cleanup(void) static int __init cstate_init(void) { - int cpu, err; + int err; - cpu_notifier_register_begin(); - for_each_online_cpu(cpu) - cstate_cpu_init(cpu); + cpuhp_setup_state(CPUHP_AP_PERF_X86_CSTATE_STARTING, + "AP_PERF_X86_CSTATE_STARTING", cstate_cpu_init, + NULL); + cpuhp_setup_state(CPUHP_AP_PERF_X86_CSTATE_ONLINE, + "AP_PERF_X86_CSTATE_ONLINE", NULL, cstate_cpu_exit); if (has_cstate_core) { err = perf_pmu_register(&cstate_core_pmu, cstate_core_pmu.name, -1); if (err) { has_cstate_core = false; pr_info("Failed to register cstate core pmu\n"); - goto out; + return err; } } @@ -621,12 +603,10 @@ static int __init cstate_init(void) has_cstate_pkg = false; pr_info("Failed to register cstate pkg pmu\n"); cstate_cleanup(); - goto out; + return err; } } - __register_cpu_notifier(&cstate_cpu_nb); -out: - cpu_notifier_register_done(); + return err; } @@ -652,9 +632,8 @@ module_init(cstate_pmu_init); static void __exit cstate_pmu_exit(void) { - cpu_notifier_register_begin(); - __unregister_cpu_notifier(&cstate_cpu_nb); + cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_CSTATE_ONLINE); + cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_CSTATE_STARTING); cstate_cleanup(); - cpu_notifier_register_done(); } module_exit(cstate_pmu_exit); diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 7ce9f3f669e6..9b983a474253 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1274,18 +1274,18 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) struct pebs_record_nhm *p = at; u64 pebs_status; - /* PEBS v3 has accurate status bits */ + pebs_status = p->status & cpuc->pebs_enabled; + pebs_status &= (1ULL << x86_pmu.max_pebs_events) - 1; + + /* PEBS v3 has more accurate status bits */ if (x86_pmu.intel_cap.pebs_format >= 3) { - for_each_set_bit(bit, (unsigned long *)&p->status, - MAX_PEBS_EVENTS) + for_each_set_bit(bit, (unsigned long *)&pebs_status, + x86_pmu.max_pebs_events) counts[bit]++; continue; } - pebs_status = p->status & cpuc->pebs_enabled; - pebs_status &= (1ULL << x86_pmu.max_pebs_events) - 1; - /* * On some CPUs the PEBS status can be zero when PEBS is * racing with clearing of GLOBAL_STATUS. @@ -1333,8 +1333,11 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) continue; event = cpuc->events[bit]; - WARN_ON_ONCE(!event); - WARN_ON_ONCE(!event->attr.precise_ip); + if (WARN_ON_ONCE(!event)) + continue; + + if (WARN_ON_ONCE(!event->attr.precise_ip)) + continue; /* log dropped samples number */ if (error[bit]) diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index 04bb5fb5a8d7..861a7d9cb60f 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -1074,6 +1074,11 @@ static void pt_addr_filters_fini(struct perf_event *event) event->hw.addr_filters = NULL; } +static inline bool valid_kernel_ip(unsigned long ip) +{ + return virt_addr_valid(ip) && kernel_ip(ip); +} + static int pt_event_addr_filters_validate(struct list_head *filters) { struct perf_addr_filter *filter; @@ -1081,11 +1086,16 @@ static int pt_event_addr_filters_validate(struct list_head *filters) list_for_each_entry(filter, filters, entry) { /* PT doesn't support single address triggers */ - if (!filter->range) + if (!filter->range || !filter->size) return -EOPNOTSUPP; - if (!filter->inode && !kernel_ip(filter->offset)) - return -EINVAL; + if (!filter->inode) { + if (!valid_kernel_ip(filter->offset)) + return -EINVAL; + + if (!valid_kernel_ip(filter->offset + filter->size)) + return -EINVAL; + } if (++range > pt_cap_get(PT_CAP_num_address_ranges)) return -EOPNOTSUPP; @@ -1111,7 +1121,7 @@ static void pt_event_addr_filters_sync(struct perf_event *event) } else { /* apply the offset */ msr_a = filter->offset + offs[range]; - msr_b = filter->size + msr_a; + msr_b = filter->size + msr_a - 1; } filters->filter[range].msr_a = msr_a; diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index d0c58b35155f..28865938aadf 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -556,14 +556,14 @@ const struct attribute_group *rapl_attr_groups[] = { NULL, }; -static void rapl_cpu_exit(int cpu) +static int rapl_cpu_offline(unsigned int cpu) { struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu); int target; /* Check if exiting cpu is used for collecting rapl events */ if (!cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask)) - return; + return 0; pmu->cpu = -1; /* Find a new cpu to collect rapl events */ @@ -575,9 +575,10 @@ static void rapl_cpu_exit(int cpu) pmu->cpu = target; perf_pmu_migrate_context(pmu->pmu, cpu, target); } + return 0; } -static void rapl_cpu_init(int cpu) +static int rapl_cpu_online(unsigned int cpu) { struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu); int target; @@ -588,13 +589,14 @@ static void rapl_cpu_init(int cpu) */ target = cpumask_any_and(&rapl_cpu_mask, topology_core_cpumask(cpu)); if (target < nr_cpu_ids) - return; + return 0; cpumask_set_cpu(cpu, &rapl_cpu_mask); pmu->cpu = cpu; + return 0; } -static int rapl_cpu_prepare(int cpu) +static int rapl_cpu_prepare(unsigned int cpu) { struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu); @@ -615,33 +617,6 @@ static int rapl_cpu_prepare(int cpu) return 0; } -static int rapl_cpu_notifier(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - unsigned int cpu = (long)hcpu; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_UP_PREPARE: - rapl_cpu_prepare(cpu); - break; - - case CPU_DOWN_FAILED: - case CPU_ONLINE: - rapl_cpu_init(cpu); - break; - - case CPU_DOWN_PREPARE: - rapl_cpu_exit(cpu); - break; - } - return NOTIFY_OK; -} - -static struct notifier_block rapl_cpu_nb = { - .notifier_call = rapl_cpu_notifier, - .priority = CPU_PRI_PERF + 1, -}; - static int rapl_check_hw_unit(bool apply_quirk) { u64 msr_rapl_power_unit_bits; @@ -692,24 +667,6 @@ static void __init rapl_advertise(void) } } -static int __init rapl_prepare_cpus(void) -{ - unsigned int cpu, pkg; - int ret; - - for_each_online_cpu(cpu) { - pkg = topology_logical_package_id(cpu); - if (rapl_pmus->pmus[pkg]) - continue; - - ret = rapl_cpu_prepare(cpu); - if (ret) - return ret; - rapl_cpu_init(cpu); - } - return 0; -} - static void cleanup_rapl_pmus(void) { int i; @@ -837,35 +794,44 @@ static int __init rapl_pmu_init(void) if (ret) return ret; - cpu_notifier_register_begin(); + /* + * Install callbacks. Core will call them for each online cpu. + */ - ret = rapl_prepare_cpus(); + ret = cpuhp_setup_state(CPUHP_PERF_X86_RAPL_PREP, "PERF_X86_RAPL_PREP", + rapl_cpu_prepare, NULL); if (ret) goto out; + ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_RAPL_ONLINE, + "AP_PERF_X86_RAPL_ONLINE", + rapl_cpu_online, rapl_cpu_offline); + if (ret) + goto out1; + ret = perf_pmu_register(&rapl_pmus->pmu, "power", -1); if (ret) - goto out; + goto out2; - __register_cpu_notifier(&rapl_cpu_nb); - cpu_notifier_register_done(); rapl_advertise(); return 0; +out2: + cpuhp_remove_state(CPUHP_AP_PERF_X86_RAPL_ONLINE); +out1: + cpuhp_remove_state(CPUHP_PERF_X86_RAPL_PREP); out: pr_warn("Initialization failed (%d), disabled\n", ret); cleanup_rapl_pmus(); - cpu_notifier_register_done(); return ret; } module_init(rapl_pmu_init); static void __exit intel_rapl_exit(void) { - cpu_notifier_register_begin(); - __unregister_cpu_notifier(&rapl_cpu_nb); + cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_RAPL_ONLINE); + cpuhp_remove_state_nocalls(CPUHP_PERF_X86_RAPL_PREP); perf_pmu_unregister(&rapl_pmus->pmu); cleanup_rapl_pmus(); - cpu_notifier_register_done(); } module_exit(intel_rapl_exit); diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 59b4974c697f..463dc7a5a6c3 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1,3 +1,5 @@ +#include <linux/module.h> + #include <asm/cpu_device_id.h> #include <asm/intel-family.h> #include "uncore.h" @@ -1052,7 +1054,7 @@ static void uncore_pci_exit(void) } } -static void uncore_cpu_dying(int cpu) +static int uncore_cpu_dying(unsigned int cpu) { struct intel_uncore_type *type, **types = uncore_msr_uncores; struct intel_uncore_pmu *pmu; @@ -1069,16 +1071,19 @@ static void uncore_cpu_dying(int cpu) uncore_box_exit(box); } } + return 0; } -static void uncore_cpu_starting(int cpu, bool init) +static int first_init; + +static int uncore_cpu_starting(unsigned int cpu) { struct intel_uncore_type *type, **types = uncore_msr_uncores; struct intel_uncore_pmu *pmu; struct intel_uncore_box *box; int i, pkg, ncpus = 1; - if (init) { + if (first_init) { /* * On init we get the number of online cpus in the package * and set refcount for all of them. @@ -1099,9 +1104,11 @@ static void uncore_cpu_starting(int cpu, bool init) uncore_box_init(box); } } + + return 0; } -static int uncore_cpu_prepare(int cpu) +static int uncore_cpu_prepare(unsigned int cpu) { struct intel_uncore_type *type, **types = uncore_msr_uncores; struct intel_uncore_pmu *pmu; @@ -1164,13 +1171,13 @@ static void uncore_change_context(struct intel_uncore_type **uncores, uncore_change_type_ctx(*uncores, old_cpu, new_cpu); } -static void uncore_event_exit_cpu(int cpu) +static int uncore_event_cpu_offline(unsigned int cpu) { int target; /* Check if exiting cpu is used for collecting uncore events */ if (!cpumask_test_and_clear_cpu(cpu, &uncore_cpu_mask)) - return; + return 0; /* Find a new cpu to collect uncore events */ target = cpumask_any_but(topology_core_cpumask(cpu), cpu); @@ -1183,9 +1190,10 @@ static void uncore_event_exit_cpu(int cpu) uncore_change_context(uncore_msr_uncores, cpu, target); uncore_change_context(uncore_pci_uncores, cpu, target); + return 0; } -static void uncore_event_init_cpu(int cpu) +static int uncore_event_cpu_online(unsigned int cpu) { int target; @@ -1195,50 +1203,15 @@ static void uncore_event_init_cpu(int cpu) */ target = cpumask_any_and(&uncore_cpu_mask, topology_core_cpumask(cpu)); if (target < nr_cpu_ids) - return; + return 0; cpumask_set_cpu(cpu, &uncore_cpu_mask); uncore_change_context(uncore_msr_uncores, -1, cpu); uncore_change_context(uncore_pci_uncores, -1, cpu); + return 0; } -static int uncore_cpu_notifier(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - unsigned int cpu = (long)hcpu; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_UP_PREPARE: - return notifier_from_errno(uncore_cpu_prepare(cpu)); - - case CPU_STARTING: - uncore_cpu_starting(cpu, false); - case CPU_DOWN_FAILED: - uncore_event_init_cpu(cpu); - break; - - case CPU_UP_CANCELED: - case CPU_DYING: - uncore_cpu_dying(cpu); - break; - - case CPU_DOWN_PREPARE: - uncore_event_exit_cpu(cpu); - break; - } - return NOTIFY_OK; -} - -static struct notifier_block uncore_cpu_nb = { - .notifier_call = uncore_cpu_notifier, - /* - * to migrate uncore events, our notifier should be executed - * before perf core's notifier. - */ - .priority = CPU_PRI_PERF + 1, -}; - static int __init type_pmu_register(struct intel_uncore_type *type) { int i, ret; @@ -1282,41 +1255,6 @@ err: return ret; } -static void __init uncore_cpu_setup(void *dummy) -{ - uncore_cpu_starting(smp_processor_id(), true); -} - -/* Lazy to avoid allocation of a few bytes for the normal case */ -static __initdata DECLARE_BITMAP(packages, MAX_LOCAL_APIC); - -static int __init uncore_cpumask_init(bool msr) -{ - unsigned int cpu; - - for_each_online_cpu(cpu) { - unsigned int pkg = topology_logical_package_id(cpu); - int ret; - - if (test_and_set_bit(pkg, packages)) - continue; - /* - * The first online cpu of each package allocates and takes - * the refcounts for all other online cpus in that package. - * If msrs are not enabled no allocation is required. - */ - if (msr) { - ret = uncore_cpu_prepare(cpu); - if (ret) - return ret; - } - uncore_event_init_cpu(cpu); - smp_call_function_single(cpu, uncore_cpu_setup, NULL, 1); - } - __register_cpu_notifier(&uncore_cpu_nb); - return 0; -} - #define X86_UNCORE_MODEL_MATCH(model, init) \ { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long)&init } @@ -1440,11 +1378,33 @@ static int __init intel_uncore_init(void) if (cret && pret) return -ENODEV; - cpu_notifier_register_begin(); - ret = uncore_cpumask_init(!cret); - if (ret) - goto err; - cpu_notifier_register_done(); + /* + * Install callbacks. Core will call them for each online cpu. + * + * The first online cpu of each package allocates and takes + * the refcounts for all other online cpus in that package. + * If msrs are not enabled no allocation is required and + * uncore_cpu_prepare() is not called for each online cpu. + */ + if (!cret) { + ret = cpuhp_setup_state(CPUHP_PERF_X86_UNCORE_PREP, + "PERF_X86_UNCORE_PREP", + uncore_cpu_prepare, NULL); + if (ret) + goto err; + } else { + cpuhp_setup_state_nocalls(CPUHP_PERF_X86_UNCORE_PREP, + "PERF_X86_UNCORE_PREP", + uncore_cpu_prepare, NULL); + } + first_init = 1; + cpuhp_setup_state(CPUHP_AP_PERF_X86_UNCORE_STARTING, + "AP_PERF_X86_UNCORE_STARTING", + uncore_cpu_starting, uncore_cpu_dying); + first_init = 0; + cpuhp_setup_state(CPUHP_AP_PERF_X86_UNCORE_ONLINE, + "AP_PERF_X86_UNCORE_ONLINE", + uncore_event_cpu_online, uncore_event_cpu_offline); return 0; err: @@ -1452,17 +1412,16 @@ err: on_each_cpu_mask(&uncore_cpu_mask, uncore_exit_boxes, NULL, 1); uncore_types_exit(uncore_msr_uncores); uncore_pci_exit(); - cpu_notifier_register_done(); return ret; } module_init(intel_uncore_init); static void __exit intel_uncore_exit(void) { - cpu_notifier_register_begin(); - __unregister_cpu_notifier(&uncore_cpu_nb); + cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_UNCORE_ONLINE); + cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_UNCORE_STARTING); + cpuhp_remove_state_nocalls(CPUHP_PERF_X86_UNCORE_PREP); uncore_types_exit(uncore_msr_uncores); uncore_pci_exit(); - cpu_notifier_register_done(); } module_exit(intel_uncore_exit); diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index d6063e438158..78b9c23e2d8d 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -1,4 +1,3 @@ -#include <linux/module.h> #include <linux/slab.h> #include <linux/pci.h> #include <asm/apicdef.h> diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index 97a69dbba649..9d35ec0cb8fc 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -100,6 +100,12 @@ static void snb_uncore_msr_init_box(struct intel_uncore_box *box) } } +static void snb_uncore_msr_enable_box(struct intel_uncore_box *box) +{ + wrmsrl(SNB_UNC_PERF_GLOBAL_CTL, + SNB_UNC_GLOBAL_CTL_EN | SNB_UNC_GLOBAL_CTL_CORE_ALL); +} + static void snb_uncore_msr_exit_box(struct intel_uncore_box *box) { if (box->pmu->pmu_idx == 0) @@ -127,6 +133,7 @@ static struct attribute_group snb_uncore_format_group = { static struct intel_uncore_ops snb_uncore_msr_ops = { .init_box = snb_uncore_msr_init_box, + .enable_box = snb_uncore_msr_enable_box, .exit_box = snb_uncore_msr_exit_box, .disable_event = snb_uncore_msr_disable_event, .enable_event = snb_uncore_msr_enable_event, @@ -192,6 +199,12 @@ static void skl_uncore_msr_init_box(struct intel_uncore_box *box) } } +static void skl_uncore_msr_enable_box(struct intel_uncore_box *box) +{ + wrmsrl(SKL_UNC_PERF_GLOBAL_CTL, + SNB_UNC_GLOBAL_CTL_EN | SKL_UNC_GLOBAL_CTL_CORE_ALL); +} + static void skl_uncore_msr_exit_box(struct intel_uncore_box *box) { if (box->pmu->pmu_idx == 0) @@ -200,6 +213,7 @@ static void skl_uncore_msr_exit_box(struct intel_uncore_box *box) static struct intel_uncore_ops skl_uncore_msr_ops = { .init_box = skl_uncore_msr_init_box, + .enable_box = skl_uncore_msr_enable_box, .exit_box = skl_uncore_msr_exit_box, .disable_event = snb_uncore_msr_disable_event, .enable_event = snb_uncore_msr_enable_event, diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 824e54086e07..8aee83bcf71f 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -2626,7 +2626,7 @@ void hswep_uncore_cpu_init(void) static struct intel_uncore_type hswep_uncore_ha = { .name = "ha", - .num_counters = 5, + .num_counters = 4, .num_boxes = 2, .perf_ctr_bits = 48, SNBEP_UNCORE_PCI_COMMON_INIT(), @@ -2645,7 +2645,7 @@ static struct uncore_event_desc hswep_uncore_imc_events[] = { static struct intel_uncore_type hswep_uncore_imc = { .name = "imc", - .num_counters = 5, + .num_counters = 4, .num_boxes = 8, .perf_ctr_bits = 48, .fixed_ctr_bits = 48, @@ -2691,7 +2691,7 @@ static struct intel_uncore_type hswep_uncore_irp = { static struct intel_uncore_type hswep_uncore_qpi = { .name = "qpi", - .num_counters = 5, + .num_counters = 4, .num_boxes = 3, .perf_ctr_bits = 48, .perf_ctr = SNBEP_PCI_PMON_CTR0, @@ -2773,7 +2773,7 @@ static struct event_constraint hswep_uncore_r3qpi_constraints[] = { static struct intel_uncore_type hswep_uncore_r3qpi = { .name = "r3qpi", - .num_counters = 4, + .num_counters = 3, .num_boxes = 3, .perf_ctr_bits = 44, .constraints = hswep_uncore_r3qpi_constraints, @@ -2972,7 +2972,7 @@ static struct intel_uncore_type bdx_uncore_ha = { static struct intel_uncore_type bdx_uncore_imc = { .name = "imc", - .num_counters = 5, + .num_counters = 4, .num_boxes = 8, .perf_ctr_bits = 48, .fixed_ctr_bits = 48, diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c index 50b3a056f96b..4bb3ec69e8ea 100644 --- a/arch/x86/events/msr.c +++ b/arch/x86/events/msr.c @@ -36,11 +36,11 @@ static bool test_intel(int idx) switch (boot_cpu_data.x86_model) { case INTEL_FAM6_NEHALEM: + case INTEL_FAM6_NEHALEM_G: case INTEL_FAM6_NEHALEM_EP: case INTEL_FAM6_NEHALEM_EX: case INTEL_FAM6_WESTMERE: - case INTEL_FAM6_WESTMERE2: case INTEL_FAM6_WESTMERE_EP: case INTEL_FAM6_WESTMERE_EX: diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index f5befd4945f2..124357773ffa 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -135,6 +135,7 @@ extern void init_apic_mappings(void); void register_lapic_address(unsigned long address); extern void setup_boot_APIC_clock(void); extern void setup_secondary_APIC_clock(void); +extern void lapic_update_tsc_freq(void); extern int APIC_init_uniprocessor(void); #ifdef CONFIG_X86_64 @@ -170,6 +171,7 @@ static inline void init_apic_mappings(void) { } static inline void disable_local_APIC(void) { } # define setup_boot_APIC_clock x86_init_noop # define setup_secondary_APIC_clock x86_init_noop +static inline void lapic_update_tsc_freq(void) { } #endif /* !CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_X86_X2APIC diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 483fb547e3c0..1d2b69fc0ceb 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -49,43 +49,59 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; #define test_cpu_cap(c, bit) \ test_bit(bit, (unsigned long *)((c)->x86_capability)) -#define REQUIRED_MASK_BIT_SET(bit) \ - ( (((bit)>>5)==0 && (1UL<<((bit)&31) & REQUIRED_MASK0 )) || \ - (((bit)>>5)==1 && (1UL<<((bit)&31) & REQUIRED_MASK1 )) || \ - (((bit)>>5)==2 && (1UL<<((bit)&31) & REQUIRED_MASK2 )) || \ - (((bit)>>5)==3 && (1UL<<((bit)&31) & REQUIRED_MASK3 )) || \ - (((bit)>>5)==4 && (1UL<<((bit)&31) & REQUIRED_MASK4 )) || \ - (((bit)>>5)==5 && (1UL<<((bit)&31) & REQUIRED_MASK5 )) || \ - (((bit)>>5)==6 && (1UL<<((bit)&31) & REQUIRED_MASK6 )) || \ - (((bit)>>5)==7 && (1UL<<((bit)&31) & REQUIRED_MASK7 )) || \ - (((bit)>>5)==8 && (1UL<<((bit)&31) & REQUIRED_MASK8 )) || \ - (((bit)>>5)==9 && (1UL<<((bit)&31) & REQUIRED_MASK9 )) || \ - (((bit)>>5)==10 && (1UL<<((bit)&31) & REQUIRED_MASK10)) || \ - (((bit)>>5)==11 && (1UL<<((bit)&31) & REQUIRED_MASK11)) || \ - (((bit)>>5)==12 && (1UL<<((bit)&31) & REQUIRED_MASK12)) || \ - (((bit)>>5)==13 && (1UL<<((bit)&31) & REQUIRED_MASK13)) || \ - (((bit)>>5)==14 && (1UL<<((bit)&31) & REQUIRED_MASK14)) || \ - (((bit)>>5)==15 && (1UL<<((bit)&31) & REQUIRED_MASK15)) || \ - (((bit)>>5)==16 && (1UL<<((bit)&31) & REQUIRED_MASK16)) ) - -#define DISABLED_MASK_BIT_SET(bit) \ - ( (((bit)>>5)==0 && (1UL<<((bit)&31) & DISABLED_MASK0 )) || \ - (((bit)>>5)==1 && (1UL<<((bit)&31) & DISABLED_MASK1 )) || \ - (((bit)>>5)==2 && (1UL<<((bit)&31) & DISABLED_MASK2 )) || \ - (((bit)>>5)==3 && (1UL<<((bit)&31) & DISABLED_MASK3 )) || \ - (((bit)>>5)==4 && (1UL<<((bit)&31) & DISABLED_MASK4 )) || \ - (((bit)>>5)==5 && (1UL<<((bit)&31) & DISABLED_MASK5 )) || \ - (((bit)>>5)==6 && (1UL<<((bit)&31) & DISABLED_MASK6 )) || \ - (((bit)>>5)==7 && (1UL<<((bit)&31) & DISABLED_MASK7 )) || \ - (((bit)>>5)==8 && (1UL<<((bit)&31) & DISABLED_MASK8 )) || \ - (((bit)>>5)==9 && (1UL<<((bit)&31) & DISABLED_MASK9 )) || \ - (((bit)>>5)==10 && (1UL<<((bit)&31) & DISABLED_MASK10)) || \ - (((bit)>>5)==11 && (1UL<<((bit)&31) & DISABLED_MASK11)) || \ - (((bit)>>5)==12 && (1UL<<((bit)&31) & DISABLED_MASK12)) || \ - (((bit)>>5)==13 && (1UL<<((bit)&31) & DISABLED_MASK13)) || \ - (((bit)>>5)==14 && (1UL<<((bit)&31) & DISABLED_MASK14)) || \ - (((bit)>>5)==15 && (1UL<<((bit)&31) & DISABLED_MASK15)) || \ - (((bit)>>5)==16 && (1UL<<((bit)&31) & DISABLED_MASK16)) ) +/* + * There are 32 bits/features in each mask word. The high bits + * (selected with (bit>>5) give us the word number and the low 5 + * bits give us the bit/feature number inside the word. + * (1UL<<((bit)&31) gives us a mask for the feature_bit so we can + * see if it is set in the mask word. + */ +#define CHECK_BIT_IN_MASK_WORD(maskname, word, bit) \ + (((bit)>>5)==(word) && (1UL<<((bit)&31) & maskname##word )) + +#define REQUIRED_MASK_BIT_SET(feature_bit) \ + ( CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 0, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 1, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 2, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 3, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 4, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 5, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 6, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 7, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 8, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 9, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 10, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 11, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 12, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 13, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 14, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 15, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 16, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 17, feature_bit) || \ + REQUIRED_MASK_CHECK || \ + BUILD_BUG_ON_ZERO(NCAPINTS != 18)) + +#define DISABLED_MASK_BIT_SET(feature_bit) \ + ( CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 0, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 1, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 2, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 3, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 4, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 5, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 6, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 7, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 8, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 9, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 10, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 11, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 12, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 13, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 14, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 15, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 16, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 17, feature_bit) || \ + DISABLED_MASK_CHECK || \ + BUILD_BUG_ON_ZERO(NCAPINTS != 18)) #define cpu_has(c, bit) \ (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \ diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index c64b1e9c5d1a..92a8308b96f6 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -225,7 +225,6 @@ #define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */ #define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */ #define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */ -#define X86_FEATURE_PCOMMIT ( 9*32+22) /* PCOMMIT instruction */ #define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */ #define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */ #define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */ @@ -310,5 +309,5 @@ #endif #define X86_BUG_NULL_SEG X86_BUG(10) /* Nulling a selector preserves the base */ #define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */ - +#define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index 911e9358ceb1..85599ad4d024 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -56,5 +56,7 @@ #define DISABLED_MASK14 0 #define DISABLED_MASK15 0 #define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE) +#define DISABLED_MASK17 0 +#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 18) #endif /* _ASM_X86_DISABLED_FEATURES_H */ diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index 3a27b93e6261..44461626830e 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -9,7 +9,6 @@ #include <linux/kmemcheck.h> #include <linux/scatterlist.h> #include <linux/dma-debug.h> -#include <linux/dma-attrs.h> #include <asm/io.h> #include <asm/swiotlb.h> #include <linux/dma-contiguous.h> @@ -48,11 +47,11 @@ extern int dma_supported(struct device *hwdev, u64 mask); extern void *dma_generic_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, gfp_t flag, - struct dma_attrs *attrs); + unsigned long attrs); extern void dma_generic_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_addr, - struct dma_attrs *attrs); + unsigned long attrs); #ifdef CONFIG_X86_DMA_REMAP /* Platform code defines bridge-specific code */ extern bool dma_capable(struct device *dev, dma_addr_t addr, size_t size); diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index fea7724141a0..e7f155c3045e 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -344,8 +344,8 @@ extern int compat_arch_setup_additional_pages(struct linux_binprm *bprm, */ static inline int mmap_is_ia32(void) { - return config_enabled(CONFIG_X86_32) || - (config_enabled(CONFIG_COMPAT) && + return IS_ENABLED(CONFIG_X86_32) || + (IS_ENABLED(CONFIG_COMPAT) && test_thread_flag(TIF_ADDR32)); } diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 116b58347501..2737366ea583 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -137,9 +137,9 @@ static inline int copy_fregs_to_user(struct fregs_state __user *fx) static inline int copy_fxregs_to_user(struct fxregs_state __user *fx) { - if (config_enabled(CONFIG_X86_32)) + if (IS_ENABLED(CONFIG_X86_32)) return user_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx)); - else if (config_enabled(CONFIG_AS_FXSAVEQ)) + else if (IS_ENABLED(CONFIG_AS_FXSAVEQ)) return user_insn(fxsaveq %[fx], [fx] "=m" (*fx), "m" (*fx)); /* See comment in copy_fxregs_to_kernel() below. */ @@ -150,10 +150,10 @@ static inline void copy_kernel_to_fxregs(struct fxregs_state *fx) { int err; - if (config_enabled(CONFIG_X86_32)) { + if (IS_ENABLED(CONFIG_X86_32)) { err = check_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); } else { - if (config_enabled(CONFIG_AS_FXSAVEQ)) { + if (IS_ENABLED(CONFIG_AS_FXSAVEQ)) { err = check_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); } else { /* See comment in copy_fxregs_to_kernel() below. */ @@ -166,9 +166,9 @@ static inline void copy_kernel_to_fxregs(struct fxregs_state *fx) static inline int copy_user_to_fxregs(struct fxregs_state __user *fx) { - if (config_enabled(CONFIG_X86_32)) + if (IS_ENABLED(CONFIG_X86_32)) return user_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); - else if (config_enabled(CONFIG_AS_FXSAVEQ)) + else if (IS_ENABLED(CONFIG_AS_FXSAVEQ)) return user_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); /* See comment in copy_fxregs_to_kernel() below. */ @@ -190,9 +190,9 @@ static inline int copy_user_to_fregs(struct fregs_state __user *fx) static inline void copy_fxregs_to_kernel(struct fpu *fpu) { - if (config_enabled(CONFIG_X86_32)) + if (IS_ENABLED(CONFIG_X86_32)) asm volatile( "fxsave %[fx]" : [fx] "=m" (fpu->state.fxsave)); - else if (config_enabled(CONFIG_AS_FXSAVEQ)) + else if (IS_ENABLED(CONFIG_AS_FXSAVEQ)) asm volatile("fxsaveq %[fx]" : [fx] "=m" (fpu->state.fxsave)); else { /* Using "rex64; fxsave %0" is broken because, if the memory diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 7178043b0e1d..59405a248fc2 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -22,10 +22,6 @@ typedef struct { #ifdef CONFIG_SMP unsigned int irq_resched_count; unsigned int irq_call_count; - /* - * irq_tlb_count is double-counted in irq_call_count, so it must be - * subtracted from irq_call_count when displaying irq_call_count - */ unsigned int irq_tlb_count; #endif #ifdef CONFIG_X86_THERMAL_VECTOR diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h index 223042086f4e..737da62bfeb0 100644 --- a/arch/x86/include/asm/init.h +++ b/arch/x86/include/asm/init.h @@ -5,10 +5,10 @@ struct x86_mapping_info { void *(*alloc_pgt_page)(void *); /* allocate buf for page table */ void *context; /* context for alloc_pgt_page */ unsigned long pmd_flag; /* page flag for PMD entry */ - bool kernel_mapping; /* kernel mapping or ident mapping */ + unsigned long offset; /* ident mapping offset */ }; int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, - unsigned long addr, unsigned long end); + unsigned long pstart, unsigned long pend); #endif /* _ASM_X86_INIT_H */ diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index 6999f7d01a0d..627719475457 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -8,7 +8,7 @@ * "Extreme" ones, like Broadwell-E. * * Things ending in "2" are usually because we have no better - * name for them. There's no processor called "WESTMERE2". + * name for them. There's no processor called "SILVERMONT2". */ #define INTEL_FAM6_CORE_YONAH 0x0E @@ -18,10 +18,10 @@ #define INTEL_FAM6_CORE2_DUNNINGTON 0x1D #define INTEL_FAM6_NEHALEM 0x1E +#define INTEL_FAM6_NEHALEM_G 0x1F /* Auburndale / Havendale */ #define INTEL_FAM6_NEHALEM_EP 0x1A #define INTEL_FAM6_NEHALEM_EX 0x2E #define INTEL_FAM6_WESTMERE 0x25 -#define INTEL_FAM6_WESTMERE2 0x1F #define INTEL_FAM6_WESTMERE_EP 0x2C #define INTEL_FAM6_WESTMERE_EX 0x2F diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 69e62862b622..33ae3a4d0159 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -35,8 +35,9 @@ #include <asm/asm.h> #include <asm/kvm_page_track.h> -#define KVM_MAX_VCPUS 255 -#define KVM_SOFT_MAX_VCPUS 160 +#define KVM_MAX_VCPUS 288 +#define KVM_SOFT_MAX_VCPUS 240 +#define KVM_MAX_VCPU_ID 1023 #define KVM_USER_MEM_SLOTS 509 /* memory slots that are not exposed to userspace */ #define KVM_PRIVATE_MEM_SLOTS 3 @@ -599,6 +600,7 @@ struct kvm_vcpu_arch { u64 mcg_cap; u64 mcg_status; u64 mcg_ctl; + u64 mcg_ext_ctl; u64 *mce_banks; /* Cache MMIO info */ @@ -682,9 +684,12 @@ struct kvm_arch_memory_slot { struct kvm_apic_map { struct rcu_head rcu; u8 mode; - struct kvm_lapic *phys_map[256]; - /* first index is cluster id second is cpu id in a cluster */ - struct kvm_lapic *logical_map[16][16]; + u32 max_apic_id; + union { + struct kvm_lapic *xapic_flat_map[8]; + struct kvm_lapic *xapic_cluster_map[16][4]; + }; + struct kvm_lapic *phys_map[]; }; /* Hyper-V emulation context */ @@ -779,6 +784,9 @@ struct kvm_arch { u32 ldr_mode; struct page *avic_logical_id_table_page; struct page *avic_physical_id_table_page; + + bool x2apic_format; + bool x2apic_broadcast_quirk_disabled; }; struct kvm_vm_stat { @@ -1006,6 +1014,11 @@ struct kvm_x86_ops { int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set); void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu); + + int (*set_hv_timer)(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc); + void (*cancel_hv_timer)(struct kvm_vcpu *vcpu); + + void (*setup_mce)(struct kvm_vcpu *vcpu); }; struct kvm_arch_async_pf { @@ -1026,7 +1039,7 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu); void kvm_mmu_init_vm(struct kvm *kvm); void kvm_mmu_uninit_vm(struct kvm *kvm); void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, - u64 dirty_mask, u64 nx_mask, u64 x_mask); + u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask); void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, @@ -1077,6 +1090,10 @@ extern u32 kvm_max_guest_tsc_khz; extern u8 kvm_tsc_scaling_ratio_frac_bits; /* maximum allowed value of TSC scaling ratio */ extern u64 kvm_max_tsc_scaling_ratio; +/* 1ull << kvm_tsc_scaling_ratio_frac_bits */ +extern u64 kvm_default_tsc_scaling_ratio; + +extern u64 kvm_mce_cap_supported; enum emulation_result { EMULATE_DONE, /* no further processing */ @@ -1352,7 +1369,7 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu); bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq, struct kvm_vcpu **dest_vcpu); -void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, +void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, struct kvm_lapic_irq *irq); static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) diff --git a/arch/x86/include/asm/livepatch.h b/arch/x86/include/asm/livepatch.h index a7f9181f63f3..ed80003ce3e2 100644 --- a/arch/x86/include/asm/livepatch.h +++ b/arch/x86/include/asm/livepatch.h @@ -22,7 +22,6 @@ #define _ASM_X86_LIVEPATCH_H #include <asm/setup.h> -#include <linux/module.h> #include <linux/ftrace.h> static inline int klp_check_compiler_support(void) diff --git a/arch/x86/include/asm/mc146818rtc.h b/arch/x86/include/asm/mc146818rtc.h index 0f555cc31984..24acd9ba7837 100644 --- a/arch/x86/include/asm/mc146818rtc.h +++ b/arch/x86/include/asm/mc146818rtc.h @@ -6,7 +6,6 @@ #include <asm/io.h> #include <asm/processor.h> -#include <linux/mc146818rtc.h> #ifndef RTC_PORT #define RTC_PORT(x) (0x70 + (x)) diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index 9d3a96c4da78..da0d81fa0b54 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -133,40 +133,14 @@ static inline unsigned int x86_cpuid_family(void) #ifdef CONFIG_MICROCODE extern void __init load_ucode_bsp(void); extern void load_ucode_ap(void); -extern int __init save_microcode_in_initrd(void); void reload_early_microcode(void); extern bool get_builtin_firmware(struct cpio_data *cd, const char *name); #else static inline void __init load_ucode_bsp(void) { } static inline void load_ucode_ap(void) { } -static inline int __init save_microcode_in_initrd(void) { return 0; } static inline void reload_early_microcode(void) { } static inline bool get_builtin_firmware(struct cpio_data *cd, const char *name) { return false; } #endif -static inline unsigned long get_initrd_start(void) -{ -#ifdef CONFIG_BLK_DEV_INITRD - return initrd_start; -#else - return 0; -#endif -} - -static inline unsigned long get_initrd_start_addr(void) -{ -#ifdef CONFIG_BLK_DEV_INITRD -#ifdef CONFIG_X86_32 - unsigned long *initrd_start_p = (unsigned long *)__pa_nodebug(&initrd_start); - - return (unsigned long)__pa_nodebug(*initrd_start_p); -#else - return get_initrd_start(); -#endif -#else /* CONFIG_BLK_DEV_INITRD */ - return 0; -#endif -} - #endif /* _ASM_X86_MICROCODE_H */ diff --git a/arch/x86/include/asm/microcode_amd.h b/arch/x86/include/asm/microcode_amd.h index adfc847a395e..15eb75484cc0 100644 --- a/arch/x86/include/asm/microcode_amd.h +++ b/arch/x86/include/asm/microcode_amd.h @@ -62,7 +62,6 @@ extern int apply_microcode_amd(int cpu); extern enum ucode_state load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size); #define PATCH_MAX_SIZE PAGE_SIZE -extern u8 amd_ucode_patch[PATCH_MAX_SIZE]; #ifdef CONFIG_MICROCODE_AMD extern void __init load_ucode_amd_bsp(unsigned int family); diff --git a/arch/x86/include/asm/microcode_intel.h b/arch/x86/include/asm/microcode_intel.h index 603417f8dd6c..5e69154c9f07 100644 --- a/arch/x86/include/asm/microcode_intel.h +++ b/arch/x86/include/asm/microcode_intel.h @@ -70,9 +70,4 @@ static inline int __init save_microcode_in_initrd_intel(void) { return -EINVAL; static inline void reload_ucode_intel(void) {} #endif -#ifdef CONFIG_HOTPLUG_CPU -extern int save_mc_for_early(u8 *mc); -#else -static inline int save_mc_for_early(u8 *mc) { return 0; } -#endif #endif /* _ASM_X86_MICROCODE_INTEL_H */ diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 396348196aa7..d8abfcf524d1 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -155,7 +155,7 @@ static inline void arch_exit_mmap(struct mm_struct *mm) #ifdef CONFIG_X86_64 static inline bool is_64bit_mm(struct mm_struct *mm) { - return !config_enabled(CONFIG_IA32_EMULATION) || + return !IS_ENABLED(CONFIG_IA32_EMULATION) || !(mm->context.ia32_compat == TIF_IA32); } #else diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index 0deeb2d26df7..f37f2d8a2989 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -97,7 +97,7 @@ static inline void __sti_mwait(unsigned long eax, unsigned long ecx) */ static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) { - if (!current_set_polling_and_test()) { + if (static_cpu_has_bug(X86_BUG_MONITOR) || !current_set_polling_and_test()) { if (static_cpu_has_bug(X86_BUG_CLFLUSH_MONITOR)) { mb(); clflush((void *)¤t_thread_info()->flags); diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h index 3a52ee0e726d..3bae4969ac65 100644 --- a/arch/x86/include/asm/page_32_types.h +++ b/arch/x86/include/asm/page_32_types.h @@ -13,7 +13,8 @@ * If you want more physical memory than this then see the CONFIG_HIGHMEM4G * and CONFIG_HIGHMEM64G options in the kernel configuration. */ -#define __PAGE_OFFSET _AC(CONFIG_PAGE_OFFSET, UL) +#define __PAGE_OFFSET_BASE _AC(CONFIG_PAGE_OFFSET, UL) +#define __PAGE_OFFSET __PAGE_OFFSET_BASE #define __START_KERNEL_map __PAGE_OFFSET diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 7e8ec7ae10fa..1cc82ece9ac1 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -145,7 +145,7 @@ static inline int pgd_large(pgd_t pgd) { return 0; } * * | ... | 11| 10| 9|8|7|6|5| 4| 3|2|1|0| <- bit number * | ... |SW3|SW2|SW1|G|L|D|A|CD|WT|U|W|P| <- bit names - * | OFFSET (14->63) | TYPE (10-13) |0|X|X|X| X| X|X|X|0| <- swp entry + * | OFFSET (14->63) | TYPE (9-13) |0|X|X|X| X| X|X|X|0| <- swp entry * * G (8) is aliased and used as a PROT_NONE indicator for * !present ptes. We need to start storing swap entries above @@ -156,7 +156,7 @@ static inline int pgd_large(pgd_t pgd) { return 0; } #define SWP_TYPE_FIRST_BIT (_PAGE_BIT_PROTNONE + 1) #define SWP_TYPE_BITS 5 /* Place the offset above the type: */ -#define SWP_OFFSET_FIRST_BIT (SWP_TYPE_FIRST_BIT + SWP_TYPE_BITS + 1) +#define SWP_OFFSET_FIRST_BIT (SWP_TYPE_FIRST_BIT + SWP_TYPE_BITS) #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) diff --git a/arch/x86/include/asm/pmem.h b/arch/x86/include/asm/pmem.h index fbc5e92e1ecc..643eba42d620 100644 --- a/arch/x86/include/asm/pmem.h +++ b/arch/x86/include/asm/pmem.h @@ -26,13 +26,11 @@ * @n: length of the copy in bytes * * Copy data to persistent memory media via non-temporal stores so that - * a subsequent arch_wmb_pmem() can flush cpu and memory controller - * write buffers to guarantee durability. + * a subsequent pmem driver flush operation will drain posted write queues. */ -static inline void arch_memcpy_to_pmem(void __pmem *dst, const void *src, - size_t n) +static inline void arch_memcpy_to_pmem(void *dst, const void *src, size_t n) { - int unwritten; + int rem; /* * We are copying between two kernel buffers, if @@ -40,59 +38,36 @@ static inline void arch_memcpy_to_pmem(void __pmem *dst, const void *src, * fault) we would have already reported a general protection fault * before the WARN+BUG. */ - unwritten = __copy_from_user_inatomic_nocache((void __force *) dst, - (void __user *) src, n); - if (WARN(unwritten, "%s: fault copying %p <- %p unwritten: %d\n", - __func__, dst, src, unwritten)) + rem = __copy_from_user_inatomic_nocache(dst, (void __user *) src, n); + if (WARN(rem, "%s: fault copying %p <- %p unwritten: %d\n", + __func__, dst, src, rem)) BUG(); } -static inline int arch_memcpy_from_pmem(void *dst, const void __pmem *src, - size_t n) +static inline int arch_memcpy_from_pmem(void *dst, const void *src, size_t n) { if (static_cpu_has(X86_FEATURE_MCE_RECOVERY)) - return memcpy_mcsafe(dst, (void __force *) src, n); - memcpy(dst, (void __force *) src, n); + return memcpy_mcsafe(dst, src, n); + memcpy(dst, src, n); return 0; } /** - * arch_wmb_pmem - synchronize writes to persistent memory - * - * After a series of arch_memcpy_to_pmem() operations this drains data - * from cpu write buffers and any platform (memory controller) buffers - * to ensure that written data is durable on persistent memory media. - */ -static inline void arch_wmb_pmem(void) -{ - /* - * wmb() to 'sfence' all previous writes such that they are - * architecturally visible to 'pcommit'. Note, that we've - * already arranged for pmem writes to avoid the cache via - * arch_memcpy_to_pmem(). - */ - wmb(); - pcommit_sfence(); -} - -/** * arch_wb_cache_pmem - write back a cache range with CLWB * @vaddr: virtual start address * @size: number of bytes to write back * * Write back a cache range using the CLWB (cache line write back) - * instruction. This function requires explicit ordering with an - * arch_wmb_pmem() call. + * instruction. */ -static inline void arch_wb_cache_pmem(void __pmem *addr, size_t size) +static inline void arch_wb_cache_pmem(void *addr, size_t size) { u16 x86_clflush_size = boot_cpu_data.x86_clflush_size; unsigned long clflush_mask = x86_clflush_size - 1; - void *vaddr = (void __force *)addr; - void *vend = vaddr + size; + void *vend = addr + size; void *p; - for (p = (void *)((unsigned long)vaddr & ~clflush_mask); + for (p = (void *)((unsigned long)addr & ~clflush_mask); p < vend; p += x86_clflush_size) clwb(p); } @@ -113,16 +88,14 @@ static inline bool __iter_needs_pmem_wb(struct iov_iter *i) * @i: iterator with source data * * Copy data from the iterator 'i' to the PMEM buffer starting at 'addr'. - * This function requires explicit ordering with an arch_wmb_pmem() call. */ -static inline size_t arch_copy_from_iter_pmem(void __pmem *addr, size_t bytes, +static inline size_t arch_copy_from_iter_pmem(void *addr, size_t bytes, struct iov_iter *i) { - void *vaddr = (void __force *)addr; size_t len; /* TODO: skip the write-back by always using non-temporal stores */ - len = copy_from_iter_nocache(vaddr, bytes, i); + len = copy_from_iter_nocache(addr, bytes, i); if (__iter_needs_pmem_wb(i)) arch_wb_cache_pmem(addr, bytes); @@ -136,28 +109,16 @@ static inline size_t arch_copy_from_iter_pmem(void __pmem *addr, size_t bytes, * @size: number of bytes to zero * * Write zeros into the memory range starting at 'addr' for 'size' bytes. - * This function requires explicit ordering with an arch_wmb_pmem() call. */ -static inline void arch_clear_pmem(void __pmem *addr, size_t size) +static inline void arch_clear_pmem(void *addr, size_t size) { - void *vaddr = (void __force *)addr; - - memset(vaddr, 0, size); + memset(addr, 0, size); arch_wb_cache_pmem(addr, size); } -static inline void arch_invalidate_pmem(void __pmem *addr, size_t size) +static inline void arch_invalidate_pmem(void *addr, size_t size) { - clflush_cache_range((void __force *) addr, size); -} - -static inline bool __arch_has_wmb_pmem(void) -{ - /* - * We require that wmb() be an 'sfence', that is only guaranteed on - * 64-bit builds - */ - return static_cpu_has(X86_FEATURE_PCOMMIT); + clflush_cache_range(addr, size); } #endif /* CONFIG_ARCH_HAS_PMEM_API */ #endif /* __ASM_X86_PMEM_H__ */ diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 6271281f947d..2b5d686ea9f3 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -83,12 +83,6 @@ extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code, int si_code); -extern unsigned long syscall_trace_enter_phase1(struct pt_regs *, u32 arch); -extern long syscall_trace_enter_phase2(struct pt_regs *, u32 arch, - unsigned long phase1_result); - -extern long syscall_trace_enter(struct pt_regs *); - static inline unsigned long regs_return_value(struct pt_regs *regs) { return regs->ax; diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index 7c1c89598688..d019f0cc80ec 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -25,6 +25,24 @@ void pvclock_resume(void); void pvclock_touch_watchdogs(void); +static __always_inline +unsigned pvclock_read_begin(const struct pvclock_vcpu_time_info *src) +{ + unsigned version = src->version & ~1; + /* Make sure that the version is read before the data. */ + virt_rmb(); + return version; +} + +static __always_inline +bool pvclock_read_retry(const struct pvclock_vcpu_time_info *src, + unsigned version) +{ + /* Make sure that the version is re-read after the data. */ + virt_rmb(); + return unlikely(version != src->version); +} + /* * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, * yielding a 64-bit result. @@ -69,23 +87,12 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift) } static __always_inline -unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src, - cycle_t *cycles, u8 *flags) +cycle_t __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src) { - unsigned version; - cycle_t offset; - u64 delta; - - version = src->version; - /* Make the latest version visible */ - smp_rmb(); - - delta = rdtsc_ordered() - src->tsc_timestamp; - offset = pvclock_scale_delta(delta, src->tsc_to_system_mul, - src->tsc_shift); - *cycles = src->system_time + offset; - *flags = src->flags; - return version; + u64 delta = rdtsc_ordered() - src->tsc_timestamp; + cycle_t offset = pvclock_scale_delta(delta, src->tsc_to_system_mul, + src->tsc_shift); + return src->system_time + offset; } struct pvclock_vsyscall_time_info { diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h index 9c6b890d5e7a..b2988c0ed829 100644 --- a/arch/x86/include/asm/realmode.h +++ b/arch/x86/include/asm/realmode.h @@ -58,7 +58,15 @@ extern unsigned char boot_gdt[]; extern unsigned char secondary_startup_64[]; #endif +static inline size_t real_mode_size_needed(void) +{ + if (real_mode_header) + return 0; /* already allocated. */ + + return ALIGN(real_mode_blob_end - real_mode_blob, PAGE_SIZE); +} + +void set_real_mode_mem(phys_addr_t mem, size_t size); void reserve_real_mode(void); -void setup_real_mode(void); #endif /* _ARCH_X86_REALMODE_H */ diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h index 4916144e3c42..fac9a5c0abe9 100644 --- a/arch/x86/include/asm/required-features.h +++ b/arch/x86/include/asm/required-features.h @@ -99,5 +99,7 @@ #define REQUIRED_MASK14 0 #define REQUIRED_MASK15 0 #define REQUIRED_MASK16 0 +#define REQUIRED_MASK17 0 +#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 18) #endif /* _ASM_X86_REQUIRED_FEATURES_H */ diff --git a/arch/x86/include/asm/rtc.h b/arch/x86/include/asm/rtc.h deleted file mode 100644 index f71c3b0ed360..000000000000 --- a/arch/x86/include/asm/rtc.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/rtc.h> diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index d96d04377765..587d7914ea4b 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -253,52 +253,6 @@ static inline void clwb(volatile void *__p) : [pax] "a" (p)); } -/** - * pcommit_sfence() - persistent commit and fence - * - * The PCOMMIT instruction ensures that data that has been flushed from the - * processor's cache hierarchy with CLWB, CLFLUSHOPT or CLFLUSH is accepted to - * memory and is durable on the DIMM. The primary use case for this is - * persistent memory. - * - * This function shows how to properly use CLWB/CLFLUSHOPT/CLFLUSH and PCOMMIT - * with appropriate fencing. - * - * Example: - * void flush_and_commit_buffer(void *vaddr, unsigned int size) - * { - * unsigned long clflush_mask = boot_cpu_data.x86_clflush_size - 1; - * void *vend = vaddr + size; - * void *p; - * - * for (p = (void *)((unsigned long)vaddr & ~clflush_mask); - * p < vend; p += boot_cpu_data.x86_clflush_size) - * clwb(p); - * - * // SFENCE to order CLWB/CLFLUSHOPT/CLFLUSH cache flushes - * // MFENCE via mb() also works - * wmb(); - * - * // PCOMMIT and the required SFENCE for ordering - * pcommit_sfence(); - * } - * - * After this function completes the data pointed to by 'vaddr' has been - * accepted to memory and will be durable if the 'vaddr' points to persistent - * memory. - * - * PCOMMIT must always be ordered by an MFENCE or SFENCE, so to help simplify - * things we include both the PCOMMIT and the required SFENCE in the - * alternatives generated by pcommit_sfence(). - */ -static inline void pcommit_sfence(void) -{ - alternative(ASM_NOP7, - ".byte 0x66, 0x0f, 0xae, 0xf8\n\t" /* pcommit */ - "sfence", - X86_FEATURE_PCOMMIT); -} - #define nop() asm volatile ("nop") diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index d0fe23ec7e98..14824fc78f7e 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -193,7 +193,6 @@ struct __attribute__ ((__packed__)) vmcb { struct vmcb_save_area save; }; -#define SVM_CPUID_FEATURE_SHIFT 2 #define SVM_CPUID_FUNC 0x8000000a #define SVM_VM_CR_SVM_DISABLE 4 diff --git a/arch/x86/include/asm/swiotlb.h b/arch/x86/include/asm/swiotlb.h index ab05d73e2bb7..d2f69b9ff732 100644 --- a/arch/x86/include/asm/swiotlb.h +++ b/arch/x86/include/asm/swiotlb.h @@ -31,9 +31,9 @@ static inline void dma_mark_clean(void *addr, size_t size) {} extern void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, dma_addr_t *dma_handle, gfp_t flags, - struct dma_attrs *attrs); + unsigned long attrs); extern void x86_swiotlb_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_addr, - struct dma_attrs *attrs); + unsigned long attrs); #endif /* _ASM_X86_SWIOTLB_H */ diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index 999b7cd2e78c..4e23dd15c661 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -60,7 +60,7 @@ static inline long syscall_get_error(struct task_struct *task, * TS_COMPAT is set for 32-bit syscall entries and then * remains set until we return to user mode. */ - if (task_thread_info(task)->status & TS_COMPAT) + if (task_thread_info(task)->status & (TS_COMPAT|TS_I386_REGS_POKED)) /* * Sign-extend the value so (int)-EFOO becomes (long)-EFOO * and will match correctly in comparisons. @@ -239,9 +239,6 @@ static inline int syscall_get_arch(void) * TS_COMPAT is set for 32-bit syscall entry and then * remains set until we return to user mode. * - * TIF_IA32 tasks should always have TS_COMPAT set at - * system call time. - * * x32 tasks should be considered AUDIT_ARCH_X86_64. */ if (task_thread_info(current)->status & TS_COMPAT) diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 89bff044a6f5..8b7c8d8e0852 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -176,6 +176,50 @@ static inline unsigned long current_stack_pointer(void) return sp; } +/* + * Walks up the stack frames to make sure that the specified object is + * entirely contained by a single stack frame. + * + * Returns: + * 1 if within a frame + * -1 if placed across a frame boundary (or outside stack) + * 0 unable to determine (no frame pointers, etc) + */ +static inline int arch_within_stack_frames(const void * const stack, + const void * const stackend, + const void *obj, unsigned long len) +{ +#if defined(CONFIG_FRAME_POINTER) + const void *frame = NULL; + const void *oldframe; + + oldframe = __builtin_frame_address(1); + if (oldframe) + frame = __builtin_frame_address(2); + /* + * low ----------------------------------------------> high + * [saved bp][saved ip][args][local vars][saved bp][saved ip] + * ^----------------^ + * allow copies only within here + */ + while (stack <= frame && frame < stackend) { + /* + * If obj + len extends past the last frame, this + * check won't pass and the next frame will be 0, + * causing us to bail out and correctly report + * the copy as invalid. + */ + if (obj + len <= frame) + return obj >= oldframe + 2 * sizeof(void *) ? 1 : -1; + oldframe = frame; + frame = *(const void * const *)frame; + } + return -1; +#else + return 0; +#endif +} + #else /* !__ASSEMBLY__ */ #ifdef CONFIG_X86_64 @@ -219,32 +263,11 @@ static inline unsigned long current_stack_pointer(void) * have to worry about atomic accesses. */ #define TS_COMPAT 0x0002 /* 32bit syscall active (64BIT)*/ -#define TS_RESTORE_SIGMASK 0x0008 /* restore signal mask in do_signal() */ +#ifdef CONFIG_COMPAT +#define TS_I386_REGS_POKED 0x0004 /* regs poked by 32-bit ptracer */ +#endif #ifndef __ASSEMBLY__ -#define HAVE_SET_RESTORE_SIGMASK 1 -static inline void set_restore_sigmask(void) -{ - struct thread_info *ti = current_thread_info(); - ti->status |= TS_RESTORE_SIGMASK; - WARN_ON(!test_bit(TIF_SIGPENDING, (unsigned long *)&ti->flags)); -} -static inline void clear_restore_sigmask(void) -{ - current_thread_info()->status &= ~TS_RESTORE_SIGMASK; -} -static inline bool test_restore_sigmask(void) -{ - return current_thread_info()->status & TS_RESTORE_SIGMASK; -} -static inline bool test_and_clear_restore_sigmask(void) -{ - struct thread_info *ti = current_thread_info(); - if (!(ti->status & TS_RESTORE_SIGMASK)) - return false; - ti->status &= ~TS_RESTORE_SIGMASK; - return true; -} static inline bool in_ia32_syscall(void) { diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 4e5be94e079a..6fa85944af83 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -135,7 +135,14 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask) static inline void __native_flush_tlb(void) { + /* + * If current->mm == NULL then we borrow a mm which may change during a + * task switch and therefore we must not be preempted while we write CR3 + * back: + */ + preempt_disable(); native_write_cr3(native_read_cr3()); + preempt_enable(); } static inline void __native_flush_tlb_global_irq_disabled(void) diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 43e87a3dd95c..cf75871d2f81 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -36,6 +36,7 @@ #include <linux/cpumask.h> #include <asm/mpspec.h> +#include <asm/percpu.h> /* Mappings between logical cpu number and node number */ DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map); diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index c03bfb68c503..2131c4ce7d8a 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -433,7 +433,11 @@ do { \ #define __get_user_asm_ex(x, addr, itype, rtype, ltype) \ asm volatile("1: mov"itype" %1,%"rtype"0\n" \ "2:\n" \ - _ASM_EXTABLE_EX(1b, 2b) \ + ".section .fixup,\"ax\"\n" \ + "3:xor"itype" %"rtype"0,%"rtype"0\n" \ + " jmp 2b\n" \ + ".previous\n" \ + _ASM_EXTABLE_EX(1b, 3b) \ : ltype(x) : "m" (__m(addr))) #define __put_user_nocheck(x, ptr, size) \ @@ -697,44 +701,15 @@ unsigned long __must_check _copy_from_user(void *to, const void __user *from, unsigned long __must_check _copy_to_user(void __user *to, const void *from, unsigned n); -#ifdef CONFIG_DEBUG_STRICT_USER_COPY_CHECKS -# define copy_user_diag __compiletime_error -#else -# define copy_user_diag __compiletime_warning -#endif - -extern void copy_user_diag("copy_from_user() buffer size is too small") -copy_from_user_overflow(void); -extern void copy_user_diag("copy_to_user() buffer size is too small") -copy_to_user_overflow(void) __asm__("copy_from_user_overflow"); - -#undef copy_user_diag - -#ifdef CONFIG_DEBUG_STRICT_USER_COPY_CHECKS - -extern void -__compiletime_warning("copy_from_user() buffer size is not provably correct") -__copy_from_user_overflow(void) __asm__("copy_from_user_overflow"); -#define __copy_from_user_overflow(size, count) __copy_from_user_overflow() - -extern void -__compiletime_warning("copy_to_user() buffer size is not provably correct") -__copy_to_user_overflow(void) __asm__("copy_from_user_overflow"); -#define __copy_to_user_overflow(size, count) __copy_to_user_overflow() - -#else +extern void __compiletime_error("usercopy buffer size is too small") +__bad_copy_user(void); -static inline void -__copy_from_user_overflow(int size, unsigned long count) +static inline void copy_user_overflow(int size, unsigned long count) { WARN(1, "Buffer overflow detected (%d < %lu)!\n", size, count); } -#define __copy_to_user_overflow __copy_from_user_overflow - -#endif - -static inline unsigned long __must_check +static __always_inline unsigned long __must_check copy_from_user(void *to, const void __user *from, unsigned long n) { int sz = __compiletime_object_size(to); @@ -743,35 +718,18 @@ copy_from_user(void *to, const void __user *from, unsigned long n) kasan_check_write(to, n); - /* - * While we would like to have the compiler do the checking for us - * even in the non-constant size case, any false positives there are - * a problem (especially when DEBUG_STRICT_USER_COPY_CHECKS, but even - * without - the [hopefully] dangerous looking nature of the warning - * would make people go look at the respecitive call sites over and - * over again just to find that there's no problem). - * - * And there are cases where it's just not realistic for the compiler - * to prove the count to be in range. For example when multiple call - * sites of a helper function - perhaps in different source files - - * all doing proper range checking, yet the helper function not doing - * so again. - * - * Therefore limit the compile time checking to the constant size - * case, and do only runtime checking for non-constant sizes. - */ - - if (likely(sz < 0 || sz >= n)) + if (likely(sz < 0 || sz >= n)) { + check_object_size(to, n, false); n = _copy_from_user(to, from, n); - else if(__builtin_constant_p(n)) - copy_from_user_overflow(); + } else if (!__builtin_constant_p(n)) + copy_user_overflow(sz, n); else - __copy_from_user_overflow(sz, n); + __bad_copy_user(); return n; } -static inline unsigned long __must_check +static __always_inline unsigned long __must_check copy_to_user(void __user *to, const void *from, unsigned long n) { int sz = __compiletime_object_size(from); @@ -780,20 +738,17 @@ copy_to_user(void __user *to, const void *from, unsigned long n) might_fault(); - /* See the comment in copy_from_user() above. */ - if (likely(sz < 0 || sz >= n)) + if (likely(sz < 0 || sz >= n)) { + check_object_size(from, n, true); n = _copy_to_user(to, from, n); - else if(__builtin_constant_p(n)) - copy_to_user_overflow(); + } else if (!__builtin_constant_p(n)) + copy_user_overflow(sz, n); else - __copy_to_user_overflow(sz, n); + __bad_copy_user(); return n; } -#undef __copy_from_user_overflow -#undef __copy_to_user_overflow - /* * We rely on the nested NMI work to allow atomic faults from the NMI path; the * nested NMI paths are careful to preserve CR2. @@ -812,21 +767,21 @@ copy_to_user(void __user *to, const void *from, unsigned long n) #define user_access_begin() __uaccess_begin() #define user_access_end() __uaccess_end() -#define unsafe_put_user(x, ptr) \ -({ \ +#define unsafe_put_user(x, ptr, err_label) \ +do { \ int __pu_err; \ __put_user_size((x), (ptr), sizeof(*(ptr)), __pu_err, -EFAULT); \ - __builtin_expect(__pu_err, 0); \ -}) + if (unlikely(__pu_err)) goto err_label; \ +} while (0) -#define unsafe_get_user(x, ptr) \ -({ \ +#define unsafe_get_user(x, ptr, err_label) \ +do { \ int __gu_err; \ unsigned long __gu_val; \ __get_user_size(__gu_val, (ptr), sizeof(*(ptr)), __gu_err, -EFAULT); \ (x) = (__force __typeof__(*(ptr)))__gu_val; \ - __builtin_expect(__gu_err, 0); \ -}) + if (unlikely(__gu_err)) goto err_label; \ +} while (0) #endif /* _ASM_X86_UACCESS_H */ diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h index 4b32da24faaf..7d3bdd1ed697 100644 --- a/arch/x86/include/asm/uaccess_32.h +++ b/arch/x86/include/asm/uaccess_32.h @@ -37,6 +37,7 @@ unsigned long __must_check __copy_from_user_ll_nocache_nozero static __always_inline unsigned long __must_check __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n) { + check_object_size(from, n, true); return __copy_to_user_ll(to, from, n); } @@ -95,6 +96,7 @@ static __always_inline unsigned long __copy_from_user(void *to, const void __user *from, unsigned long n) { might_fault(); + check_object_size(to, n, false); if (__builtin_constant_p(n)) { unsigned long ret; diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index 2eac2aa3e37f..673059a109fe 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -54,6 +54,7 @@ int __copy_from_user_nocheck(void *dst, const void __user *src, unsigned size) { int ret = 0; + check_object_size(dst, size, false); if (!__builtin_constant_p(size)) return copy_user_generic(dst, (__force void *)src, size); switch (size) { @@ -119,6 +120,7 @@ int __copy_to_user_nocheck(void __user *dst, const void *src, unsigned size) { int ret = 0; + check_object_size(src, size, true); if (!__builtin_constant_p(size)) return copy_user_generic((__force void *)dst, src, size); switch (size) { diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h index c852590254d5..e652a7cc6186 100644 --- a/arch/x86/include/asm/uv/bios.h +++ b/arch/x86/include/asm/uv/bios.h @@ -79,7 +79,7 @@ struct uv_gam_range_entry { u16 nasid; /* HNasid */ u16 sockid; /* Socket ID, high bits of APIC ID */ u16 pnode; /* Index to MMR and GRU spaces */ - u32 pxm; /* ACPI proximity domain number */ + u32 unused2; u32 limit; /* PA bits 56:26 (UV_GAM_RANGE_SHFT) */ }; @@ -88,7 +88,8 @@ struct uv_gam_range_entry { #define UV_SYSTAB_VERSION_UV4 0x400 /* UV4 BIOS base version */ #define UV_SYSTAB_VERSION_UV4_1 0x401 /* + gpa_shift */ #define UV_SYSTAB_VERSION_UV4_2 0x402 /* + TYPE_NVRAM/WINDOW/MBOX */ -#define UV_SYSTAB_VERSION_UV4_LATEST UV_SYSTAB_VERSION_UV4_2 +#define UV_SYSTAB_VERSION_UV4_3 0x403 /* - GAM Range PXM Value */ +#define UV_SYSTAB_VERSION_UV4_LATEST UV_SYSTAB_VERSION_UV4_3 #define UV_SYSTAB_TYPE_UNUSED 0 /* End of table (offset == 0) */ #define UV_SYSTAB_TYPE_GAM_PARAMS 1 /* GAM PARAM conversions */ diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h index cce9ee68e335..0116b2ee9e64 100644 --- a/arch/x86/include/asm/virtext.h +++ b/arch/x86/include/asm/virtext.h @@ -83,23 +83,19 @@ static inline void cpu_emergency_vmxoff(void) */ static inline int cpu_has_svm(const char **msg) { - uint32_t eax, ebx, ecx, edx; - if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) { if (msg) *msg = "not amd"; return 0; } - cpuid(0x80000000, &eax, &ebx, &ecx, &edx); - if (eax < SVM_CPUID_FUNC) { + if (boot_cpu_data.extended_cpuid_level < SVM_CPUID_FUNC) { if (msg) *msg = "can't execute cpuid_8000000a"; return 0; } - cpuid(0x80000001, &eax, &ebx, &ecx, &edx); - if (!(ecx & (1 << SVM_CPUID_FEATURE_SHIFT))) { + if (!boot_cpu_has(X86_FEATURE_SVM)) { if (msg) *msg = "svm not available"; return 0; diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 14c63c7e8337..a002b07a7099 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -72,7 +72,6 @@ #define SECONDARY_EXEC_SHADOW_VMCS 0x00004000 #define SECONDARY_EXEC_ENABLE_PML 0x00020000 #define SECONDARY_EXEC_XSAVES 0x00100000 -#define SECONDARY_EXEC_PCOMMIT 0x00200000 #define SECONDARY_EXEC_TSC_SCALING 0x02000000 #define PIN_BASED_EXT_INTR_MASK 0x00000001 diff --git a/arch/x86/include/asm/xen/page-coherent.h b/arch/x86/include/asm/xen/page-coherent.h index acd844c017d3..f02f025ff988 100644 --- a/arch/x86/include/asm/xen/page-coherent.h +++ b/arch/x86/include/asm/xen/page-coherent.h @@ -2,12 +2,11 @@ #define _ASM_X86_XEN_PAGE_COHERENT_H #include <asm/page.h> -#include <linux/dma-attrs.h> #include <linux/dma-mapping.h> static inline void *xen_alloc_coherent_pages(struct device *hwdev, size_t size, dma_addr_t *dma_handle, gfp_t flags, - struct dma_attrs *attrs) + unsigned long attrs) { void *vstart = (void*)__get_free_pages(flags, get_order(size)); *dma_handle = virt_to_phys(vstart); @@ -16,18 +15,18 @@ static inline void *xen_alloc_coherent_pages(struct device *hwdev, size_t size, static inline void xen_free_coherent_pages(struct device *hwdev, size_t size, void *cpu_addr, dma_addr_t dma_handle, - struct dma_attrs *attrs) + unsigned long attrs) { free_pages((unsigned long) cpu_addr, get_order(size)); } static inline void xen_dma_map_page(struct device *hwdev, struct page *page, dma_addr_t dev_addr, unsigned long offset, size_t size, - enum dma_data_direction dir, struct dma_attrs *attrs) { } + enum dma_data_direction dir, unsigned long attrs) { } static inline void xen_dma_unmap_page(struct device *hwdev, dma_addr_t handle, size_t size, enum dma_data_direction dir, - struct dma_attrs *attrs) { } + unsigned long attrs) { } static inline void xen_dma_sync_single_for_cpu(struct device *hwdev, dma_addr_t handle, size_t size, enum dma_data_direction dir) { } diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index 5b15d94a33f8..37fee272618f 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -78,7 +78,6 @@ #define EXIT_REASON_PML_FULL 62 #define EXIT_REASON_XSAVES 63 #define EXIT_REASON_XRSTORS 64 -#define EXIT_REASON_PCOMMIT 65 #define VMX_EXIT_REASONS \ { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \ @@ -127,8 +126,7 @@ { EXIT_REASON_INVVPID, "INVVPID" }, \ { EXIT_REASON_INVPCID, "INVPCID" }, \ { EXIT_REASON_XSAVES, "XSAVES" }, \ - { EXIT_REASON_XRSTORS, "XRSTORS" }, \ - { EXIT_REASON_PCOMMIT, "PCOMMIT" } + { EXIT_REASON_XRSTORS, "XRSTORS" } #define VMX_ABORT_SAVE_GUEST_MSR_FAIL 1 #define VMX_ABORT_LOAD_HOST_MSR_FAIL 4 diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 6738e5c82cca..90d84c3eee53 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -28,7 +28,7 @@ #include <linux/acpi_pmtmr.h> #include <linux/efi.h> #include <linux/cpumask.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/dmi.h> #include <linux/irq.h> #include <linux/slab.h> diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index 4b28159e0421..bdfad642123f 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -5,7 +5,7 @@ */ #include <linux/kernel.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/init.h> #include <linux/acpi.h> #include <linux/cpu.h> diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index 8e3842fc8bea..63ff468a7986 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -20,7 +20,6 @@ #include <linux/string.h> #include <linux/spinlock.h> #include <linux/pci.h> -#include <linux/module.h> #include <linux/topology.h> #include <linux/interrupt.h> #include <linux/bitmap.h> @@ -242,7 +241,7 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, static dma_addr_t gart_map_page(struct device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction dir, - struct dma_attrs *attrs) + unsigned long attrs) { unsigned long bus; phys_addr_t paddr = page_to_phys(page) + offset; @@ -264,7 +263,7 @@ static dma_addr_t gart_map_page(struct device *dev, struct page *page, */ static void gart_unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size, enum dma_data_direction dir, - struct dma_attrs *attrs) + unsigned long attrs) { unsigned long iommu_page; int npages; @@ -286,7 +285,7 @@ static void gart_unmap_page(struct device *dev, dma_addr_t dma_addr, * Wrapper for pci_unmap_single working with scatterlists. */ static void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, - enum dma_data_direction dir, struct dma_attrs *attrs) + enum dma_data_direction dir, unsigned long attrs) { struct scatterlist *s; int i; @@ -294,7 +293,7 @@ static void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, for_each_sg(sg, s, nents, i) { if (!s->dma_length || !s->length) break; - gart_unmap_page(dev, s->dma_address, s->dma_length, dir, NULL); + gart_unmap_page(dev, s->dma_address, s->dma_length, dir, 0); } } @@ -316,7 +315,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg, addr = dma_map_area(dev, addr, s->length, dir, 0); if (addr == bad_dma_addr) { if (i > 0) - gart_unmap_sg(dev, sg, i, dir, NULL); + gart_unmap_sg(dev, sg, i, dir, 0); nents = 0; sg[0].dma_length = 0; break; @@ -387,7 +386,7 @@ dma_map_cont(struct device *dev, struct scatterlist *start, int nelems, * Merge chunks that have page aligned sizes into a continuous mapping. */ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, - enum dma_data_direction dir, struct dma_attrs *attrs) + enum dma_data_direction dir, unsigned long attrs) { struct scatterlist *s, *ps, *start_sg, *sgmap; int need = 0, nextneed, i, out, start; @@ -457,7 +456,7 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, error: flush_gart(); - gart_unmap_sg(dev, sg, out, dir, NULL); + gart_unmap_sg(dev, sg, out, dir, 0); /* When it was forced or merged try again in a dumb way */ if (force_iommu || iommu_merge) { @@ -477,7 +476,7 @@ error: /* allocate and map a coherent mapping */ static void * gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, - gfp_t flag, struct dma_attrs *attrs) + gfp_t flag, unsigned long attrs) { dma_addr_t paddr; unsigned long align_mask; @@ -509,9 +508,9 @@ gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, /* free a coherent mapping */ static void gart_free_coherent(struct device *dev, size_t size, void *vaddr, - dma_addr_t dma_addr, struct dma_attrs *attrs) + dma_addr_t dma_addr, unsigned long attrs) { - gart_unmap_page(dev, dma_addr, size, DMA_BIDIRECTIONAL, NULL); + gart_unmap_page(dev, dma_addr, size, DMA_BIDIRECTIONAL, 0); dma_generic_free_coherent(dev, size, vaddr, dma_addr, attrs); } diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index e991d5c8bb3a..4fdf6230d93c 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -9,7 +9,7 @@ #include <linux/slab.h> #include <linux/init.h> #include <linux/errno.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/spinlock.h> #include <asm/amd_nb.h> @@ -219,24 +219,22 @@ int amd_set_subcaches(int cpu, unsigned long mask) return 0; } -static int amd_cache_gart(void) +static void amd_cache_gart(void) { u16 i; - if (!amd_nb_has_feature(AMD_NB_GART)) - return 0; - - flush_words = kmalloc(amd_nb_num() * sizeof(u32), GFP_KERNEL); - if (!flush_words) { - amd_northbridges.flags &= ~AMD_NB_GART; - return -ENOMEM; - } + if (!amd_nb_has_feature(AMD_NB_GART)) + return; - for (i = 0; i != amd_nb_num(); i++) - pci_read_config_dword(node_to_amd_nb(i)->misc, 0x9c, - &flush_words[i]); + flush_words = kmalloc(amd_nb_num() * sizeof(u32), GFP_KERNEL); + if (!flush_words) { + amd_northbridges.flags &= ~AMD_NB_GART; + pr_notice("Cannot initialize GART flush words, GART support disabled\n"); + return; + } - return 0; + for (i = 0; i != amd_nb_num(); i++) + pci_read_config_dword(node_to_amd_nb(i)->misc, 0x9c, &flush_words[i]); } void amd_flush_garts(void) @@ -278,17 +276,10 @@ EXPORT_SYMBOL_GPL(amd_flush_garts); static __init int init_amd_nbs(void) { - int err = 0; + amd_cache_northbridges(); + amd_cache_gart(); - err = amd_cache_northbridges(); - - if (err < 0) - pr_notice("Cannot enumerate AMD northbridges\n"); - - if (amd_cache_gart() < 0) - pr_notice("Cannot initialize GART flush words, GART support disabled\n"); - - return err; + return 0; } /* This has to go after the PCI subsystem */ diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index cefacbad1531..456316f6c868 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -215,26 +215,18 @@ void apbt_setup_secondary_clock(void) * cpu timers during the offline process due to the ordering of notification. * the extra interrupt is harmless. */ -static int apbt_cpuhp_notify(struct notifier_block *n, - unsigned long action, void *hcpu) +static int apbt_cpu_dead(unsigned int cpu) { - unsigned long cpu = (unsigned long)hcpu; struct apbt_dev *adev = &per_cpu(cpu_apbt_dev, cpu); - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_DEAD: - dw_apb_clockevent_pause(adev->timer); - if (system_state == SYSTEM_RUNNING) { - pr_debug("skipping APBT CPU %lu offline\n", cpu); - } else { - pr_debug("APBT clockevent for cpu %lu offline\n", cpu); - dw_apb_clockevent_stop(adev->timer); - } - break; - default: - pr_debug("APBT notified %lu, no action\n", action); + dw_apb_clockevent_pause(adev->timer); + if (system_state == SYSTEM_RUNNING) { + pr_debug("skipping APBT CPU %u offline\n", cpu); + } else { + pr_debug("APBT clockevent for cpu %u offline\n", cpu); + dw_apb_clockevent_stop(adev->timer); } - return NOTIFY_OK; + return 0; } static __init int apbt_late_init(void) @@ -242,9 +234,8 @@ static __init int apbt_late_init(void) if (intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT || !apb_timer_block_enabled) return 0; - /* This notifier should be called after workqueue is ready */ - hotcpu_notifier(apbt_cpuhp_notify, -20); - return 0; + return cpuhp_setup_state(CPUHP_X86_APB_DEAD, "X86_APB_DEAD", NULL, + apbt_cpu_dead); } fs_initcall(apbt_late_init); #else diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index ac8d8ad8b009..f3e9b2df4b16 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -23,7 +23,7 @@ #include <linux/bootmem.h> #include <linux/ftrace.h> #include <linux/ioport.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/syscore_ops.h> #include <linux/delay.h> #include <linux/timex.h> @@ -147,7 +147,7 @@ static int force_enable_local_apic __initdata; */ static int __init parse_lapic(char *arg) { - if (config_enabled(CONFIG_X86_32) && !arg) + if (IS_ENABLED(CONFIG_X86_32) && !arg) force_enable_local_apic = 1; else if (arg && !strncmp(arg, "notscdeadline", 13)) setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); @@ -313,7 +313,7 @@ int lapic_get_maxlvt(void) /* Clock divisor */ #define APIC_DIVISOR 16 -#define TSC_DIVISOR 32 +#define TSC_DIVISOR 8 /* * This function sets up the local APIC timer, with a timeout of @@ -565,13 +565,37 @@ static void setup_APIC_timer(void) CLOCK_EVT_FEAT_DUMMY); levt->set_next_event = lapic_next_deadline; clockevents_config_and_register(levt, - (tsc_khz / TSC_DIVISOR) * 1000, + tsc_khz * (1000 / TSC_DIVISOR), 0xF, ~0UL); } else clockevents_register_device(levt); } /* + * Install the updated TSC frequency from recalibration at the TSC + * deadline clockevent devices. + */ +static void __lapic_update_tsc_freq(void *info) +{ + struct clock_event_device *levt = this_cpu_ptr(&lapic_events); + + if (!this_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) + return; + + clockevents_update_freq(levt, tsc_khz * (1000 / TSC_DIVISOR)); +} + +void lapic_update_tsc_freq(void) +{ + /* + * The clockevent device's ->mult and ->shift can both be + * changed. In order to avoid races, schedule the frequency + * update code on each CPU. + */ + on_each_cpu(__lapic_update_tsc_freq, NULL, 0); +} + +/* * In this functions we calibrate APIC bus clocks to the external timer. * * We want to do the calibration only once since we want to have local timer @@ -1599,6 +1623,9 @@ void __init enable_IR_x2apic(void) unsigned long flags; int ret, ir_stat; + if (skip_ioapic_setup) + return; + ir_stat = irq_remapping_prepare(); if (ir_stat < 0 && !x2apic_supported()) return; @@ -2066,7 +2093,6 @@ int generic_processor_info(int apicid, int version) return -EINVAL; } - num_processors++; if (apicid == boot_cpu_physical_apicid) { /* * x86_bios_cpu_apicid is required to have processors listed @@ -2089,10 +2115,13 @@ int generic_processor_info(int apicid, int version) pr_warning("APIC: Package limit reached. Processor %d/0x%x ignored.\n", thiscpu, apicid); + disabled_cpus++; return -ENOSPC; } + num_processors++; + /* * Validate version */ diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 048747778d37..5b2ae106bd4a 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -15,7 +15,7 @@ #include <linux/kernel.h> #include <linux/ctype.h> #include <linux/hardirq.h> -#include <linux/module.h> +#include <linux/export.h> #include <asm/smp.h> #include <asm/apic.h> #include <asm/ipi.h> diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 2cebf59092d8..c05688b2deff 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -11,7 +11,6 @@ #include <linux/threads.h> #include <linux/cpumask.h> -#include <linux/module.h> #include <linux/string.h> #include <linux/kernel.h> #include <linux/ctype.h> diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 7788ce643bf4..f29501e1a5c1 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -16,7 +16,7 @@ #include <linux/notifier.h> #include <linux/kprobes.h> #include <linux/nmi.h> -#include <linux/module.h> +#include <linux/init.h> #include <linux/delay.h> #ifdef CONFIG_HARDLOCKUP_DETECTOR diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index f072b9572634..7491f417a8e4 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -39,7 +39,7 @@ #include <linux/mc146818rtc.h> #include <linux/compiler.h> #include <linux/acpi.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/syscore_ops.h> #include <linux/freezer.h> #include <linux/kthread.h> diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index 2a0f225afebd..3a205d4a12d0 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c @@ -8,7 +8,6 @@ #include <linux/mc146818rtc.h> #include <linux/cache.h> #include <linux/cpu.h> -#include <linux/module.h> #include <asm/smp.h> #include <asm/mtrr.h> diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 93edfa01b408..7c43e716c158 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -8,7 +8,7 @@ */ #include <linux/threads.h> #include <linux/cpumask.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/string.h> #include <linux/kernel.h> #include <linux/ctype.h> diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index 1793dba7a741..c303054b90b5 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -11,10 +11,9 @@ #include <linux/threads.h> #include <linux/cpumask.h> #include <linux/string.h> -#include <linux/module.h> +#include <linux/init.h> #include <linux/kernel.h> #include <linux/ctype.h> -#include <linux/init.h> #include <linux/hardirq.h> #include <linux/dmar.h> diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index a5e400afc563..6066d945c40e 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -523,7 +523,7 @@ static int apic_set_affinity(struct irq_data *irq_data, struct apic_chip_data *data = irq_data->chip_data; int err, irq = irq_data->irq; - if (!config_enabled(CONFIG_SMP)) + if (!IS_ENABLED(CONFIG_SMP)) return -EPERM; if (!cpumask_intersects(dest, cpu_online_mask)) diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 24170d0809ba..54f35d988025 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -152,68 +152,53 @@ static void init_x2apic_ldr(void) } } - /* - * At CPU state changes, update the x2apic cluster sibling info. - */ -static int -update_clusterinfo(struct notifier_block *nfb, unsigned long action, void *hcpu) +/* + * At CPU state changes, update the x2apic cluster sibling info. + */ +static int x2apic_prepare_cpu(unsigned int cpu) { - unsigned int this_cpu = (unsigned long)hcpu; - unsigned int cpu; - int err = 0; - - switch (action) { - case CPU_UP_PREPARE: - if (!zalloc_cpumask_var(&per_cpu(cpus_in_cluster, this_cpu), - GFP_KERNEL)) { - err = -ENOMEM; - } else if (!zalloc_cpumask_var(&per_cpu(ipi_mask, this_cpu), - GFP_KERNEL)) { - free_cpumask_var(per_cpu(cpus_in_cluster, this_cpu)); - err = -ENOMEM; - } - break; - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - case CPU_DEAD: - for_each_online_cpu(cpu) { - if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu)) - continue; - cpumask_clear_cpu(this_cpu, per_cpu(cpus_in_cluster, cpu)); - cpumask_clear_cpu(cpu, per_cpu(cpus_in_cluster, this_cpu)); - } - free_cpumask_var(per_cpu(cpus_in_cluster, this_cpu)); - free_cpumask_var(per_cpu(ipi_mask, this_cpu)); - break; + if (!zalloc_cpumask_var(&per_cpu(cpus_in_cluster, cpu), GFP_KERNEL)) + return -ENOMEM; + + if (!zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL)) { + free_cpumask_var(per_cpu(cpus_in_cluster, cpu)); + return -ENOMEM; } - return notifier_from_errno(err); + return 0; } -static struct notifier_block x2apic_cpu_notifier = { - .notifier_call = update_clusterinfo, -}; - -static int x2apic_init_cpu_notifier(void) +static int x2apic_dead_cpu(unsigned int this_cpu) { - int cpu = smp_processor_id(); - - zalloc_cpumask_var(&per_cpu(cpus_in_cluster, cpu), GFP_KERNEL); - zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL); - - BUG_ON(!per_cpu(cpus_in_cluster, cpu) || !per_cpu(ipi_mask, cpu)); + int cpu; - cpumask_set_cpu(cpu, per_cpu(cpus_in_cluster, cpu)); - register_hotcpu_notifier(&x2apic_cpu_notifier); - return 1; + for_each_online_cpu(cpu) { + if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu)) + continue; + cpumask_clear_cpu(this_cpu, per_cpu(cpus_in_cluster, cpu)); + cpumask_clear_cpu(cpu, per_cpu(cpus_in_cluster, this_cpu)); + } + free_cpumask_var(per_cpu(cpus_in_cluster, this_cpu)); + free_cpumask_var(per_cpu(ipi_mask, this_cpu)); + return 0; } static int x2apic_cluster_probe(void) { - if (x2apic_mode) - return x2apic_init_cpu_notifier(); - else + int cpu = smp_processor_id(); + int ret; + + if (!x2apic_mode) + return 0; + + ret = cpuhp_setup_state(CPUHP_X2APIC_PREPARE, "X2APIC_PREPARE", + x2apic_prepare_cpu, x2apic_dead_cpu); + if (ret < 0) { + pr_err("Failed to register X2APIC_PREPARE\n"); return 0; + } + cpumask_set_cpu(cpu, per_cpu(cpus_in_cluster, cpu)); + return 1; } static const struct cpumask *x2apic_cluster_target_cpus(void) diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 64dd38fbf218..cb0673c1e940 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -12,7 +12,7 @@ #include <linux/proc_fs.h> #include <linux/threads.h> #include <linux/kernel.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/string.h> #include <linux/ctype.h> #include <linux/sched.h> @@ -223,6 +223,11 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) if (strncmp(oem_id, "SGI", 3) != 0) return 0; + if (numa_off) { + pr_err("UV: NUMA is off, disabling UV support\n"); + return 0; + } + /* Setup early hub type field in uv_hub_info for Node 0 */ uv_cpu_info->p_uv_hub_info = &uv_hub_info_node0; @@ -325,7 +330,7 @@ static __init void build_uv_gr_table(void) struct uv_gam_range_entry *gre = uv_gre_table; struct uv_gam_range_s *grt; unsigned long last_limit = 0, ram_limit = 0; - int bytes, i, sid, lsid = -1; + int bytes, i, sid, lsid = -1, indx = 0, lindx = -1; if (!gre) return; @@ -356,11 +361,12 @@ static __init void build_uv_gr_table(void) } sid = gre->sockid - _min_socket; if (lsid < sid) { /* new range */ - grt = &_gr_table[sid]; - grt->base = lsid; + grt = &_gr_table[indx]; + grt->base = lindx; grt->nasid = gre->nasid; grt->limit = last_limit = gre->limit; lsid = sid; + lindx = indx++; continue; } if (lsid == sid && !ram_limit) { /* update range */ @@ -371,7 +377,7 @@ static __init void build_uv_gr_table(void) } if (!ram_limit) { /* non-contiguous ram range */ grt++; - grt->base = sid - 1; + grt->base = lindx; grt->nasid = gre->nasid; grt->limit = last_limit = gre->limit; continue; @@ -1155,19 +1161,18 @@ static void __init decode_gam_rng_tbl(unsigned long ptr) for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) { if (!index) { pr_info("UV: GAM Range Table...\n"); - pr_info("UV: # %20s %14s %5s %4s %5s %3s %2s %3s\n", + pr_info("UV: # %20s %14s %5s %4s %5s %3s %2s\n", "Range", "", "Size", "Type", "NASID", - "SID", "PN", "PXM"); + "SID", "PN"); } pr_info( - "UV: %2d: 0x%014lx-0x%014lx %5luG %3d %04x %02x %02x %3d\n", + "UV: %2d: 0x%014lx-0x%014lx %5luG %3d %04x %02x %02x\n", index++, (unsigned long)lgre << UV_GAM_RANGE_SHFT, (unsigned long)gre->limit << UV_GAM_RANGE_SHFT, ((unsigned long)(gre->limit - lgre)) >> (30 - UV_GAM_RANGE_SHFT), /* 64M -> 1G */ - gre->type, gre->nasid, gre->sockid, - gre->pnode, gre->pxm); + gre->type, gre->nasid, gre->sockid, gre->pnode); lgre = gre->limit; if (sock_min > gre->sockid) @@ -1286,7 +1291,7 @@ static void __init build_socket_tables(void) _pnode_to_socket[i] = SOCK_EMPTY; /* fill in pnode/node/addr conversion list values */ - pr_info("UV: GAM Building socket/pnode/pxm conversion tables\n"); + pr_info("UV: GAM Building socket/pnode conversion tables\n"); for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) { if (gre->type == UV_GAM_RANGE_TYPE_HOLE) continue; @@ -1294,20 +1299,18 @@ static void __init build_socket_tables(void) if (_socket_to_pnode[i] != SOCK_EMPTY) continue; /* duplicate */ _socket_to_pnode[i] = gre->pnode; - _socket_to_node[i] = gre->pxm; i = gre->pnode - minpnode; _pnode_to_socket[i] = gre->sockid; pr_info( - "UV: sid:%02x type:%d nasid:%04x pn:%02x pxm:%2d pn2s:%2x\n", + "UV: sid:%02x type:%d nasid:%04x pn:%02x pn2s:%2x\n", gre->sockid, gre->type, gre->nasid, _socket_to_pnode[gre->sockid - minsock], - _socket_to_node[gre->sockid - minsock], _pnode_to_socket[gre->pnode - minpnode]); } - /* check socket -> node values */ + /* Set socket -> node values */ lnid = -1; for_each_present_cpu(cpu) { int nid = cpu_to_node(cpu); @@ -1318,14 +1321,9 @@ static void __init build_socket_tables(void) lnid = nid; apicid = per_cpu(x86_cpu_to_apicid, cpu); sockid = apicid >> uv_cpuid.socketid_shift; - i = sockid - minsock; - - if (nid != _socket_to_node[i]) { - pr_warn( - "UV: %02x: type:%d socket:%02x PXM:%02x != node:%2d\n", - i, sockid, gre->type, _socket_to_node[i], nid); - _socket_to_node[i] = nid; - } + _socket_to_node[sockid - minsock] = nid; + pr_info("UV: sid:%02x: apicid:%04x node:%2d\n", + sockid, apicid, nid); } /* Setup physical blade to pnode translation from GAM Range Table */ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index f5c69d8974e1..b81fe2d63e15 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -669,6 +669,17 @@ static void init_amd_gh(struct cpuinfo_x86 *c) set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH); } +#define MSR_AMD64_DE_CFG 0xC0011029 + +static void init_amd_ln(struct cpuinfo_x86 *c) +{ + /* + * Apply erratum 665 fix unconditionally so machines without a BIOS + * fix work. + */ + msr_set_bit(MSR_AMD64_DE_CFG, 31); +} + static void init_amd_bd(struct cpuinfo_x86 *c) { u64 value; @@ -726,6 +737,7 @@ static void init_amd(struct cpuinfo_x86 *c) case 6: init_amd_k7(c); break; case 0xf: init_amd_k8(c); break; case 0x10: init_amd_gh(c); break; + case 0x12: init_amd_ln(c); break; case 0x15: init_amd_bd(c); break; } diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index d22a7b9c4f0e..809eda03c527 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -2,7 +2,7 @@ #include <linux/linkage.h> #include <linux/bitops.h> #include <linux/kernel.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/percpu.h> #include <linux/string.h> #include <linux/ctype.h> diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index 73d391ae452f..27e46658ebe3 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -21,7 +21,8 @@ * */ -#include <linux/module.h> +#include <linux/init.h> +#include <linux/export.h> #include <asm/processor.h> #include <asm/hypervisor.h> diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index c1a89bc026ac..fcd484d2bb03 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -5,7 +5,7 @@ #include <linux/smp.h> #include <linux/sched.h> #include <linux/thread_info.h> -#include <linux/module.h> +#include <linux/init.h> #include <linux/uaccess.h> #include <asm/cpufeature.h> @@ -13,6 +13,7 @@ #include <asm/msr.h> #include <asm/bugs.h> #include <asm/cpu.h> +#include <asm/intel-family.h> #ifdef CONFIG_X86_64 #include <linux/topology.h> @@ -508,6 +509,10 @@ static void init_intel(struct cpuinfo_x86 *c) (c->x86_model == 29 || c->x86_model == 46 || c->x86_model == 47)) set_cpu_bug(c, X86_BUG_CLFLUSH_MONITOR); + if (c->x86 == 6 && boot_cpu_has(X86_FEATURE_MWAIT) && + ((c->x86_model == INTEL_FAM6_ATOM_GOLDMONT))) + set_cpu_bug(c, X86_BUG_MONITOR); + #ifdef CONFIG_X86_64 if (c->x86 == 15) c->x86_cache_alignment = c->x86_clflush_size * 2; diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c index fbb5e90557a5..e42117d5f4d7 100644 --- a/arch/x86/kernel/cpu/match.c +++ b/arch/x86/kernel/cpu/match.c @@ -1,7 +1,7 @@ #include <asm/cpu_device_id.h> #include <asm/cpufeature.h> #include <linux/cpu.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/slab.h> /** diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 8581963894c7..620ab06bcf45 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -54,26 +54,27 @@ static LIST_HEAD(pcache); */ static u8 *container; static size_t container_size; +static bool ucode_builtin; static u32 ucode_new_rev; -u8 amd_ucode_patch[PATCH_MAX_SIZE]; +static u8 amd_ucode_patch[PATCH_MAX_SIZE]; static u16 this_equiv_id; static struct cpio_data ucode_cpio; -/* - * Microcode patch container file is prepended to the initrd in cpio format. - * See Documentation/x86/early-microcode.txt - */ -static __initdata char ucode_path[] = "kernel/x86/microcode/AuthenticAMD.bin"; - static struct cpio_data __init find_ucode_in_initrd(void) { - long offset = 0; +#ifdef CONFIG_BLK_DEV_INITRD char *path; void *start; size_t size; + /* + * Microcode patch container file is prepended to the initrd in cpio + * format. See Documentation/x86/early-microcode.txt + */ + static __initdata char ucode_path[] = "kernel/x86/microcode/AuthenticAMD.bin"; + #ifdef CONFIG_X86_32 struct boot_params *p; @@ -89,9 +90,12 @@ static struct cpio_data __init find_ucode_in_initrd(void) path = ucode_path; start = (void *)(boot_params.hdr.ramdisk_image + PAGE_OFFSET); size = boot_params.hdr.ramdisk_size; -#endif +#endif /* !CONFIG_X86_32 */ - return find_cpio_data(path, start, size, &offset); + return find_cpio_data(path, start, size, NULL); +#else + return (struct cpio_data){ NULL, 0, "" }; +#endif } static size_t compute_container_size(u8 *data, u32 total_size) @@ -278,22 +282,26 @@ static bool __init load_builtin_amd_microcode(struct cpio_data *cp, void __init load_ucode_amd_bsp(unsigned int family) { struct cpio_data cp; + bool *builtin; void **data; size_t *size; #ifdef CONFIG_X86_32 data = (void **)__pa_nodebug(&ucode_cpio.data); size = (size_t *)__pa_nodebug(&ucode_cpio.size); + builtin = (bool *)__pa_nodebug(&ucode_builtin); #else data = &ucode_cpio.data; size = &ucode_cpio.size; + builtin = &ucode_builtin; #endif - cp = find_ucode_in_initrd(); - if (!cp.data) { - if (!load_builtin_amd_microcode(&cp, family)) - return; - } + *builtin = load_builtin_amd_microcode(&cp, family); + if (!*builtin) + cp = find_ucode_in_initrd(); + + if (!(cp.data && cp.size)) + return; *data = cp.data; *size = cp.size; @@ -352,6 +360,7 @@ void load_ucode_amd_ap(void) unsigned int cpu = smp_processor_id(); struct equiv_cpu_entry *eq; struct microcode_amd *mc; + u8 *cont = container; u32 rev, eax; u16 eq_id; @@ -368,8 +377,12 @@ void load_ucode_amd_ap(void) if (check_current_patch_level(&rev, false)) return; + /* Add CONFIG_RANDOMIZE_MEMORY offset. */ + if (!ucode_builtin) + cont += PAGE_OFFSET - __PAGE_OFFSET_BASE; + eax = cpuid_eax(0x00000001); - eq = (struct equiv_cpu_entry *)(container + CONTAINER_HDR_SZ); + eq = (struct equiv_cpu_entry *)(cont + CONTAINER_HDR_SZ); eq_id = find_equiv_id(eq, eax); if (!eq_id) @@ -431,6 +444,10 @@ int __init save_microcode_in_initrd_amd(void) else container = cont_va; + /* Add CONFIG_RANDOMIZE_MEMORY offset. */ + if (!ucode_builtin) + container += PAGE_OFFSET - __PAGE_OFFSET_BASE; + eax = cpuid_eax(0x00000001); eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index ac360bfbbdb6..df04b2d033f6 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -60,7 +60,6 @@ static bool dis_ucode_ldr; static DEFINE_MUTEX(microcode_mutex); struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; -EXPORT_SYMBOL_GPL(ucode_cpu_info); /* * Operations that are run on a target cpu: @@ -175,24 +174,24 @@ void load_ucode_ap(void) } } -int __init save_microcode_in_initrd(void) +static int __init save_microcode_in_initrd(void) { struct cpuinfo_x86 *c = &boot_cpu_data; switch (c->x86_vendor) { case X86_VENDOR_INTEL: if (c->x86 >= 6) - save_microcode_in_initrd_intel(); + return save_microcode_in_initrd_intel(); break; case X86_VENDOR_AMD: if (c->x86 >= 0x10) - save_microcode_in_initrd_amd(); + return save_microcode_in_initrd_amd(); break; default: break; } - return 0; + return -EINVAL; } void reload_early_microcode(void) @@ -691,4 +690,5 @@ int __init microcode_init(void) return error; } +fs_initcall(save_microcode_in_initrd); late_initcall(microcode_init); diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 65cbbcd48fe4..cdc0deab00c9 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -40,9 +40,13 @@ #include <asm/msr.h> /* - * Temporary microcode blobs pointers storage. We note here the pointers to - * microcode blobs we've got from whatever storage (detached initrd, builtin). - * Later on, we put those into final storage mc_saved_data.mc_saved. + * Temporary microcode blobs pointers storage. We note here during early load + * the pointers to microcode blobs we've got from whatever storage (detached + * initrd, builtin). Later on, we put those into final storage + * mc_saved_data.mc_saved. + * + * Important: those are offsets from the beginning of initrd or absolute + * addresses within the kernel image when built-in. */ static unsigned long mc_tmp_ptrs[MAX_UCODE_COUNT]; @@ -51,8 +55,15 @@ static struct mc_saved_data { struct microcode_intel **mc_saved; } mc_saved_data; +/* Microcode blobs within the initrd. 0 if builtin. */ +static struct ucode_blobs { + unsigned long start; + bool valid; +} blobs; + +/* Go through saved patches and find the one suitable for the current CPU. */ static enum ucode_state -load_microcode_early(struct microcode_intel **saved, +find_microcode_patch(struct microcode_intel **saved, unsigned int num_saved, struct ucode_cpu_info *uci) { struct microcode_intel *ucode_ptr, *new_mc = NULL; @@ -121,13 +132,13 @@ load_microcode(struct mc_saved_data *mcs, unsigned long *mc_ptrs, if (!mcs->mc_saved) { copy_ptrs(mc_saved_tmp, mc_ptrs, offset, count); - return load_microcode_early(mc_saved_tmp, count, uci); + return find_microcode_patch(mc_saved_tmp, count, uci); } else { #ifdef CONFIG_X86_32 microcode_phys(mc_saved_tmp, mcs); - return load_microcode_early(mc_saved_tmp, count, uci); + return find_microcode_patch(mc_saved_tmp, count, uci); #else - return load_microcode_early(mcs->mc_saved, count, uci); + return find_microcode_patch(mcs->mc_saved, count, uci); #endif } } @@ -450,8 +461,6 @@ static void show_saved_mc(void) #endif } -#ifdef CONFIG_HOTPLUG_CPU -static DEFINE_MUTEX(x86_cpu_microcode_mutex); /* * Save this mc into mc_saved_data. So it will be loaded early when a CPU is * hot added or resumes. @@ -459,19 +468,18 @@ static DEFINE_MUTEX(x86_cpu_microcode_mutex); * Please make sure this mc should be a valid microcode patch before calling * this function. */ -int save_mc_for_early(u8 *mc) +static void save_mc_for_early(u8 *mc) { +#ifdef CONFIG_HOTPLUG_CPU + /* Synchronization during CPU hotplug. */ + static DEFINE_MUTEX(x86_cpu_microcode_mutex); + struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; unsigned int mc_saved_count_init; unsigned int num_saved; struct microcode_intel **mc_saved; - int ret = 0; - int i; + int ret, i; - /* - * Hold hotplug lock so mc_saved_data is not accessed by a CPU in - * hotplug. - */ mutex_lock(&x86_cpu_microcode_mutex); mc_saved_count_init = mc_saved_data.num_saved; @@ -509,11 +517,8 @@ int save_mc_for_early(u8 *mc) out: mutex_unlock(&x86_cpu_microcode_mutex); - - return ret; -} -EXPORT_SYMBOL_GPL(save_mc_for_early); #endif +} static bool __init load_builtin_intel_microcode(struct cpio_data *cp) { @@ -532,37 +537,6 @@ static bool __init load_builtin_intel_microcode(struct cpio_data *cp) #endif } -static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin"; -static __init enum ucode_state -scan_microcode(struct mc_saved_data *mcs, unsigned long *mc_ptrs, - unsigned long start, unsigned long size, - struct ucode_cpu_info *uci) -{ - struct cpio_data cd; - long offset = 0; -#ifdef CONFIG_X86_32 - char *p = (char *)__pa_nodebug(ucode_name); -#else - char *p = ucode_name; -#endif - - cd.data = NULL; - cd.size = 0; - - /* try built-in microcode if no initrd */ - if (!size) { - if (!load_builtin_intel_microcode(&cd)) - return UCODE_ERROR; - } else { - cd = find_cpio_data(p, (void *)start, size, &offset); - if (!cd.data) - return UCODE_ERROR; - } - - return get_matching_model_microcode(start, cd.data, cd.size, - mcs, mc_ptrs, uci); -} - /* * Print ucode update info. */ @@ -680,38 +654,117 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) */ int __init save_microcode_in_initrd_intel(void) { - unsigned int count = mc_saved_data.num_saved; struct microcode_intel *mc_saved[MAX_UCODE_COUNT]; - int ret = 0; + unsigned int count = mc_saved_data.num_saved; + unsigned long offset = 0; + int ret; if (!count) - return ret; + return 0; + + /* + * We have found a valid initrd but it might've been relocated in the + * meantime so get its updated address. + */ + if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && blobs.valid) + offset = initrd_start; - copy_ptrs(mc_saved, mc_tmp_ptrs, get_initrd_start(), count); + copy_ptrs(mc_saved, mc_tmp_ptrs, offset, count); ret = save_microcode(&mc_saved_data, mc_saved, count); if (ret) pr_err("Cannot save microcode patches from initrd.\n"); - - show_saved_mc(); + else + show_saved_mc(); return ret; } +static __init enum ucode_state +__scan_microcode_initrd(struct cpio_data *cd, struct ucode_blobs *blbp) +{ +#ifdef CONFIG_BLK_DEV_INITRD + static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin"; + char *p = IS_ENABLED(CONFIG_X86_32) ? (char *)__pa_nodebug(ucode_name) + : ucode_name; +# ifdef CONFIG_X86_32 + unsigned long start = 0, size; + struct boot_params *params; + + params = (struct boot_params *)__pa_nodebug(&boot_params); + size = params->hdr.ramdisk_size; + + /* + * Set start only if we have an initrd image. We cannot use initrd_start + * because it is not set that early yet. + */ + start = (size ? params->hdr.ramdisk_image : 0); + +# else /* CONFIG_X86_64 */ + unsigned long start = 0, size; + + size = (u64)boot_params.ext_ramdisk_size << 32; + size |= boot_params.hdr.ramdisk_size; + + if (size) { + start = (u64)boot_params.ext_ramdisk_image << 32; + start |= boot_params.hdr.ramdisk_image; + + start += PAGE_OFFSET; + } +# endif + + *cd = find_cpio_data(p, (void *)start, size, NULL); + if (cd->data) { + blbp->start = start; + blbp->valid = true; + + return UCODE_OK; + } else +#endif /* CONFIG_BLK_DEV_INITRD */ + return UCODE_ERROR; +} + +static __init enum ucode_state +scan_microcode(struct mc_saved_data *mcs, unsigned long *mc_ptrs, + struct ucode_cpu_info *uci, struct ucode_blobs *blbp) +{ + struct cpio_data cd = { NULL, 0, "" }; + enum ucode_state ret; + + /* try built-in microcode first */ + if (load_builtin_intel_microcode(&cd)) + /* + * Invalidate blobs as we might've gotten an initrd too, + * supplied by the boot loader, by mistake or simply forgotten + * there. That's fine, we ignore it since we've found builtin + * microcode already. + */ + blbp->valid = false; + else { + ret = __scan_microcode_initrd(&cd, blbp); + if (ret != UCODE_OK) + return ret; + } + + return get_matching_model_microcode(blbp->start, cd.data, cd.size, + mcs, mc_ptrs, uci); +} + static void __init _load_ucode_intel_bsp(struct mc_saved_data *mcs, unsigned long *mc_ptrs, - unsigned long start, unsigned long size) + struct ucode_blobs *blbp) { struct ucode_cpu_info uci; enum ucode_state ret; collect_cpu_info_early(&uci); - ret = scan_microcode(mcs, mc_ptrs, start, size, &uci); + ret = scan_microcode(mcs, mc_ptrs, &uci, blbp); if (ret != UCODE_OK) return; - ret = load_microcode(mcs, mc_ptrs, start, &uci); + ret = load_microcode(mcs, mc_ptrs, blbp->start, &uci); if (ret != UCODE_OK) return; @@ -720,54 +773,60 @@ _load_ucode_intel_bsp(struct mc_saved_data *mcs, unsigned long *mc_ptrs, void __init load_ucode_intel_bsp(void) { - u64 start, size; -#ifdef CONFIG_X86_32 - struct boot_params *p; - - p = (struct boot_params *)__pa_nodebug(&boot_params); - size = p->hdr.ramdisk_size; + struct ucode_blobs *blobs_p; + struct mc_saved_data *mcs; + unsigned long *ptrs; - /* - * Set start only if we have an initrd image. We cannot use initrd_start - * because it is not set that early yet. - */ - start = (size ? p->hdr.ramdisk_image : 0); - - _load_ucode_intel_bsp((struct mc_saved_data *)__pa_nodebug(&mc_saved_data), - (unsigned long *)__pa_nodebug(&mc_tmp_ptrs), - start, size); +#ifdef CONFIG_X86_32 + mcs = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data); + ptrs = (unsigned long *)__pa_nodebug(&mc_tmp_ptrs); + blobs_p = (struct ucode_blobs *)__pa_nodebug(&blobs); #else - size = boot_params.hdr.ramdisk_size; - start = (size ? boot_params.hdr.ramdisk_image + PAGE_OFFSET : 0); - - _load_ucode_intel_bsp(&mc_saved_data, mc_tmp_ptrs, start, size); + mcs = &mc_saved_data; + ptrs = mc_tmp_ptrs; + blobs_p = &blobs; #endif + + _load_ucode_intel_bsp(mcs, ptrs, blobs_p); } void load_ucode_intel_ap(void) { - unsigned long *mcs_tmp_p; - struct mc_saved_data *mcs_p; + struct ucode_blobs *blobs_p; + unsigned long *ptrs, start = 0; + struct mc_saved_data *mcs; struct ucode_cpu_info uci; enum ucode_state ret; -#ifdef CONFIG_X86_32 - mcs_tmp_p = (unsigned long *)__pa_nodebug(mc_tmp_ptrs); - mcs_p = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data); +#ifdef CONFIG_X86_32 + mcs = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data); + ptrs = (unsigned long *)__pa_nodebug(mc_tmp_ptrs); + blobs_p = (struct ucode_blobs *)__pa_nodebug(&blobs); #else - mcs_tmp_p = mc_tmp_ptrs; - mcs_p = &mc_saved_data; + mcs = &mc_saved_data; + ptrs = mc_tmp_ptrs; + blobs_p = &blobs; #endif /* * If there is no valid ucode previously saved in memory, no need to * update ucode on this AP. */ - if (!mcs_p->num_saved) + if (!mcs->num_saved) return; + if (blobs_p->valid) { + start = blobs_p->start; + + /* + * Pay attention to CONFIG_RANDOMIZE_MEMORY=y as it shuffles + * physmem mapping too and there we have the initrd. + */ + start += PAGE_OFFSET - __PAGE_OFFSET_BASE; + } + collect_cpu_info_early(&uci); - ret = load_microcode(mcs_p, mcs_tmp_p, get_initrd_start_addr(), &uci); + ret = load_microcode(mcs, ptrs, start, &uci); if (ret != UCODE_OK) return; @@ -784,7 +843,7 @@ void reload_ucode_intel(void) collect_cpu_info_early(&uci); - ret = load_microcode_early(mc_saved_data.mc_saved, + ret = find_microcode_patch(mc_saved_data.mc_saved, mc_saved_data.num_saved, &uci); if (ret != UCODE_OK) return; @@ -794,6 +853,7 @@ void reload_ucode_intel(void) static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) { + static struct cpu_signature prev; struct cpuinfo_x86 *c = &cpu_data(cpu_num); unsigned int val[2]; @@ -808,8 +868,13 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) } csig->rev = c->microcode; - pr_info("CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n", - cpu_num, csig->sig, csig->pf, csig->rev); + + /* No extra locking on prev, races are harmless. */ + if (csig->sig != prev.sig || csig->pf != prev.pf || csig->rev != prev.rev) { + pr_info("sig=0x%x, pf=0x%x, revision=0x%x\n", + csig->sig, csig->pf, csig->rev); + prev = *csig; + } return 0; } @@ -838,6 +903,7 @@ static int apply_microcode_intel(int cpu) struct ucode_cpu_info *uci; struct cpuinfo_x86 *c; unsigned int val[2]; + static int prev_rev; /* We should bind the task to the CPU */ if (WARN_ON(raw_smp_processor_id() != cpu)) @@ -872,11 +938,14 @@ static int apply_microcode_intel(int cpu) return -1; } - pr_info("CPU%d updated to revision 0x%x, date = %04x-%02x-%02x\n", - cpu, val[1], - mc->hdr.date & 0xffff, - mc->hdr.date >> 24, - (mc->hdr.date >> 16) & 0xff); + if (val[1] != prev_rev) { + pr_info("updated to revision 0x%x, date = %04x-%02x-%02x\n", + val[1], + mc->hdr.date & 0xffff, + mc->hdr.date >> 24, + (mc->hdr.date >> 16) & 0xff); + prev_rev = val[1]; + } c = &cpu_data(cpu); diff --git a/arch/x86/kernel/cpu/microcode/intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c index 2ce1a7dc45b7..406cb6c0d9dd 100644 --- a/arch/x86/kernel/cpu/microcode/intel_lib.c +++ b/arch/x86/kernel/cpu/microcode/intel_lib.c @@ -141,7 +141,6 @@ int microcode_sanity_check(void *mc, int print_err) } return 0; } -EXPORT_SYMBOL_GPL(microcode_sanity_check); /* * Returns 1 if update has been found, 0 otherwise. @@ -183,4 +182,3 @@ int has_newer_microcode(void *mc, unsigned int csig, int cpf, int new_rev) return find_matching_signature(mc, csig, cpf); } -EXPORT_SYMBOL_GPL(has_newer_microcode); diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 10c11b4da31d..8f44c5a50ab8 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -13,7 +13,8 @@ #include <linux/types.h> #include <linux/time.h> #include <linux/clocksource.h> -#include <linux/module.h> +#include <linux/init.h> +#include <linux/export.h> #include <linux/hardirq.h> #include <linux/efi.h> #include <linux/interrupt.h> diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 31e951ce6dff..3b442b64c72d 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -17,7 +17,6 @@ * License along with this library; if not, write to the Free * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -#include <linux/module.h> #include <linux/init.h> #include <linux/pci.h> #include <linux/smp.h> diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 16e37a2581ac..fdc55215d44d 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -4,7 +4,7 @@ */ #define DEBUG -#include <linux/module.h> +#include <linux/export.h> #include <linux/init.h> #include <linux/io.h> #include <linux/mm.h> diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c index d76f13d6d8d6..6d9b45549109 100644 --- a/arch/x86/kernel/cpu/mtrr/if.c +++ b/arch/x86/kernel/cpu/mtrr/if.c @@ -2,7 +2,6 @@ #include <linux/seq_file.h> #include <linux/uaccess.h> #include <linux/proc_fs.h> -#include <linux/module.h> #include <linux/ctype.h> #include <linux/string.h> #include <linux/slab.h> diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 7d393ecdeee6..28f1b54b7fad 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -38,7 +38,7 @@ #include <linux/stop_machine.h> #include <linux/kvm_para.h> #include <linux/uaccess.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/mutex.h> #include <linux/init.h> #include <linux/sort.h> diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index 2e8caf03f593..181eabecae25 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -12,7 +12,7 @@ */ #include <linux/percpu.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/kernel.h> #include <linux/bitops.h> #include <linux/smp.h> diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 8cac429b6a1d..1ff0598d309c 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -22,7 +22,8 @@ */ #include <linux/dmi.h> -#include <linux/module.h> +#include <linux/init.h> +#include <linux/export.h> #include <asm/div64.h> #include <asm/x86_init.h> #include <asm/hypervisor.h> diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 9ef978d69c22..9616cf76940c 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -20,7 +20,7 @@ #include <linux/delay.h> #include <linux/elf.h> #include <linux/elfcore.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/slab.h> #include <linux/vmalloc.h> diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 948d77da3881..09675712eba8 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -7,7 +7,7 @@ #include <linux/uaccess.h> #include <linux/hardirq.h> #include <linux/kdebug.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/ptrace.h> #include <linux/kexec.h> #include <linux/sysfs.h> diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 6dede08dd98b..9ee4520ce83c 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -7,7 +7,7 @@ #include <linux/uaccess.h> #include <linux/hardirq.h> #include <linux/kdebug.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/ptrace.h> #include <linux/kexec.h> #include <linux/sysfs.h> diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 57b71373bae3..de7501edb21c 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -237,36 +237,19 @@ static void __init intel_remapping_check(int num, int slot, int func) * despite the efforts of the "RAM buffer" approach, which simply rounds * memory boundaries up to 64M to try to catch space that may decode * as RAM and so is not suitable for MMIO. - * - * And yes, so far on current devices the base addr is always under 4G. */ -static u32 __init intel_stolen_base(int num, int slot, int func, size_t stolen_size) -{ - u32 base; - - /* - * For the PCI IDs in this quirk, the stolen base is always - * in 0x5c, aka the BDSM register (yes that's really what - * it's called). - */ - base = read_pci_config(num, slot, func, 0x5c); - base &= ~((1<<20) - 1); - - return base; -} #define KB(x) ((x) * 1024UL) #define MB(x) (KB (KB (x))) -#define GB(x) (MB (KB (x))) static size_t __init i830_tseg_size(void) { - u8 tmp = read_pci_config_byte(0, 0, 0, I830_ESMRAMC); + u8 esmramc = read_pci_config_byte(0, 0, 0, I830_ESMRAMC); - if (!(tmp & TSEG_ENABLE)) + if (!(esmramc & TSEG_ENABLE)) return 0; - if (tmp & I830_TSEG_SIZE_1M) + if (esmramc & I830_TSEG_SIZE_1M) return MB(1); else return KB(512); @@ -274,27 +257,26 @@ static size_t __init i830_tseg_size(void) static size_t __init i845_tseg_size(void) { - u8 tmp = read_pci_config_byte(0, 0, 0, I845_ESMRAMC); + u8 esmramc = read_pci_config_byte(0, 0, 0, I845_ESMRAMC); + u8 tseg_size = esmramc & I845_TSEG_SIZE_MASK; - if (!(tmp & TSEG_ENABLE)) + if (!(esmramc & TSEG_ENABLE)) return 0; - switch (tmp & I845_TSEG_SIZE_MASK) { - case I845_TSEG_SIZE_512K: - return KB(512); - case I845_TSEG_SIZE_1M: - return MB(1); + switch (tseg_size) { + case I845_TSEG_SIZE_512K: return KB(512); + case I845_TSEG_SIZE_1M: return MB(1); default: - WARN_ON(1); - return 0; + WARN(1, "Unknown ESMRAMC value: %x!\n", esmramc); } + return 0; } static size_t __init i85x_tseg_size(void) { - u8 tmp = read_pci_config_byte(0, 0, 0, I85X_ESMRAMC); + u8 esmramc = read_pci_config_byte(0, 0, 0, I85X_ESMRAMC); - if (!(tmp & TSEG_ENABLE)) + if (!(esmramc & TSEG_ENABLE)) return 0; return MB(1); @@ -314,285 +296,287 @@ static size_t __init i85x_mem_size(void) * On 830/845/85x the stolen memory base isn't available in any * register. We need to calculate it as TOM-TSEG_SIZE-stolen_size. */ -static u32 __init i830_stolen_base(int num, int slot, int func, size_t stolen_size) +static phys_addr_t __init i830_stolen_base(int num, int slot, int func, + size_t stolen_size) { - return i830_mem_size() - i830_tseg_size() - stolen_size; + return (phys_addr_t)i830_mem_size() - i830_tseg_size() - stolen_size; } -static u32 __init i845_stolen_base(int num, int slot, int func, size_t stolen_size) +static phys_addr_t __init i845_stolen_base(int num, int slot, int func, + size_t stolen_size) { - return i830_mem_size() - i845_tseg_size() - stolen_size; + return (phys_addr_t)i830_mem_size() - i845_tseg_size() - stolen_size; } -static u32 __init i85x_stolen_base(int num, int slot, int func, size_t stolen_size) +static phys_addr_t __init i85x_stolen_base(int num, int slot, int func, + size_t stolen_size) { - return i85x_mem_size() - i85x_tseg_size() - stolen_size; + return (phys_addr_t)i85x_mem_size() - i85x_tseg_size() - stolen_size; } -static u32 __init i865_stolen_base(int num, int slot, int func, size_t stolen_size) +static phys_addr_t __init i865_stolen_base(int num, int slot, int func, + size_t stolen_size) { + u16 toud; + /* * FIXME is the graphics stolen memory region * always at TOUD? Ie. is it always the last * one to be allocated by the BIOS? */ - return read_pci_config_16(0, 0, 0, I865_TOUD) << 16; + toud = read_pci_config_16(0, 0, 0, I865_TOUD); + + return (phys_addr_t)toud << 16; +} + +static phys_addr_t __init gen3_stolen_base(int num, int slot, int func, + size_t stolen_size) +{ + u32 bsm; + + /* Almost universally we can find the Graphics Base of Stolen Memory + * at register BSM (0x5c) in the igfx configuration space. On a few + * (desktop) machines this is also mirrored in the bridge device at + * different locations, or in the MCHBAR. + */ + bsm = read_pci_config(num, slot, func, INTEL_BSM); + + return (phys_addr_t)bsm & INTEL_BSM_MASK; } static size_t __init i830_stolen_size(int num, int slot, int func) { - size_t stolen_size; u16 gmch_ctrl; + u16 gms; gmch_ctrl = read_pci_config_16(0, 0, 0, I830_GMCH_CTRL); - - switch (gmch_ctrl & I830_GMCH_GMS_MASK) { - case I830_GMCH_GMS_STOLEN_512: - stolen_size = KB(512); - break; - case I830_GMCH_GMS_STOLEN_1024: - stolen_size = MB(1); - break; - case I830_GMCH_GMS_STOLEN_8192: - stolen_size = MB(8); - break; - case I830_GMCH_GMS_LOCAL: - /* local memory isn't part of the normal address space */ - stolen_size = 0; - break; + gms = gmch_ctrl & I830_GMCH_GMS_MASK; + + switch (gms) { + case I830_GMCH_GMS_STOLEN_512: return KB(512); + case I830_GMCH_GMS_STOLEN_1024: return MB(1); + case I830_GMCH_GMS_STOLEN_8192: return MB(8); + /* local memory isn't part of the normal address space */ + case I830_GMCH_GMS_LOCAL: return 0; default: - return 0; + WARN(1, "Unknown GMCH_CTRL value: %x!\n", gmch_ctrl); } - return stolen_size; + return 0; } static size_t __init gen3_stolen_size(int num, int slot, int func) { - size_t stolen_size; u16 gmch_ctrl; + u16 gms; gmch_ctrl = read_pci_config_16(0, 0, 0, I830_GMCH_CTRL); - - switch (gmch_ctrl & I855_GMCH_GMS_MASK) { - case I855_GMCH_GMS_STOLEN_1M: - stolen_size = MB(1); - break; - case I855_GMCH_GMS_STOLEN_4M: - stolen_size = MB(4); - break; - case I855_GMCH_GMS_STOLEN_8M: - stolen_size = MB(8); - break; - case I855_GMCH_GMS_STOLEN_16M: - stolen_size = MB(16); - break; - case I855_GMCH_GMS_STOLEN_32M: - stolen_size = MB(32); - break; - case I915_GMCH_GMS_STOLEN_48M: - stolen_size = MB(48); - break; - case I915_GMCH_GMS_STOLEN_64M: - stolen_size = MB(64); - break; - case G33_GMCH_GMS_STOLEN_128M: - stolen_size = MB(128); - break; - case G33_GMCH_GMS_STOLEN_256M: - stolen_size = MB(256); - break; - case INTEL_GMCH_GMS_STOLEN_96M: - stolen_size = MB(96); - break; - case INTEL_GMCH_GMS_STOLEN_160M: - stolen_size = MB(160); - break; - case INTEL_GMCH_GMS_STOLEN_224M: - stolen_size = MB(224); - break; - case INTEL_GMCH_GMS_STOLEN_352M: - stolen_size = MB(352); - break; + gms = gmch_ctrl & I855_GMCH_GMS_MASK; + + switch (gms) { + case I855_GMCH_GMS_STOLEN_1M: return MB(1); + case I855_GMCH_GMS_STOLEN_4M: return MB(4); + case I855_GMCH_GMS_STOLEN_8M: return MB(8); + case I855_GMCH_GMS_STOLEN_16M: return MB(16); + case I855_GMCH_GMS_STOLEN_32M: return MB(32); + case I915_GMCH_GMS_STOLEN_48M: return MB(48); + case I915_GMCH_GMS_STOLEN_64M: return MB(64); + case G33_GMCH_GMS_STOLEN_128M: return MB(128); + case G33_GMCH_GMS_STOLEN_256M: return MB(256); + case INTEL_GMCH_GMS_STOLEN_96M: return MB(96); + case INTEL_GMCH_GMS_STOLEN_160M:return MB(160); + case INTEL_GMCH_GMS_STOLEN_224M:return MB(224); + case INTEL_GMCH_GMS_STOLEN_352M:return MB(352); default: - stolen_size = 0; - break; + WARN(1, "Unknown GMCH_CTRL value: %x!\n", gmch_ctrl); } - return stolen_size; + return 0; } static size_t __init gen6_stolen_size(int num, int slot, int func) { u16 gmch_ctrl; + u16 gms; gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL); - gmch_ctrl >>= SNB_GMCH_GMS_SHIFT; - gmch_ctrl &= SNB_GMCH_GMS_MASK; + gms = (gmch_ctrl >> SNB_GMCH_GMS_SHIFT) & SNB_GMCH_GMS_MASK; - return gmch_ctrl << 25; /* 32 MB units */ + return (size_t)gms * MB(32); } static size_t __init gen8_stolen_size(int num, int slot, int func) { u16 gmch_ctrl; + u16 gms; gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL); - gmch_ctrl >>= BDW_GMCH_GMS_SHIFT; - gmch_ctrl &= BDW_GMCH_GMS_MASK; - return gmch_ctrl << 25; /* 32 MB units */ + gms = (gmch_ctrl >> BDW_GMCH_GMS_SHIFT) & BDW_GMCH_GMS_MASK; + + return (size_t)gms * MB(32); } static size_t __init chv_stolen_size(int num, int slot, int func) { u16 gmch_ctrl; + u16 gms; gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL); - gmch_ctrl >>= SNB_GMCH_GMS_SHIFT; - gmch_ctrl &= SNB_GMCH_GMS_MASK; + gms = (gmch_ctrl >> SNB_GMCH_GMS_SHIFT) & SNB_GMCH_GMS_MASK; /* * 0x0 to 0x10: 32MB increments starting at 0MB * 0x11 to 0x16: 4MB increments starting at 8MB * 0x17 to 0x1d: 4MB increments start at 36MB */ - if (gmch_ctrl < 0x11) - return gmch_ctrl << 25; - else if (gmch_ctrl < 0x17) - return (gmch_ctrl - 0x11 + 2) << 22; + if (gms < 0x11) + return (size_t)gms * MB(32); + else if (gms < 0x17) + return (size_t)(gms - 0x11 + 2) * MB(4); else - return (gmch_ctrl - 0x17 + 9) << 22; + return (size_t)(gms - 0x17 + 9) * MB(4); } -struct intel_stolen_funcs { - size_t (*size)(int num, int slot, int func); - u32 (*base)(int num, int slot, int func, size_t size); -}; - static size_t __init gen9_stolen_size(int num, int slot, int func) { u16 gmch_ctrl; + u16 gms; gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL); - gmch_ctrl >>= BDW_GMCH_GMS_SHIFT; - gmch_ctrl &= BDW_GMCH_GMS_MASK; + gms = (gmch_ctrl >> BDW_GMCH_GMS_SHIFT) & BDW_GMCH_GMS_MASK; - if (gmch_ctrl < 0xf0) - return gmch_ctrl << 25; /* 32 MB units */ + /* 0x0 to 0xef: 32MB increments starting at 0MB */ + /* 0xf0 to 0xfe: 4MB increments starting at 4MB */ + if (gms < 0xf0) + return (size_t)gms * MB(32); else - /* 4MB increments starting at 0xf0 for 4MB */ - return (gmch_ctrl - 0xf0 + 1) << 22; + return (size_t)(gms - 0xf0 + 1) * MB(4); } -typedef size_t (*stolen_size_fn)(int num, int slot, int func); +struct intel_early_ops { + size_t (*stolen_size)(int num, int slot, int func); + phys_addr_t (*stolen_base)(int num, int slot, int func, size_t size); +}; -static const struct intel_stolen_funcs i830_stolen_funcs __initconst = { - .base = i830_stolen_base, - .size = i830_stolen_size, +static const struct intel_early_ops i830_early_ops __initconst = { + .stolen_base = i830_stolen_base, + .stolen_size = i830_stolen_size, }; -static const struct intel_stolen_funcs i845_stolen_funcs __initconst = { - .base = i845_stolen_base, - .size = i830_stolen_size, +static const struct intel_early_ops i845_early_ops __initconst = { + .stolen_base = i845_stolen_base, + .stolen_size = i830_stolen_size, }; -static const struct intel_stolen_funcs i85x_stolen_funcs __initconst = { - .base = i85x_stolen_base, - .size = gen3_stolen_size, +static const struct intel_early_ops i85x_early_ops __initconst = { + .stolen_base = i85x_stolen_base, + .stolen_size = gen3_stolen_size, }; -static const struct intel_stolen_funcs i865_stolen_funcs __initconst = { - .base = i865_stolen_base, - .size = gen3_stolen_size, +static const struct intel_early_ops i865_early_ops __initconst = { + .stolen_base = i865_stolen_base, + .stolen_size = gen3_stolen_size, }; -static const struct intel_stolen_funcs gen3_stolen_funcs __initconst = { - .base = intel_stolen_base, - .size = gen3_stolen_size, +static const struct intel_early_ops gen3_early_ops __initconst = { + .stolen_base = gen3_stolen_base, + .stolen_size = gen3_stolen_size, }; -static const struct intel_stolen_funcs gen6_stolen_funcs __initconst = { - .base = intel_stolen_base, - .size = gen6_stolen_size, +static const struct intel_early_ops gen6_early_ops __initconst = { + .stolen_base = gen3_stolen_base, + .stolen_size = gen6_stolen_size, }; -static const struct intel_stolen_funcs gen8_stolen_funcs __initconst = { - .base = intel_stolen_base, - .size = gen8_stolen_size, +static const struct intel_early_ops gen8_early_ops __initconst = { + .stolen_base = gen3_stolen_base, + .stolen_size = gen8_stolen_size, }; -static const struct intel_stolen_funcs gen9_stolen_funcs __initconst = { - .base = intel_stolen_base, - .size = gen9_stolen_size, +static const struct intel_early_ops gen9_early_ops __initconst = { + .stolen_base = gen3_stolen_base, + .stolen_size = gen9_stolen_size, }; -static const struct intel_stolen_funcs chv_stolen_funcs __initconst = { - .base = intel_stolen_base, - .size = chv_stolen_size, +static const struct intel_early_ops chv_early_ops __initconst = { + .stolen_base = gen3_stolen_base, + .stolen_size = chv_stolen_size, }; -static const struct pci_device_id intel_stolen_ids[] __initconst = { - INTEL_I830_IDS(&i830_stolen_funcs), - INTEL_I845G_IDS(&i845_stolen_funcs), - INTEL_I85X_IDS(&i85x_stolen_funcs), - INTEL_I865G_IDS(&i865_stolen_funcs), - INTEL_I915G_IDS(&gen3_stolen_funcs), - INTEL_I915GM_IDS(&gen3_stolen_funcs), - INTEL_I945G_IDS(&gen3_stolen_funcs), - INTEL_I945GM_IDS(&gen3_stolen_funcs), - INTEL_VLV_M_IDS(&gen6_stolen_funcs), - INTEL_VLV_D_IDS(&gen6_stolen_funcs), - INTEL_PINEVIEW_IDS(&gen3_stolen_funcs), - INTEL_I965G_IDS(&gen3_stolen_funcs), - INTEL_G33_IDS(&gen3_stolen_funcs), - INTEL_I965GM_IDS(&gen3_stolen_funcs), - INTEL_GM45_IDS(&gen3_stolen_funcs), - INTEL_G45_IDS(&gen3_stolen_funcs), - INTEL_IRONLAKE_D_IDS(&gen3_stolen_funcs), - INTEL_IRONLAKE_M_IDS(&gen3_stolen_funcs), - INTEL_SNB_D_IDS(&gen6_stolen_funcs), - INTEL_SNB_M_IDS(&gen6_stolen_funcs), - INTEL_IVB_M_IDS(&gen6_stolen_funcs), - INTEL_IVB_D_IDS(&gen6_stolen_funcs), - INTEL_HSW_D_IDS(&gen6_stolen_funcs), - INTEL_HSW_M_IDS(&gen6_stolen_funcs), - INTEL_BDW_M_IDS(&gen8_stolen_funcs), - INTEL_BDW_D_IDS(&gen8_stolen_funcs), - INTEL_CHV_IDS(&chv_stolen_funcs), - INTEL_SKL_IDS(&gen9_stolen_funcs), - INTEL_BXT_IDS(&gen9_stolen_funcs), - INTEL_KBL_IDS(&gen9_stolen_funcs), +static const struct pci_device_id intel_early_ids[] __initconst = { + INTEL_I830_IDS(&i830_early_ops), + INTEL_I845G_IDS(&i845_early_ops), + INTEL_I85X_IDS(&i85x_early_ops), + INTEL_I865G_IDS(&i865_early_ops), + INTEL_I915G_IDS(&gen3_early_ops), + INTEL_I915GM_IDS(&gen3_early_ops), + INTEL_I945G_IDS(&gen3_early_ops), + INTEL_I945GM_IDS(&gen3_early_ops), + INTEL_VLV_M_IDS(&gen6_early_ops), + INTEL_VLV_D_IDS(&gen6_early_ops), + INTEL_PINEVIEW_IDS(&gen3_early_ops), + INTEL_I965G_IDS(&gen3_early_ops), + INTEL_G33_IDS(&gen3_early_ops), + INTEL_I965GM_IDS(&gen3_early_ops), + INTEL_GM45_IDS(&gen3_early_ops), + INTEL_G45_IDS(&gen3_early_ops), + INTEL_IRONLAKE_D_IDS(&gen3_early_ops), + INTEL_IRONLAKE_M_IDS(&gen3_early_ops), + INTEL_SNB_D_IDS(&gen6_early_ops), + INTEL_SNB_M_IDS(&gen6_early_ops), + INTEL_IVB_M_IDS(&gen6_early_ops), + INTEL_IVB_D_IDS(&gen6_early_ops), + INTEL_HSW_D_IDS(&gen6_early_ops), + INTEL_HSW_M_IDS(&gen6_early_ops), + INTEL_BDW_M_IDS(&gen8_early_ops), + INTEL_BDW_D_IDS(&gen8_early_ops), + INTEL_CHV_IDS(&chv_early_ops), + INTEL_SKL_IDS(&gen9_early_ops), + INTEL_BXT_IDS(&gen9_early_ops), + INTEL_KBL_IDS(&gen9_early_ops), }; -static void __init intel_graphics_stolen(int num, int slot, int func) +static void __init +intel_graphics_stolen(int num, int slot, int func, + const struct intel_early_ops *early_ops) { + phys_addr_t base, end; size_t size; + + size = early_ops->stolen_size(num, slot, func); + base = early_ops->stolen_base(num, slot, func, size); + + if (!size || !base) + return; + + end = base + size - 1; + printk(KERN_INFO "Reserving Intel graphics memory at %pa-%pa\n", + &base, &end); + + /* Mark this space as reserved */ + e820_add_region(base, size, E820_RESERVED); + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); +} + +static void __init intel_graphics_quirks(int num, int slot, int func) +{ + const struct intel_early_ops *early_ops; + u16 device; int i; - u32 start; - u16 device, subvendor, subdevice; device = read_pci_config_16(num, slot, func, PCI_DEVICE_ID); - subvendor = read_pci_config_16(num, slot, func, - PCI_SUBSYSTEM_VENDOR_ID); - subdevice = read_pci_config_16(num, slot, func, PCI_SUBSYSTEM_ID); - - for (i = 0; i < ARRAY_SIZE(intel_stolen_ids); i++) { - if (intel_stolen_ids[i].device == device) { - const struct intel_stolen_funcs *stolen_funcs = - (const struct intel_stolen_funcs *)intel_stolen_ids[i].driver_data; - size = stolen_funcs->size(num, slot, func); - start = stolen_funcs->base(num, slot, func, size); - if (size && start) { - printk(KERN_INFO "Reserving Intel graphics stolen memory at 0x%x-0x%x\n", - start, start + (u32)size - 1); - /* Mark this space as reserved */ - e820_add_region(start, size, E820_RESERVED); - sanitize_e820_map(e820.map, - ARRAY_SIZE(e820.map), - &e820.nr_map); - } - return; - } + + for (i = 0; i < ARRAY_SIZE(intel_early_ids); i++) { + kernel_ulong_t driver_data = intel_early_ids[i].driver_data; + + if (intel_early_ids[i].device != device) + continue; + + early_ops = (typeof(early_ops))driver_data; + + intel_graphics_stolen(num, slot, func, early_ops); + + return; } } @@ -690,7 +674,7 @@ static struct chipset early_qrk[] __initdata = { { PCI_VENDOR_ID_INTEL, 0x3406, PCI_CLASS_BRIDGE_HOST, PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check }, { PCI_VENDOR_ID_INTEL, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA, PCI_ANY_ID, - QFLAG_APPLY_ONCE, intel_graphics_stolen }, + QFLAG_APPLY_ONCE, intel_graphics_quirks }, /* * HPET on the current version of the Baytrail platform has accuracy * problems: it will halt in deep idle state - so we disable it. diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 9e231d88bb33..a184c210efba 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -159,8 +159,8 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) struct task_struct *tsk = current; int ia32_fxstate = (buf != buf_fx); - ia32_fxstate &= (config_enabled(CONFIG_X86_32) || - config_enabled(CONFIG_IA32_EMULATION)); + ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) || + IS_ENABLED(CONFIG_IA32_EMULATION)); if (!access_ok(VERIFY_WRITE, buf, size)) return -EACCES; @@ -268,8 +268,8 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) u64 xfeatures = 0; int fx_only = 0; - ia32_fxstate &= (config_enabled(CONFIG_X86_32) || - config_enabled(CONFIG_IA32_EMULATION)); + ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) || + IS_ENABLED(CONFIG_IA32_EMULATION)); if (!buf) { fpu__clear(fpu); @@ -416,8 +416,8 @@ void fpu__init_prepare_fx_sw_frame(void) fx_sw_reserved.xfeatures = xfeatures_mask; fx_sw_reserved.xstate_size = fpu_user_xstate_size; - if (config_enabled(CONFIG_IA32_EMULATION) || - config_enabled(CONFIG_X86_32)) { + if (IS_ENABLED(CONFIG_IA32_EMULATION) || + IS_ENABLED(CONFIG_X86_32)) { int fsave_header_size = sizeof(struct fregs_state); fx_sw_reserved_ia32 = fx_sw_reserved; diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 680049aa4593..01567aa87503 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -866,105 +866,17 @@ const void *get_xsave_field_ptr(int xsave_state) return get_xsave_addr(&fpu->state.xsave, xsave_state); } - -/* - * Set xfeatures (aka XSTATE_BV) bit for a feature that we want - * to take out of its "init state". This will ensure that an - * XRSTOR actually restores the state. - */ -static void fpu__xfeature_set_non_init(struct xregs_state *xsave, - int xstate_feature_mask) -{ - xsave->header.xfeatures |= xstate_feature_mask; -} - -/* - * This function is safe to call whether the FPU is in use or not. - * - * Note that this only works on the current task. - * - * Inputs: - * @xsave_state: state which is defined in xsave.h (e.g. XFEATURE_MASK_FP, - * XFEATURE_MASK_SSE, etc...) - * @xsave_state_ptr: a pointer to a copy of the state that you would - * like written in to the current task's FPU xsave state. This pointer - * must not be located in the current tasks's xsave area. - * Output: - * address of the state in the xsave area or NULL if the state - * is not present or is in its 'init state'. - */ -static void fpu__xfeature_set_state(int xstate_feature_mask, - void *xstate_feature_src, size_t len) -{ - struct xregs_state *xsave = ¤t->thread.fpu.state.xsave; - struct fpu *fpu = ¤t->thread.fpu; - void *dst; - - if (!boot_cpu_has(X86_FEATURE_XSAVE)) { - WARN_ONCE(1, "%s() attempted with no xsave support", __func__); - return; - } - - /* - * Tell the FPU code that we need the FPU state to be in - * 'fpu' (not in the registers), and that we need it to - * be stable while we write to it. - */ - fpu__current_fpstate_write_begin(); - - /* - * This method *WILL* *NOT* work for compact-format - * buffers. If the 'xstate_feature_mask' is unset in - * xcomp_bv then we may need to move other feature state - * "up" in the buffer. - */ - if (xsave->header.xcomp_bv & xstate_feature_mask) { - WARN_ON_ONCE(1); - goto out; - } - - /* find the location in the xsave buffer of the desired state */ - dst = __raw_xsave_addr(&fpu->state.xsave, xstate_feature_mask); - - /* - * Make sure that the pointer being passed in did not - * come from the xsave buffer itself. - */ - WARN_ONCE(xstate_feature_src == dst, "set from xsave buffer itself"); - - /* put the caller-provided data in the location */ - memcpy(dst, xstate_feature_src, len); - - /* - * Mark the xfeature so that the CPU knows there is state - * in the buffer now. - */ - fpu__xfeature_set_non_init(xsave, xstate_feature_mask); -out: - /* - * We are done writing to the 'fpu'. Reenable preeption - * and (possibly) move the fpstate back in to the fpregs. - */ - fpu__current_fpstate_write_end(); -} - #define NR_VALID_PKRU_BITS (CONFIG_NR_PROTECTION_KEYS * 2) #define PKRU_VALID_MASK (NR_VALID_PKRU_BITS - 1) /* - * This will go out and modify the XSAVE buffer so that PKRU is - * set to a particular state for access to 'pkey'. - * - * PKRU state does affect kernel access to user memory. We do - * not modfiy PKRU *itself* here, only the XSAVE state that will - * be restored in to PKRU when we return back to userspace. + * This will go out and modify PKRU register to set the access + * rights for @pkey to @init_val. */ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val) { - struct xregs_state *xsave = &tsk->thread.fpu.state.xsave; - struct pkru_state *old_pkru_state; - struct pkru_state new_pkru_state; + u32 old_pkru; int pkey_shift = (pkey * PKRU_BITS_PER_PKEY); u32 new_pkru_bits = 0; @@ -974,6 +886,15 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, */ if (!boot_cpu_has(X86_FEATURE_OSPKE)) return -EINVAL; + /* + * For most XSAVE components, this would be an arduous task: + * brining fpstate up to date with fpregs, updating fpstate, + * then re-populating fpregs. But, for components that are + * never lazily managed, we can just access the fpregs + * directly. PKRU is never managed lazily, so we can just + * manipulate it directly. Make sure it stays that way. + */ + WARN_ON_ONCE(!use_eager_fpu()); /* Set the bits we need in PKRU: */ if (init_val & PKEY_DISABLE_ACCESS) @@ -984,37 +905,12 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, /* Shift the bits in to the correct place in PKRU for pkey: */ new_pkru_bits <<= pkey_shift; - /* Locate old copy of the state in the xsave buffer: */ - old_pkru_state = get_xsave_addr(xsave, XFEATURE_MASK_PKRU); - - /* - * When state is not in the buffer, it is in the init - * state, set it manually. Otherwise, copy out the old - * state. - */ - if (!old_pkru_state) - new_pkru_state.pkru = 0; - else - new_pkru_state.pkru = old_pkru_state->pkru; - - /* Mask off any old bits in place: */ - new_pkru_state.pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift); - - /* Set the newly-requested bits: */ - new_pkru_state.pkru |= new_pkru_bits; - - /* - * We could theoretically live without zeroing pkru.pad. - * The current XSAVE feature state definition says that - * only bytes 0->3 are used. But we do not want to - * chance leaking kernel stack out to userspace in case a - * memcpy() of the whole xsave buffer was done. - * - * They're in the same cacheline anyway. - */ - new_pkru_state.pad = 0; + /* Get old PKRU and mask off any old bits in place: */ + old_pkru = read_pkru(); + old_pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift); - fpu__xfeature_set_state(XFEATURE_MASK_PKRU, &new_pkru_state, sizeof(new_pkru_state)); + /* Write old part along with new part: */ + write_pkru(old_pkru | new_pkru_bits); return 0; } diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 2dda0bc4576e..f16c55bfc090 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -25,8 +25,6 @@ static void __init i386_default_early_setup(void) /* Initialize 32bit specific setup functions */ x86_init.resources.reserve_resources = i386_reserve_resources; x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc; - - reserve_bios_regions(); } asmlinkage __visible void __init i386_start_kernel(void) diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 99d48e7d2974..54a2372f5dbb 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -183,7 +183,6 @@ void __init x86_64_start_reservations(char *real_mode_data) copy_bootdata(__va(real_mode_data)); x86_early_init_platform_quirks(); - reserve_bios_regions(); switch (boot_params.hdr.hardware_subarch) { case X86_SUBARCH_INTEL_MID: diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index f112af7aa62e..c6dfd801df97 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -710,31 +710,29 @@ static void hpet_work(struct work_struct *w) complete(&hpet_work->complete); } -static int hpet_cpuhp_notify(struct notifier_block *n, - unsigned long action, void *hcpu) +static int hpet_cpuhp_online(unsigned int cpu) { - unsigned long cpu = (unsigned long)hcpu; struct hpet_work_struct work; + + INIT_DELAYED_WORK_ONSTACK(&work.work, hpet_work); + init_completion(&work.complete); + /* FIXME: add schedule_work_on() */ + schedule_delayed_work_on(cpu, &work.work, 0); + wait_for_completion(&work.complete); + destroy_delayed_work_on_stack(&work.work); + return 0; +} + +static int hpet_cpuhp_dead(unsigned int cpu) +{ struct hpet_dev *hdev = per_cpu(cpu_hpet_dev, cpu); - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_ONLINE: - INIT_DELAYED_WORK_ONSTACK(&work.work, hpet_work); - init_completion(&work.complete); - /* FIXME: add schedule_work_on() */ - schedule_delayed_work_on(cpu, &work.work, 0); - wait_for_completion(&work.complete); - destroy_delayed_work_on_stack(&work.work); - break; - case CPU_DEAD: - if (hdev) { - free_irq(hdev->irq, hdev); - hdev->flags &= ~HPET_DEV_USED; - per_cpu(cpu_hpet_dev, cpu) = NULL; - } - break; - } - return NOTIFY_OK; + if (!hdev) + return 0; + free_irq(hdev->irq, hdev); + hdev->flags &= ~HPET_DEV_USED; + per_cpu(cpu_hpet_dev, cpu) = NULL; + return 0; } #else @@ -750,11 +748,8 @@ static void hpet_reserve_msi_timers(struct hpet_data *hd) } #endif -static int hpet_cpuhp_notify(struct notifier_block *n, - unsigned long action, void *hcpu) -{ - return NOTIFY_OK; -} +#define hpet_cpuhp_online NULL +#define hpet_cpuhp_dead NULL #endif @@ -931,7 +926,7 @@ out_nohpet: */ static __init int hpet_late_init(void) { - int cpu; + int ret; if (boot_hpet_disable) return -ENODEV; @@ -961,16 +956,20 @@ static __init int hpet_late_init(void) if (boot_cpu_has(X86_FEATURE_ARAT)) return 0; - cpu_notifier_register_begin(); - for_each_online_cpu(cpu) { - hpet_cpuhp_notify(NULL, CPU_ONLINE, (void *)(long)cpu); - } - /* This notifier should be called after workqueue is ready */ - __hotcpu_notifier(hpet_cpuhp_notify, -20); - cpu_notifier_register_done(); - + ret = cpuhp_setup_state(CPUHP_AP_X86_HPET_ONLINE, "AP_X86_HPET_ONLINE", + hpet_cpuhp_online, NULL); + if (ret) + return ret; + ret = cpuhp_setup_state(CPUHP_X86_HPET_DEAD, "X86_HPET_DEAD", NULL, + hpet_cpuhp_dead); + if (ret) + goto err_cpuhp; return 0; + +err_cpuhp: + cpuhp_remove_state(CPUHP_AP_X86_HPET_ONLINE); + return ret; } fs_initcall(hpet_late_init); @@ -1020,7 +1019,6 @@ void hpet_disable(void) */ #include <linux/mc146818rtc.h> #include <linux/rtc.h> -#include <asm/rtc.h> #define DEFAULT_RTC_INT_FREQ 64 #define DEFAULT_RTC_SHIFT 6 @@ -1244,7 +1242,7 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id) memset(&curr_time, 0, sizeof(struct rtc_time)); if (hpet_rtc_flags & (RTC_UIE | RTC_AIE)) - get_rtc_time(&curr_time); + mc146818_get_time(&curr_time); if (hpet_rtc_flags & RTC_UIE && curr_time.tm_sec != hpet_prev_update_sec) { diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 2bcfb5f2bc44..8771766d46b6 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -36,13 +36,14 @@ #include <linux/percpu.h> #include <linux/kdebug.h> #include <linux/kernel.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/sched.h> #include <linux/smp.h> #include <asm/hw_breakpoint.h> #include <asm/processor.h> #include <asm/debugreg.h> +#include <asm/user.h> /* Per cpu debug control register value */ DEFINE_PER_CPU(unsigned long, cpu_dr7); diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c index d40ee8a38fed..1f9b878ef5ef 100644 --- a/arch/x86/kernel/i386_ksyms_32.c +++ b/arch/x86/kernel/i386_ksyms_32.c @@ -1,4 +1,5 @@ -#include <linux/module.h> +#include <linux/export.h> +#include <linux/spinlock_types.h> #include <asm/checksum.h> #include <asm/pgtable.h> diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c index efb82f07b29c..6ebe00cb4a3b 100644 --- a/arch/x86/kernel/i8253.c +++ b/arch/x86/kernel/i8253.c @@ -3,7 +3,7 @@ * */ #include <linux/clockchips.h> -#include <linux/module.h> +#include <linux/init.h> #include <linux/timex.h> #include <linux/i8253.h> diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c index a979b5bd2fc0..50c89e8a95f2 100644 --- a/arch/x86/kernel/io_delay.c +++ b/arch/x86/kernel/io_delay.c @@ -6,7 +6,7 @@ * outb_p/inb_p API uses. */ #include <linux/kernel.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/delay.h> #include <linux/init.h> #include <linux/dmi.h> diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 61521dc19c10..9f669fdd2010 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -102,8 +102,7 @@ int arch_show_interrupts(struct seq_file *p, int prec) seq_puts(p, " Rescheduling interrupts\n"); seq_printf(p, "%*s: ", prec, "CAL"); for_each_online_cpu(j) - seq_printf(p, "%10u ", irq_stats(j)->irq_call_count - - irq_stats(j)->irq_tlb_count); + seq_printf(p, "%10u ", irq_stats(j)->irq_call_count); seq_puts(p, " Function call interrupts\n"); seq_printf(p, "%*s: ", prec, "TLB"); for_each_online_cpu(j) diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index c627bf8d98ad..1f38d9a4d9de 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -8,7 +8,6 @@ * io_apic.c.) */ -#include <linux/module.h> #include <linux/seq_file.h> #include <linux/interrupt.h> #include <linux/kernel_stat.h> diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 206d0b90a3ab..4a7903714065 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -11,7 +11,6 @@ #include <linux/kernel_stat.h> #include <linux/interrupt.h> #include <linux/seq_file.h> -#include <linux/module.h> #include <linux/delay.h> #include <linux/ftrace.h> #include <linux/uaccess.h> diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c index dc1404bf8e4b..bdb83e431d89 100644 --- a/arch/x86/kernel/kdebugfs.c +++ b/arch/x86/kernel/kdebugfs.c @@ -8,7 +8,7 @@ */ #include <linux/debugfs.h> #include <linux/uaccess.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/stat.h> diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 1ef5e48b3a36..1726c4c12336 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -21,7 +21,7 @@ */ #include <linux/context_tracking.h> -#include <linux/module.h> +#include <linux/init.h> #include <linux/kernel.h> #include <linux/kvm_para.h> #include <linux/cpu.h> diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 1d39bfbd26bb..3692249a70f1 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -289,6 +289,7 @@ void __init kvmclock_init(void) put_cpu(); x86_platform.calibrate_tsc = kvm_get_tsc_khz; + x86_platform.calibrate_cpu = kvm_get_tsc_khz; x86_platform.get_wallclock = kvm_get_wallclock; x86_platform.set_wallclock = kvm_set_wallclock; #ifdef CONFIG_X86_LOCAL_APIC diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 97340f2c437c..068c4a929de6 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -16,7 +16,6 @@ #include <linux/mc146818rtc.h> #include <linux/bitops.h> #include <linux/acpi.h> -#include <linux/module.h> #include <linux/smp.h> #include <linux/pci.h> diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 04b132a767f1..bfe4d6c96fbd 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -17,6 +17,7 @@ #include <linux/debugfs.h> #include <linux/delay.h> #include <linux/hardirq.h> +#include <linux/ratelimit.h> #include <linux/slab.h> #include <linux/export.h> diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c index 33ee3e0efd65..1939a0269377 100644 --- a/arch/x86/kernel/paravirt-spinlocks.c +++ b/arch/x86/kernel/paravirt-spinlocks.c @@ -3,7 +3,7 @@ * compiled in a FTRACE-compatible way. */ #include <linux/spinlock.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/jump_label.h> #include <asm/paravirt.h> diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 7b3b3f24c3ea..1acfd76e3e26 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -19,7 +19,8 @@ */ #include <linux/errno.h> -#include <linux/module.h> +#include <linux/init.h> +#include <linux/export.h> #include <linux/efi.h> #include <linux/bcd.h> #include <linux/highmem.h> @@ -55,12 +56,12 @@ asm (".pushsection .entry.text, \"ax\"\n" ".popsection"); /* identity function, which can be inlined */ -u32 _paravirt_ident_32(u32 x) +u32 notrace _paravirt_ident_32(u32 x) { return x; } -u64 _paravirt_ident_64(u64 x) +u64 notrace _paravirt_ident_64(u64 x) { return x; } diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 833b1d329c47..5d400ba1349d 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -340,7 +340,7 @@ static inline struct iommu_table *find_iommu_table(struct device *dev) static void calgary_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,enum dma_data_direction dir, - struct dma_attrs *attrs) + unsigned long attrs) { struct iommu_table *tbl = find_iommu_table(dev); struct scatterlist *s; @@ -364,7 +364,7 @@ static void calgary_unmap_sg(struct device *dev, struct scatterlist *sglist, static int calgary_map_sg(struct device *dev, struct scatterlist *sg, int nelems, enum dma_data_direction dir, - struct dma_attrs *attrs) + unsigned long attrs) { struct iommu_table *tbl = find_iommu_table(dev); struct scatterlist *s; @@ -396,7 +396,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg, return nelems; error: - calgary_unmap_sg(dev, sg, nelems, dir, NULL); + calgary_unmap_sg(dev, sg, nelems, dir, 0); for_each_sg(sg, s, nelems, i) { sg->dma_address = DMA_ERROR_CODE; sg->dma_length = 0; @@ -407,7 +407,7 @@ error: static dma_addr_t calgary_map_page(struct device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction dir, - struct dma_attrs *attrs) + unsigned long attrs) { void *vaddr = page_address(page) + offset; unsigned long uaddr; @@ -422,7 +422,7 @@ static dma_addr_t calgary_map_page(struct device *dev, struct page *page, static void calgary_unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size, enum dma_data_direction dir, - struct dma_attrs *attrs) + unsigned long attrs) { struct iommu_table *tbl = find_iommu_table(dev); unsigned int npages; @@ -432,7 +432,7 @@ static void calgary_unmap_page(struct device *dev, dma_addr_t dma_addr, } static void* calgary_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t flag, struct dma_attrs *attrs) + dma_addr_t *dma_handle, gfp_t flag, unsigned long attrs) { void *ret = NULL; dma_addr_t mapping; @@ -466,7 +466,7 @@ error: static void calgary_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle, - struct dma_attrs *attrs) + unsigned long attrs) { unsigned int npages; struct iommu_table *tbl = find_iommu_table(dev); diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 6ba014c61d62..d30c37750765 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -77,7 +77,7 @@ void __init pci_iommu_alloc(void) } void *dma_generic_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, gfp_t flag, - struct dma_attrs *attrs) + unsigned long attrs) { unsigned long dma_mask; struct page *page; @@ -120,7 +120,7 @@ again: } void dma_generic_free_coherent(struct device *dev, size_t size, void *vaddr, - dma_addr_t dma_addr, struct dma_attrs *attrs) + dma_addr_t dma_addr, unsigned long attrs) { unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT; struct page *page = virt_to_page(vaddr); diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index da15918d1c81..00e71ce396a8 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c @@ -28,7 +28,7 @@ check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size) static dma_addr_t nommu_map_page(struct device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction dir, - struct dma_attrs *attrs) + unsigned long attrs) { dma_addr_t bus = page_to_phys(page) + offset; WARN_ON(size == 0); @@ -55,7 +55,7 @@ static dma_addr_t nommu_map_page(struct device *dev, struct page *page, */ static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg, int nents, enum dma_data_direction dir, - struct dma_attrs *attrs) + unsigned long attrs) { struct scatterlist *s; int i; diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index 7c577a178859..b47edb8f5256 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -2,7 +2,7 @@ #include <linux/pci.h> #include <linux/cache.h> -#include <linux/module.h> +#include <linux/init.h> #include <linux/swiotlb.h> #include <linux/bootmem.h> #include <linux/dma-mapping.h> @@ -16,7 +16,7 @@ int swiotlb __read_mostly; void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, dma_addr_t *dma_handle, gfp_t flags, - struct dma_attrs *attrs) + unsigned long attrs) { void *vaddr; @@ -37,7 +37,7 @@ void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, void x86_swiotlb_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_addr, - struct dma_attrs *attrs) + unsigned long attrs) { if (is_swiotlb_buffer(dma_to_phys(dev, dma_addr))) swiotlb_free_coherent(dev, size, vaddr, dma_addr); diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c index 92f70147a9a6..0c5315d322c8 100644 --- a/arch/x86/kernel/pmem.c +++ b/arch/x86/kernel/pmem.c @@ -3,7 +3,7 @@ * Copyright (c) 2015, Intel Corporation. */ #include <linux/platform_device.h> -#include <linux/module.h> +#include <linux/init.h> #include <linux/ioport.h> static int found(u64 start, u64 end, void *data) diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 96becbbb52e0..62c0b0ea2ce4 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -7,7 +7,8 @@ #include <linux/prctl.h> #include <linux/slab.h> #include <linux/sched.h> -#include <linux/module.h> +#include <linux/init.h> +#include <linux/export.h> #include <linux/pm.h> #include <linux/tick.h> #include <linux/random.h> @@ -404,7 +405,7 @@ static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c) if (c->x86_vendor != X86_VENDOR_INTEL) return 0; - if (!cpu_has(c, X86_FEATURE_MWAIT)) + if (!cpu_has(c, X86_FEATURE_MWAIT) || static_cpu_has_bug(X86_BUG_MONITOR)) return 0; return 1; diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 9f950917528b..d86be29c38c7 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -25,7 +25,7 @@ #include <linux/delay.h> #include <linux/reboot.h> #include <linux/mc146818rtc.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/kallsyms.h> #include <linux/ptrace.h> #include <linux/personality.h> diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 6e789ca1f841..63236d8f84bf 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -26,7 +26,7 @@ #include <linux/user.h> #include <linux/interrupt.h> #include <linux/delay.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/ptrace.h> #include <linux/notifier.h> #include <linux/kprobes.h> diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 600edd225e81..f79576a541ff 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -923,15 +923,18 @@ static int putreg32(struct task_struct *child, unsigned regno, u32 value) case offsetof(struct user32, regs.orig_eax): /* - * A 32-bit debugger setting orig_eax means to restore - * the state of the task restarting a 32-bit syscall. - * Make sure we interpret the -ERESTART* codes correctly - * in case the task is not actually still sitting at the - * exit from a 32-bit syscall with TS_COMPAT still set. + * Warning: bizarre corner case fixup here. A 32-bit + * debugger setting orig_eax to -1 wants to disable + * syscall restart. Make sure that the syscall + * restart code sign-extends orig_ax. Also make sure + * we interpret the -ERESTART* codes correctly if + * loaded into regs->ax in case the task is not + * actually still sitting at the exit from a 32-bit + * syscall with TS_COMPAT still set. */ regs->orig_ax = value; if (syscall_get_nr(child, regs) >= 0) - task_thread_info(child)->status |= TS_COMPAT; + task_thread_info(child)->status |= TS_I386_REGS_POKED; break; case offsetof(struct user32, regs.eflags): diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 06c58ce46762..3599404e3089 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -64,14 +64,9 @@ u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src) u8 flags; do { - version = src->version; - /* Make the latest version visible */ - smp_rmb(); - + version = pvclock_read_begin(src); flags = src->flags; - /* Make sure that the version double-check is last. */ - smp_rmb(); - } while ((src->version & 1) || version != src->version); + } while (pvclock_read_retry(src, version)); return flags & valid_flags; } @@ -84,10 +79,10 @@ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) u8 flags; do { - version = __pvclock_read_cycles(src, &ret, &flags); - /* Make sure that the version double-check is last. */ - smp_rmb(); - } while ((src->version & 1) || version != src->version); + version = pvclock_read_begin(src); + ret = __pvclock_read_cycles(src); + flags = src->flags; + } while (pvclock_read_retry(src, version)); if (unlikely((flags & PVCLOCK_GUEST_STOPPED) != 0)) { src->flags &= ~PVCLOCK_GUEST_STOPPED; diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 15ed70f8278b..63bf27d972b7 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -1,6 +1,6 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <linux/module.h> +#include <linux/export.h> #include <linux/reboot.h> #include <linux/init.h> #include <linux/pm.h> diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index eceaa082ec3f..79c6311cd912 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -13,7 +13,6 @@ #include <asm/x86_init.h> #include <asm/time.h> #include <asm/intel-mid.h> -#include <asm/rtc.h> #include <asm/setup.h> #ifdef CONFIG_X86_32 @@ -47,7 +46,7 @@ int mach_set_rtc_mmss(const struct timespec *now) rtc_time_to_tm(nowtime, &tm); if (!rtc_valid_tm(&tm)) { - retval = set_rtc_time(&tm); + retval = mc146818_set_time(&tm); if (retval) printk(KERN_ERR "%s: RTC write failed with error %d\n", __func__, retval); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 6cb2b02fcc87..0fa60f5f5a16 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -36,7 +36,7 @@ #include <linux/console.h> #include <linux/root_dev.h> #include <linux/highmem.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/efi.h> #include <linux/init.h> #include <linux/edd.h> @@ -936,8 +936,6 @@ void __init setup_arch(char **cmdline_p) x86_init.oem.arch_setup(); - kernel_randomize_memory(); - iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1; setup_memory_map(); parse_setup_data(); @@ -1055,6 +1053,12 @@ void __init setup_arch(char **cmdline_p) max_possible_pfn = max_pfn; + /* + * Define random base addresses for memory sections after max_pfn is + * defined and before each memory section base is used. + */ + kernel_randomize_memory(); + #ifdef CONFIG_X86_32 /* max_low_pfn get updated here */ find_low_pfn_range(); @@ -1097,6 +1101,8 @@ void __init setup_arch(char **cmdline_p) efi_find_mirror(); } + reserve_bios_regions(); + /* * The EFI specification says that boot service code won't be called * after ExitBootServices(). This is, in fact, a lie. @@ -1125,7 +1131,15 @@ void __init setup_arch(char **cmdline_p) early_trap_pf_init(); - setup_real_mode(); + /* + * Update mmu_cr4_features (and, indirectly, trampoline_cr4_features) + * with the current CR4 value. This may not be necessary, but + * auditing all the early-boot CR4 manipulation would be needed to + * rule it out. + */ + if (boot_cpu_data.cpuid_level >= 0) + /* A CPU has %cr4 if and only if it has CPUID. */ + mmu_cr4_features = __read_cr4(); memblock_set_current_limit(get_max_mapped()); @@ -1174,13 +1188,6 @@ void __init setup_arch(char **cmdline_p) kasan_init(); - if (boot_cpu_data.cpuid_level >= 0) { - /* A CPU has %cr4 if and only if it has CPUID */ - mmu_cr4_features = __read_cr4(); - if (trampoline_cr4_features) - *trampoline_cr4_features = mmu_cr4_features; - } - #ifdef CONFIG_X86_32 /* sync back kernel address range */ clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY, diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 22cc2f9f8aec..04cb3212db2d 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -146,7 +146,7 @@ static int restore_sigcontext(struct pt_regs *regs, buf = (void __user *)buf_val; } get_user_catch(err); - err |= fpu__restore_sig(buf, config_enabled(CONFIG_X86_32)); + err |= fpu__restore_sig(buf, IS_ENABLED(CONFIG_X86_32)); force_iret(); @@ -245,14 +245,14 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, struct fpu *fpu = ¤t->thread.fpu; /* redzone */ - if (config_enabled(CONFIG_X86_64)) + if (IS_ENABLED(CONFIG_X86_64)) sp -= 128; /* This is the X/Open sanctioned signal stack switching. */ if (ka->sa.sa_flags & SA_ONSTACK) { if (sas_ss_flags(sp) == 0) sp = current->sas_ss_sp + current->sas_ss_size; - } else if (config_enabled(CONFIG_X86_32) && + } else if (IS_ENABLED(CONFIG_X86_32) && !onsigstack && (regs->ss & 0xffff) != __USER_DS && !(ka->sa.sa_flags & SA_RESTORER) && @@ -262,7 +262,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, } if (fpu->fpstate_active) { - sp = fpu__alloc_mathframe(sp, config_enabled(CONFIG_X86_32), + sp = fpu__alloc_mathframe(sp, IS_ENABLED(CONFIG_X86_32), &buf_fx, &math_size); *fpstate = (void __user *)sp; } @@ -662,18 +662,18 @@ badframe: static inline int is_ia32_compat_frame(void) { - return config_enabled(CONFIG_IA32_EMULATION) && + return IS_ENABLED(CONFIG_IA32_EMULATION) && test_thread_flag(TIF_IA32); } static inline int is_ia32_frame(void) { - return config_enabled(CONFIG_X86_32) || is_ia32_compat_frame(); + return IS_ENABLED(CONFIG_X86_32) || is_ia32_compat_frame(); } static inline int is_x32_frame(void) { - return config_enabled(CONFIG_X86_X32_ABI) && test_thread_flag(TIF_X32); + return IS_ENABLED(CONFIG_X86_X32_ABI) && test_thread_flag(TIF_X32); } static int @@ -760,8 +760,30 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs) { -#ifdef CONFIG_X86_64 - if (in_ia32_syscall()) + /* + * This function is fundamentally broken as currently + * implemented. + * + * The idea is that we want to trigger a call to the + * restart_block() syscall and that we want in_ia32_syscall(), + * in_x32_syscall(), etc. to match whatever they were in the + * syscall being restarted. We assume that the syscall + * instruction at (regs->ip - 2) matches whatever syscall + * instruction we used to enter in the first place. + * + * The problem is that we can get here when ptrace pokes + * syscall-like values into regs even if we're not in a syscall + * at all. + * + * For now, we maintain historical behavior and guess based on + * stored state. We could do better by saving the actual + * syscall arch in restart_block or (with caveats on x32) by + * checking if regs->ip points to 'int $0x80'. The current + * behavior is incorrect if a tracer has a different bitness + * than the tracee. + */ +#ifdef CONFIG_IA32_EMULATION + if (current_thread_info()->status & (TS_COMPAT|TS_I386_REGS_POKED)) return __NR_ia32_restart_syscall; #endif #ifdef CONFIG_X86_X32_ABI diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index c93609c97406..4296beb8fdd3 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -43,7 +43,7 @@ #include <linux/init.h> #include <linux/smp.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/sched.h> #include <linux/percpu.h> #include <linux/bootmem.h> @@ -100,10 +100,11 @@ EXPORT_PER_CPU_SYMBOL(cpu_info); /* Logical package management. We might want to allocate that dynamically */ static int *physical_to_logical_pkg __read_mostly; static unsigned long *physical_package_map __read_mostly;; -static unsigned long *logical_package_map __read_mostly; static unsigned int max_physical_pkg_id __read_mostly; unsigned int __max_logical_packages __read_mostly; EXPORT_SYMBOL(__max_logical_packages); +static unsigned int logical_packages __read_mostly; +static bool logical_packages_frozen __read_mostly; /* Maximum number of SMT threads on any online core */ int __max_smt_threads __read_mostly; @@ -277,14 +278,14 @@ int topology_update_package_map(unsigned int apicid, unsigned int cpu) if (test_and_set_bit(pkg, physical_package_map)) goto found; - new = find_first_zero_bit(logical_package_map, __max_logical_packages); - if (new >= __max_logical_packages) { + if (logical_packages_frozen) { physical_to_logical_pkg[pkg] = -1; - pr_warn("APIC(%x) Package %u exceeds logical package map\n", + pr_warn("APIC(%x) Package %u exceeds logical package max\n", apicid, pkg); return -ENOSPC; } - set_bit(new, logical_package_map); + + new = logical_packages++; pr_info("APIC(%x) Converting physical %u to logical package %u\n", apicid, pkg, new); physical_to_logical_pkg[pkg] = new; @@ -341,6 +342,7 @@ static void __init smp_init_package_map(void) } __max_logical_packages = DIV_ROUND_UP(total_cpus, ncpus); + logical_packages = 0; /* * Possibly larger than what we need as the number of apic ids per @@ -352,10 +354,6 @@ static void __init smp_init_package_map(void) memset(physical_to_logical_pkg, 0xff, size); size = BITS_TO_LONGS(max_physical_pkg_id) * sizeof(unsigned long); physical_package_map = kzalloc(size, GFP_KERNEL); - size = BITS_TO_LONGS(__max_logical_packages) * sizeof(unsigned long); - logical_package_map = kzalloc(size, GFP_KERNEL); - - pr_info("Max logical packages: %u\n", __max_logical_packages); for_each_present_cpu(cpu) { unsigned int apicid = apic->cpu_present_to_apicid(cpu); @@ -369,6 +367,15 @@ static void __init smp_init_package_map(void) set_cpu_possible(cpu, false); set_cpu_present(cpu, false); } + + if (logical_packages > __max_logical_packages) { + pr_warn("Detected more packages (%u), then computed by BIOS data (%u).\n", + logical_packages, __max_logical_packages); + logical_packages_frozen = true; + __max_logical_packages = logical_packages; + } + + pr_info("Max logical packages: %u\n", __max_logical_packages); } void __init smp_store_boot_cpu_info(void) diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 9ee98eefc44d..4738f5e0f2ab 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -5,7 +5,7 @@ */ #include <linux/sched.h> #include <linux/stacktrace.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/uaccess.h> #include <asm/stacktrace.h> diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index 9b0185fbe3eb..654f6c66fe45 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -323,25 +323,16 @@ static int tboot_wait_for_aps(int num_aps) return !(atomic_read((atomic_t *)&tboot->num_in_wfs) == num_aps); } -static int tboot_cpu_callback(struct notifier_block *nfb, unsigned long action, - void *hcpu) +static int tboot_dying_cpu(unsigned int cpu) { - switch (action) { - case CPU_DYING: - atomic_inc(&ap_wfs_count); - if (num_online_cpus() == 1) - if (tboot_wait_for_aps(atomic_read(&ap_wfs_count))) - return NOTIFY_BAD; - break; + atomic_inc(&ap_wfs_count); + if (num_online_cpus() == 1) { + if (tboot_wait_for_aps(atomic_read(&ap_wfs_count))) + return -EBUSY; } - return NOTIFY_OK; + return 0; } -static struct notifier_block tboot_cpu_notifier = -{ - .notifier_call = tboot_cpu_callback, -}; - #ifdef CONFIG_DEBUG_FS #define TBOOT_LOG_UUID { 0x26, 0x25, 0x19, 0xc0, 0x30, 0x6b, 0xb4, 0x4d, \ @@ -417,8 +408,8 @@ static __init int tboot_late_init(void) tboot_create_trampoline(); atomic_set(&ap_wfs_count, 0); - register_hotcpu_notifier(&tboot_cpu_notifier); - + cpuhp_setup_state(CPUHP_AP_X86_TBOOT_DYING, "AP_X86_TBOOT_DYING", NULL, + tboot_dying_cpu); #ifdef CONFIG_DEBUG_FS debugfs_create_file("tboot_log", S_IRUSR, arch_debugfs_dir, NULL, &tboot_log_fops); diff --git a/arch/x86/kernel/test_rodata.c b/arch/x86/kernel/test_rodata.c index cb4a01b41e27..222e84e2432e 100644 --- a/arch/x86/kernel/test_rodata.c +++ b/arch/x86/kernel/test_rodata.c @@ -9,7 +9,6 @@ * as published by the Free Software Foundation; version 2 * of the License. */ -#include <linux/module.h> #include <asm/cacheflush.h> #include <asm/sections.h> #include <asm/asm.h> @@ -74,7 +73,3 @@ int rodata_test(void) return 0; } - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("Testcase for marking rodata as read-only"); -MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>"); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 00f03d82e69a..b70ca12dd389 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -21,7 +21,7 @@ #include <linux/kdebug.h> #include <linux/kgdb.h> #include <linux/kernel.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/ptrace.h> #include <linux/uprobes.h> #include <linux/string.h> diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index a804b5ab32d0..78b9cb5a26af 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -3,7 +3,7 @@ #include <linux/kernel.h> #include <linux/sched.h> #include <linux/init.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/timer.h> #include <linux/acpi_pmtmr.h> #include <linux/cpufreq.h> @@ -22,6 +22,7 @@ #include <asm/nmi.h> #include <asm/x86_init.h> #include <asm/geode.h> +#include <asm/apic.h> unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */ EXPORT_SYMBOL(cpu_khz); @@ -1249,6 +1250,9 @@ static void tsc_refine_calibration_work(struct work_struct *work) (unsigned long)tsc_khz / 1000, (unsigned long)tsc_khz % 1000); + /* Inform the TSC deadline clockevent devices about the recalibration */ + lapic_update_tsc_freq(); + out: if (boot_cpu_has(X86_FEATURE_ART)) art_related_clocksource = &clocksource_tsc; diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 6c1ff31d99ff..495c776de4b4 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -357,20 +357,22 @@ static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn) *cursor &= 0xfe; } /* - * Similar treatment for VEX3 prefix. - * TODO: add XOP/EVEX treatment when insn decoder supports them + * Similar treatment for VEX3/EVEX prefix. + * TODO: add XOP treatment when insn decoder supports them */ - if (insn->vex_prefix.nbytes == 3) { + if (insn->vex_prefix.nbytes >= 3) { /* * vex2: c5 rvvvvLpp (has no b bit) * vex3/xop: c4/8f rxbmmmmm wvvvvLpp * evex: 62 rxbR00mm wvvvv1pp zllBVaaa - * (evex will need setting of both b and x since - * in non-sib encoding evex.x is 4th bit of MODRM.rm) - * Setting VEX3.b (setting because it has inverted meaning): + * Setting VEX3.b (setting because it has inverted meaning). + * Setting EVEX.x since (in non-SIB encoding) EVEX.x + * is the 4th bit of MODRM.rm, and needs the same treatment. + * For VEX3-encoded insns, VEX3.x value has no effect in + * non-SIB encoding, the change is superfluous but harmless. */ cursor = auprobe->insn + insn_offset_vex_prefix(insn) + 1; - *cursor |= 0x20; + *cursor |= 0x60; } /* @@ -415,12 +417,10 @@ static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn) reg = MODRM_REG(insn); /* Fetch modrm.reg */ reg2 = 0xff; /* Fetch vex.vvvv */ - if (insn->vex_prefix.nbytes == 2) - reg2 = insn->vex_prefix.bytes[1]; - else if (insn->vex_prefix.nbytes == 3) + if (insn->vex_prefix.nbytes) reg2 = insn->vex_prefix.bytes[2]; /* - * TODO: add XOP, EXEV vvvv reading. + * TODO: add XOP vvvv reading. * * vex.vvvv field is in bits 6-3, bits are inverted. * But in 32-bit mode, high-order bit may be ignored. diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index f1aebfb49c36..95e49f6e4fc3 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -1,7 +1,8 @@ /* Exports for assembly files. All C exports should go in the respective C files. */ -#include <linux/module.h> +#include <linux/export.h> +#include <linux/spinlock_types.h> #include <linux/smp.h> #include <net/checksum.h> diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 58b459296e13..76c5e52436c4 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -5,7 +5,7 @@ */ #include <linux/init.h> #include <linux/ioport.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/pci.h> #include <asm/bios_ebda.h> diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 639a6e34500c..ab8e32f7b9a8 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -32,7 +32,6 @@ config KVM select HAVE_KVM_IRQ_BYPASS select HAVE_KVM_IRQ_ROUTING select HAVE_KVM_EVENTFD - select KVM_APIC_ARCHITECTURE select KVM_ASYNC_PF select USER_RETURN_NOTIFIER select KVM_MMIO diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 7597b42a8a88..3235e0fe7792 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -13,7 +13,7 @@ */ #include <linux/kvm_host.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/vmalloc.h> #include <linux/uaccess.h> #include <asm/fpu/internal.h> /* For use_eager_fpu. Ugh! */ @@ -366,7 +366,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) | F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) | - F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(PCOMMIT); + F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB); /* cpuid 0xD.1.eax */ const u32 kvm_cpuid_D_1_eax_x86_features = diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index e17a74b1d852..35058c2c0eea 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -144,14 +144,6 @@ static inline bool guest_cpuid_has_rtm(struct kvm_vcpu *vcpu) return best && (best->ebx & bit(X86_FEATURE_RTM)); } -static inline bool guest_cpuid_has_pcommit(struct kvm_vcpu *vcpu) -{ - struct kvm_cpuid_entry2 *best; - - best = kvm_find_cpuid_entry(vcpu, 7, 0); - return best && (best->ebx & bit(X86_FEATURE_PCOMMIT)); -} - static inline bool guest_cpuid_has_rdtscp(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index a2f24af3c999..4e95d3eb2955 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -22,7 +22,6 @@ #include <linux/kvm_host.h> #include "kvm_cache_regs.h" -#include <linux/module.h> #include <asm/kvm_emulate.h> #include <linux/stringify.h> #include <asm/debugreg.h> diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index a4bf5b45d65a..5fb6c620180e 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -645,7 +645,6 @@ static const struct kvm_io_device_ops speaker_dev_ops = { .write = speaker_ioport_write, }; -/* Caller must hold slots_lock */ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) { struct kvm_pit *pit; @@ -690,6 +689,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) kvm_pit_set_reinject(pit, true); + mutex_lock(&kvm->slots_lock); kvm_iodevice_init(&pit->dev, &pit_dev_ops); ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, KVM_PIT_BASE_ADDRESS, KVM_PIT_MEM_LENGTH, &pit->dev); @@ -704,12 +704,14 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) if (ret < 0) goto fail_register_speaker; } + mutex_unlock(&kvm->slots_lock); return pit; fail_register_speaker: kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev); fail_register_pit: + mutex_unlock(&kvm->slots_lock); kvm_pit_set_reinject(pit, false); kthread_stop(pit->worker_task); fail_kthread: diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 5f42d038fcb4..c7220ba94aa7 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -109,6 +109,7 @@ static void __rtc_irq_eoi_tracking_restore_one(struct kvm_vcpu *vcpu) { bool new_val, old_val; struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic; + struct dest_map *dest_map = &ioapic->rtc_status.dest_map; union kvm_ioapic_redirect_entry *e; e = &ioapic->redirtbl[RTC_GSI]; @@ -117,16 +118,17 @@ static void __rtc_irq_eoi_tracking_restore_one(struct kvm_vcpu *vcpu) return; new_val = kvm_apic_pending_eoi(vcpu, e->fields.vector); - old_val = test_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map.map); + old_val = test_bit(vcpu->vcpu_id, dest_map->map); if (new_val == old_val) return; if (new_val) { - __set_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map.map); + __set_bit(vcpu->vcpu_id, dest_map->map); + dest_map->vectors[vcpu->vcpu_id] = e->fields.vector; ioapic->rtc_status.pending_eoi++; } else { - __clear_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map.map); + __clear_bit(vcpu->vcpu_id, dest_map->map); ioapic->rtc_status.pending_eoi--; rtc_status_pending_eoi_check_valid(ioapic); } diff --git a/arch/x86/kvm/iommu.c b/arch/x86/kvm/iommu.c index 3069281904d3..b181426f67b4 100644 --- a/arch/x86/kvm/iommu.c +++ b/arch/x86/kvm/iommu.c @@ -25,12 +25,10 @@ #include <linux/list.h> #include <linux/kvm_host.h> -#include <linux/module.h> +#include <linux/moduleparam.h> #include <linux/pci.h> #include <linux/stat.h> -#include <linux/dmar.h> #include <linux/iommu.h> -#include <linux/intel-iommu.h> #include "assigned-dev.h" static bool allow_unsafe_assigned_interrupts; diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 95fcc7b13866..60d91c9d160c 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -20,7 +20,7 @@ * */ -#include <linux/module.h> +#include <linux/export.h> #include <linux/kvm_host.h> #include "irq.h" diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 61ebdc13a29a..035731eb3897 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -120,4 +120,7 @@ void __kvm_migrate_timers(struct kvm_vcpu *vcpu); int apic_has_pending_timer(struct kvm_vcpu *vcpu); +int kvm_setup_default_irq_routing(struct kvm *kvm); +int kvm_setup_empty_irq_routing(struct kvm *kvm); + #endif diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index dfb4c6476877..25810b144b58 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -110,13 +110,17 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, return r; } -void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, +void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, struct kvm_lapic_irq *irq) { - trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data); + trace_kvm_msi_set_irq(e->msi.address_lo | (kvm->arch.x2apic_format ? + (u64)e->msi.address_hi << 32 : 0), + e->msi.data); irq->dest_id = (e->msi.address_lo & MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT; + if (kvm->arch.x2apic_format) + irq->dest_id |= MSI_ADDR_EXT_DEST_ID(e->msi.address_hi); irq->vector = (e->msi.data & MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT; irq->dest_mode = (1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo; @@ -129,15 +133,24 @@ void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, } EXPORT_SYMBOL_GPL(kvm_set_msi_irq); +static inline bool kvm_msi_route_invalid(struct kvm *kvm, + struct kvm_kernel_irq_routing_entry *e) +{ + return kvm->arch.x2apic_format && (e->msi.address_hi & 0xff); +} + int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq_source_id, int level, bool line_status) { struct kvm_lapic_irq irq; + if (kvm_msi_route_invalid(kvm, e)) + return -EINVAL; + if (!level) return -1; - kvm_set_msi_irq(e, &irq); + kvm_set_msi_irq(kvm, e, &irq); return kvm_irq_delivery_to_apic(kvm, NULL, &irq, NULL); } @@ -153,7 +166,10 @@ int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, if (unlikely(e->type != KVM_IRQ_ROUTING_MSI)) return -EWOULDBLOCK; - kvm_set_msi_irq(e, &irq); + if (kvm_msi_route_invalid(kvm, e)) + return -EINVAL; + + kvm_set_msi_irq(kvm, e, &irq); if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL)) return r; @@ -248,7 +264,8 @@ static int kvm_hv_set_sint(struct kvm_kernel_irq_routing_entry *e, return kvm_hv_synic_set_irq(kvm, e->hv_sint.vcpu, e->hv_sint.sint); } -int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e, +int kvm_set_routing_entry(struct kvm *kvm, + struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue) { int r = -EINVAL; @@ -285,6 +302,9 @@ int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e, e->msi.address_lo = ue->u.msi.address_lo; e->msi.address_hi = ue->u.msi.address_hi; e->msi.data = ue->u.msi.data; + + if (kvm_msi_route_invalid(kvm, e)) + goto out; break; case KVM_IRQ_ROUTING_HV_SINT: e->set = kvm_hv_set_sint; @@ -388,21 +408,16 @@ void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, kvm->arch.nr_reserved_ioapic_pins); for (i = 0; i < nr_ioapic_pins; ++i) { hlist_for_each_entry(entry, &table->map[i], link) { - u32 dest_id, dest_mode; - bool level; + struct kvm_lapic_irq irq; if (entry->type != KVM_IRQ_ROUTING_MSI) continue; - dest_id = (entry->msi.address_lo >> 12) & 0xff; - dest_mode = (entry->msi.address_lo >> 2) & 0x1; - level = entry->msi.data & MSI_DATA_TRIGGER_LEVEL; - if (level && kvm_apic_match_dest(vcpu, NULL, 0, - dest_id, dest_mode)) { - u32 vector = entry->msi.data & 0xff; - - __set_bit(vector, - ioapic_handled_vectors); - } + + kvm_set_msi_irq(vcpu->kvm, entry, &irq); + + if (irq.level && kvm_apic_match_dest(vcpu, NULL, 0, + irq.dest_id, irq.dest_mode)) + __set_bit(irq.vector, ioapic_handled_vectors); } } srcu_read_unlock(&kvm->irq_srcu, idx); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index a397200281c1..b62c85229711 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -25,7 +25,7 @@ #include <linux/smp.h> #include <linux/hrtimer.h> #include <linux/io.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/math64.h> #include <linux/slab.h> #include <asm/processor.h> @@ -115,26 +115,43 @@ static inline int apic_enabled(struct kvm_lapic *apic) (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \ APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER) -/* The logical map is definitely wrong if we have multiple - * modes at the same time. (Physical map is always right.) - */ -static inline bool kvm_apic_logical_map_valid(struct kvm_apic_map *map) -{ - return !(map->mode & (map->mode - 1)); +static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map, + u32 dest_id, struct kvm_lapic ***cluster, u16 *mask) { + switch (map->mode) { + case KVM_APIC_MODE_X2APIC: { + u32 offset = (dest_id >> 16) * 16; + u32 max_apic_id = map->max_apic_id; + + if (offset <= max_apic_id) { + u8 cluster_size = min(max_apic_id - offset + 1, 16U); + + *cluster = &map->phys_map[offset]; + *mask = dest_id & (0xffff >> (16 - cluster_size)); + } else { + *mask = 0; + } + + return true; + } + case KVM_APIC_MODE_XAPIC_FLAT: + *cluster = map->xapic_flat_map; + *mask = dest_id & 0xff; + return true; + case KVM_APIC_MODE_XAPIC_CLUSTER: + *cluster = map->xapic_cluster_map[dest_id >> 4]; + *mask = dest_id & 0xf; + return true; + default: + /* Not optimized. */ + return false; + } } -static inline void -apic_logical_id(struct kvm_apic_map *map, u32 dest_id, u16 *cid, u16 *lid) +static void kvm_apic_map_free(struct rcu_head *rcu) { - unsigned lid_bits; - - BUILD_BUG_ON(KVM_APIC_MODE_XAPIC_CLUSTER != 4); - BUILD_BUG_ON(KVM_APIC_MODE_XAPIC_FLAT != 8); - BUILD_BUG_ON(KVM_APIC_MODE_X2APIC != 16); - lid_bits = map->mode; + struct kvm_apic_map *map = container_of(rcu, struct kvm_apic_map, rcu); - *cid = dest_id >> lid_bits; - *lid = dest_id & ((1 << lid_bits) - 1); + kvfree(map); } static void recalculate_apic_map(struct kvm *kvm) @@ -142,17 +159,26 @@ static void recalculate_apic_map(struct kvm *kvm) struct kvm_apic_map *new, *old = NULL; struct kvm_vcpu *vcpu; int i; - - new = kzalloc(sizeof(struct kvm_apic_map), GFP_KERNEL); + u32 max_id = 255; mutex_lock(&kvm->arch.apic_map_lock); + kvm_for_each_vcpu(i, vcpu, kvm) + if (kvm_apic_present(vcpu)) + max_id = max(max_id, kvm_apic_id(vcpu->arch.apic)); + + new = kvm_kvzalloc(sizeof(struct kvm_apic_map) + + sizeof(struct kvm_lapic *) * ((u64)max_id + 1)); + if (!new) goto out; + new->max_apic_id = max_id; + kvm_for_each_vcpu(i, vcpu, kvm) { struct kvm_lapic *apic = vcpu->arch.apic; - u16 cid, lid; + struct kvm_lapic **cluster; + u16 mask; u32 ldr, aid; if (!kvm_apic_present(vcpu)) @@ -161,7 +187,7 @@ static void recalculate_apic_map(struct kvm *kvm) aid = kvm_apic_id(apic); ldr = kvm_lapic_get_reg(apic, APIC_LDR); - if (aid < ARRAY_SIZE(new->phys_map)) + if (aid <= new->max_apic_id) new->phys_map[aid] = apic; if (apic_x2apic_mode(apic)) { @@ -174,13 +200,11 @@ static void recalculate_apic_map(struct kvm *kvm) new->mode |= KVM_APIC_MODE_XAPIC_CLUSTER; } - if (!kvm_apic_logical_map_valid(new)) + if (!kvm_apic_map_get_logical_dest(new, ldr, &cluster, &mask)) continue; - apic_logical_id(new, ldr, &cid, &lid); - - if (lid && cid < ARRAY_SIZE(new->logical_map)) - new->logical_map[cid][ffs(lid) - 1] = apic; + if (mask) + cluster[ffs(mask) - 1] = apic; } out: old = rcu_dereference_protected(kvm->arch.apic_map, @@ -189,7 +213,7 @@ out: mutex_unlock(&kvm->arch.apic_map_lock); if (old) - kfree_rcu(old, rcu); + call_rcu(&old->rcu, kvm_apic_map_free); kvm_make_scan_ioapic_request(kvm); } @@ -210,7 +234,7 @@ static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val) } } -static inline void kvm_apic_set_id(struct kvm_lapic *apic, u8 id) +static inline void kvm_apic_set_xapic_id(struct kvm_lapic *apic, u8 id) { kvm_lapic_set_reg(apic, APIC_ID, id << 24); recalculate_apic_map(apic->vcpu->kvm); @@ -222,11 +246,11 @@ static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id) recalculate_apic_map(apic->vcpu->kvm); } -static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u8 id) +static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u32 id) { u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf)); - kvm_lapic_set_reg(apic, APIC_ID, id << 24); + kvm_lapic_set_reg(apic, APIC_ID, id); kvm_lapic_set_reg(apic, APIC_LDR, ldr); recalculate_apic_map(apic->vcpu->kvm); } @@ -599,17 +623,30 @@ static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) } } -/* KVM APIC implementation has two quirks - * - dest always begins at 0 while xAPIC MDA has offset 24, - * - IOxAPIC messages have to be delivered (directly) to x2APIC. +/* The KVM local APIC implementation has two quirks: + * + * - the xAPIC MDA stores the destination at bits 24-31, while this + * is not true of struct kvm_lapic_irq's dest_id field. This is + * just a quirk in the API and is not problematic. + * + * - in-kernel IOAPIC messages have to be delivered directly to + * x2APIC, because the kernel does not support interrupt remapping. + * In order to support broadcast without interrupt remapping, x2APIC + * rewrites the destination of non-IPI messages from APIC_BROADCAST + * to X2APIC_BROADCAST. + * + * The broadcast quirk can be disabled with KVM_CAP_X2APIC_API. This is + * important when userspace wants to use x2APIC-format MSIs, because + * APIC_BROADCAST (0xff) is a legal route for "cluster 0, CPUs 0-7". */ -static u32 kvm_apic_mda(unsigned int dest_id, struct kvm_lapic *source, - struct kvm_lapic *target) +static u32 kvm_apic_mda(struct kvm_vcpu *vcpu, unsigned int dest_id, + struct kvm_lapic *source, struct kvm_lapic *target) { bool ipi = source != NULL; bool x2apic_mda = apic_x2apic_mode(ipi ? source : target); - if (!ipi && dest_id == APIC_BROADCAST && x2apic_mda) + if (!vcpu->kvm->arch.x2apic_broadcast_quirk_disabled && + !ipi && dest_id == APIC_BROADCAST && x2apic_mda) return X2APIC_BROADCAST; return x2apic_mda ? dest_id : SET_APIC_DEST_FIELD(dest_id); @@ -619,7 +656,7 @@ bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int short_hand, unsigned int dest, int dest_mode) { struct kvm_lapic *target = vcpu->arch.apic; - u32 mda = kvm_apic_mda(dest, source, target); + u32 mda = kvm_apic_mda(vcpu, dest, source, target); apic_debug("target %p, source %p, dest 0x%x, " "dest_mode 0x%x, short_hand 0x%x\n", @@ -671,102 +708,126 @@ static void kvm_apic_disabled_lapic_found(struct kvm *kvm) } } -bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, - struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map) +static bool kvm_apic_is_broadcast_dest(struct kvm *kvm, struct kvm_lapic **src, + struct kvm_lapic_irq *irq, struct kvm_apic_map *map) { - struct kvm_apic_map *map; - unsigned long bitmap = 1; - struct kvm_lapic **dst; - int i; - bool ret, x2apic_ipi; + if (kvm->arch.x2apic_broadcast_quirk_disabled) { + if ((irq->dest_id == APIC_BROADCAST && + map->mode != KVM_APIC_MODE_X2APIC)) + return true; + if (irq->dest_id == X2APIC_BROADCAST) + return true; + } else { + bool x2apic_ipi = src && *src && apic_x2apic_mode(*src); + if (irq->dest_id == (x2apic_ipi ? + X2APIC_BROADCAST : APIC_BROADCAST)) + return true; + } - *r = -1; + return false; +} - if (irq->shorthand == APIC_DEST_SELF) { - *r = kvm_apic_set_irq(src->vcpu, irq, dest_map); - return true; - } +/* Return true if the interrupt can be handled by using *bitmap as index mask + * for valid destinations in *dst array. + * Return false if kvm_apic_map_get_dest_lapic did nothing useful. + * Note: we may have zero kvm_lapic destinations when we return true, which + * means that the interrupt should be dropped. In this case, *bitmap would be + * zero and *dst undefined. + */ +static inline bool kvm_apic_map_get_dest_lapic(struct kvm *kvm, + struct kvm_lapic **src, struct kvm_lapic_irq *irq, + struct kvm_apic_map *map, struct kvm_lapic ***dst, + unsigned long *bitmap) +{ + int i, lowest; - if (irq->shorthand) + if (irq->shorthand == APIC_DEST_SELF && src) { + *dst = src; + *bitmap = 1; + return true; + } else if (irq->shorthand) return false; - x2apic_ipi = src && apic_x2apic_mode(src); - if (irq->dest_id == (x2apic_ipi ? X2APIC_BROADCAST : APIC_BROADCAST)) + if (!map || kvm_apic_is_broadcast_dest(kvm, src, irq, map)) return false; - ret = true; - rcu_read_lock(); - map = rcu_dereference(kvm->arch.apic_map); - - if (!map) { - ret = false; - goto out; + if (irq->dest_mode == APIC_DEST_PHYSICAL) { + if (irq->dest_id > map->max_apic_id) { + *bitmap = 0; + } else { + *dst = &map->phys_map[irq->dest_id]; + *bitmap = 1; + } + return true; } - if (irq->dest_mode == APIC_DEST_PHYSICAL) { - if (irq->dest_id >= ARRAY_SIZE(map->phys_map)) - goto out; + *bitmap = 0; + if (!kvm_apic_map_get_logical_dest(map, irq->dest_id, dst, + (u16 *)bitmap)) + return false; - dst = &map->phys_map[irq->dest_id]; - } else { - u16 cid; + if (!kvm_lowest_prio_delivery(irq)) + return true; - if (!kvm_apic_logical_map_valid(map)) { - ret = false; - goto out; + if (!kvm_vector_hashing_enabled()) { + lowest = -1; + for_each_set_bit(i, bitmap, 16) { + if (!(*dst)[i]) + continue; + if (lowest < 0) + lowest = i; + else if (kvm_apic_compare_prio((*dst)[i]->vcpu, + (*dst)[lowest]->vcpu) < 0) + lowest = i; } + } else { + if (!*bitmap) + return true; - apic_logical_id(map, irq->dest_id, &cid, (u16 *)&bitmap); + lowest = kvm_vector_to_index(irq->vector, hweight16(*bitmap), + bitmap, 16); - if (cid >= ARRAY_SIZE(map->logical_map)) - goto out; + if (!(*dst)[lowest]) { + kvm_apic_disabled_lapic_found(kvm); + *bitmap = 0; + return true; + } + } - dst = map->logical_map[cid]; + *bitmap = (lowest >= 0) ? 1 << lowest : 0; - if (!kvm_lowest_prio_delivery(irq)) - goto set_irq; + return true; +} - if (!kvm_vector_hashing_enabled()) { - int l = -1; - for_each_set_bit(i, &bitmap, 16) { - if (!dst[i]) - continue; - if (l < 0) - l = i; - else if (kvm_apic_compare_prio(dst[i]->vcpu, - dst[l]->vcpu) < 0) - l = i; - } - bitmap = (l >= 0) ? 1 << l : 0; - } else { - int idx; - unsigned int dest_vcpus; +bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, + struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map) +{ + struct kvm_apic_map *map; + unsigned long bitmap; + struct kvm_lapic **dst = NULL; + int i; + bool ret; - dest_vcpus = hweight16(bitmap); - if (dest_vcpus == 0) - goto out; + *r = -1; - idx = kvm_vector_to_index(irq->vector, - dest_vcpus, &bitmap, 16); + if (irq->shorthand == APIC_DEST_SELF) { + *r = kvm_apic_set_irq(src->vcpu, irq, dest_map); + return true; + } - if (!dst[idx]) { - kvm_apic_disabled_lapic_found(kvm); - goto out; - } + rcu_read_lock(); + map = rcu_dereference(kvm->arch.apic_map); - bitmap = (idx >= 0) ? 1 << idx : 0; + ret = kvm_apic_map_get_dest_lapic(kvm, &src, irq, map, &dst, &bitmap); + if (ret) + for_each_set_bit(i, &bitmap, 16) { + if (!dst[i]) + continue; + if (*r < 0) + *r = 0; + *r += kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map); } - } -set_irq: - for_each_set_bit(i, &bitmap, 16) { - if (!dst[i]) - continue; - if (*r < 0) - *r = 0; - *r += kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map); - } -out: rcu_read_unlock(); return ret; } @@ -789,8 +850,9 @@ bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq, struct kvm_vcpu **dest_vcpu) { struct kvm_apic_map *map; + unsigned long bitmap; + struct kvm_lapic **dst = NULL; bool ret = false; - struct kvm_lapic *dst = NULL; if (irq->shorthand) return false; @@ -798,69 +860,16 @@ bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq, rcu_read_lock(); map = rcu_dereference(kvm->arch.apic_map); - if (!map) - goto out; - - if (irq->dest_mode == APIC_DEST_PHYSICAL) { - if (irq->dest_id == 0xFF) - goto out; - - if (irq->dest_id >= ARRAY_SIZE(map->phys_map)) - goto out; - - dst = map->phys_map[irq->dest_id]; - if (dst && kvm_apic_present(dst->vcpu)) - *dest_vcpu = dst->vcpu; - else - goto out; - } else { - u16 cid; - unsigned long bitmap = 1; - int i, r = 0; - - if (!kvm_apic_logical_map_valid(map)) - goto out; - - apic_logical_id(map, irq->dest_id, &cid, (u16 *)&bitmap); - - if (cid >= ARRAY_SIZE(map->logical_map)) - goto out; - - if (kvm_vector_hashing_enabled() && - kvm_lowest_prio_delivery(irq)) { - int idx; - unsigned int dest_vcpus; + if (kvm_apic_map_get_dest_lapic(kvm, NULL, irq, map, &dst, &bitmap) && + hweight16(bitmap) == 1) { + unsigned long i = find_first_bit(&bitmap, 16); - dest_vcpus = hweight16(bitmap); - if (dest_vcpus == 0) - goto out; - - idx = kvm_vector_to_index(irq->vector, dest_vcpus, - &bitmap, 16); - - dst = map->logical_map[cid][idx]; - if (!dst) { - kvm_apic_disabled_lapic_found(kvm); - goto out; - } - - *dest_vcpu = dst->vcpu; - } else { - for_each_set_bit(i, &bitmap, 16) { - dst = map->logical_map[cid][i]; - if (++r == 2) - goto out; - } - - if (dst && kvm_apic_present(dst->vcpu)) - *dest_vcpu = dst->vcpu; - else - goto out; + if (dst[i]) { + *dest_vcpu = dst[i]->vcpu; + ret = true; } } - ret = true; -out: rcu_read_unlock(); return ret; } @@ -1127,12 +1136,6 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset) return 0; switch (offset) { - case APIC_ID: - if (apic_x2apic_mode(apic)) - val = kvm_apic_id(apic); - else - val = kvm_apic_id(apic) << 24; - break; case APIC_ARBPRI: apic_debug("Access APIC ARBPRI register which is for P6\n"); break; @@ -1314,6 +1317,111 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu) nsec_to_cycles(vcpu, lapic_timer_advance_ns))); } +static void start_sw_tscdeadline(struct kvm_lapic *apic) +{ + u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline; + u64 ns = 0; + ktime_t expire; + struct kvm_vcpu *vcpu = apic->vcpu; + unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz; + unsigned long flags; + ktime_t now; + + if (unlikely(!tscdeadline || !this_tsc_khz)) + return; + + local_irq_save(flags); + + now = apic->lapic_timer.timer.base->get_time(); + guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); + if (likely(tscdeadline > guest_tsc)) { + ns = (tscdeadline - guest_tsc) * 1000000ULL; + do_div(ns, this_tsc_khz); + expire = ktime_add_ns(now, ns); + expire = ktime_sub_ns(expire, lapic_timer_advance_ns); + hrtimer_start(&apic->lapic_timer.timer, + expire, HRTIMER_MODE_ABS_PINNED); + } else + apic_timer_expired(apic); + + local_irq_restore(flags); +} + +bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu) +{ + if (!lapic_in_kernel(vcpu)) + return false; + + return vcpu->arch.apic->lapic_timer.hv_timer_in_use; +} +EXPORT_SYMBOL_GPL(kvm_lapic_hv_timer_in_use); + +static void cancel_hv_tscdeadline(struct kvm_lapic *apic) +{ + kvm_x86_ops->cancel_hv_timer(apic->vcpu); + apic->lapic_timer.hv_timer_in_use = false; +} + +void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + + WARN_ON(!apic->lapic_timer.hv_timer_in_use); + WARN_ON(swait_active(&vcpu->wq)); + cancel_hv_tscdeadline(apic); + apic_timer_expired(apic); +} +EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer); + +static bool start_hv_tscdeadline(struct kvm_lapic *apic) +{ + u64 tscdeadline = apic->lapic_timer.tscdeadline; + + if (atomic_read(&apic->lapic_timer.pending) || + kvm_x86_ops->set_hv_timer(apic->vcpu, tscdeadline)) { + if (apic->lapic_timer.hv_timer_in_use) + cancel_hv_tscdeadline(apic); + } else { + apic->lapic_timer.hv_timer_in_use = true; + hrtimer_cancel(&apic->lapic_timer.timer); + + /* In case the sw timer triggered in the window */ + if (atomic_read(&apic->lapic_timer.pending)) + cancel_hv_tscdeadline(apic); + } + trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, + apic->lapic_timer.hv_timer_in_use); + return apic->lapic_timer.hv_timer_in_use; +} + +void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + + WARN_ON(apic->lapic_timer.hv_timer_in_use); + + if (apic_lvtt_tscdeadline(apic)) + start_hv_tscdeadline(apic); +} +EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_hv_timer); + +void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + + /* Possibly the TSC deadline timer is not enabled yet */ + if (!apic->lapic_timer.hv_timer_in_use) + return; + + cancel_hv_tscdeadline(apic); + + if (atomic_read(&apic->lapic_timer.pending)) + return; + + start_sw_tscdeadline(apic); +} +EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer); + static void start_apic_timer(struct kvm_lapic *apic) { ktime_t now; @@ -1360,32 +1468,8 @@ static void start_apic_timer(struct kvm_lapic *apic) ktime_to_ns(ktime_add_ns(now, apic->lapic_timer.period))); } else if (apic_lvtt_tscdeadline(apic)) { - /* lapic timer in tsc deadline mode */ - u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline; - u64 ns = 0; - ktime_t expire; - struct kvm_vcpu *vcpu = apic->vcpu; - unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz; - unsigned long flags; - - if (unlikely(!tscdeadline || !this_tsc_khz)) - return; - - local_irq_save(flags); - - now = apic->lapic_timer.timer.base->get_time(); - guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); - if (likely(tscdeadline > guest_tsc)) { - ns = (tscdeadline - guest_tsc) * 1000000ULL; - do_div(ns, this_tsc_khz); - expire = ktime_add_ns(now, ns); - expire = ktime_sub_ns(expire, lapic_timer_advance_ns); - hrtimer_start(&apic->lapic_timer.timer, - expire, HRTIMER_MODE_ABS_PINNED); - } else - apic_timer_expired(apic); - - local_irq_restore(flags); + if (!(kvm_x86_ops->set_hv_timer && start_hv_tscdeadline(apic))) + start_sw_tscdeadline(apic); } } @@ -1413,7 +1497,7 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) switch (reg) { case APIC_ID: /* Local APIC ID */ if (!apic_x2apic_mode(apic)) - kvm_apic_set_id(apic, val >> 24); + kvm_apic_set_xapic_id(apic, val >> 24); else ret = 1; break; @@ -1674,9 +1758,10 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) /* update jump label if enable bit changes */ if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) { - if (value & MSR_IA32_APICBASE_ENABLE) + if (value & MSR_IA32_APICBASE_ENABLE) { + kvm_apic_set_xapic_id(apic, vcpu->vcpu_id); static_key_slow_dec_deferred(&apic_hw_disabled); - else + } else static_key_slow_inc(&apic_hw_disabled.key); recalculate_apic_map(vcpu->kvm); } @@ -1716,8 +1801,11 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) /* Stop the timer in case it's a reset to an active apic */ hrtimer_cancel(&apic->lapic_timer.timer); - if (!init_event) - kvm_apic_set_id(apic, vcpu->vcpu_id); + if (!init_event) { + kvm_lapic_set_base(vcpu, APIC_DEFAULT_PHYS_BASE | + MSR_IA32_APICBASE_ENABLE); + kvm_apic_set_xapic_id(apic, vcpu->vcpu_id); + } kvm_apic_set_version(apic->vcpu); for (i = 0; i < KVM_APIC_LVT_NUM; i++) @@ -1856,9 +1944,6 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) * thinking that APIC satet has changed. */ vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE; - kvm_lapic_set_base(vcpu, - APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE); - static_key_slow_inc(&apic_sw_disabled.key); /* sw disabled at reset */ kvm_lapic_reset(vcpu, false); kvm_iodevice_init(&apic->dev, &apic_mmio_ops); @@ -1938,17 +2023,48 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) return vector; } -void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, - struct kvm_lapic_state *s) +static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu, + struct kvm_lapic_state *s, bool set) +{ + if (apic_x2apic_mode(vcpu->arch.apic)) { + u32 *id = (u32 *)(s->regs + APIC_ID); + + if (vcpu->kvm->arch.x2apic_format) { + if (*id != vcpu->vcpu_id) + return -EINVAL; + } else { + if (set) + *id >>= 24; + else + *id <<= 24; + } + } + + return 0; +} + +int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) +{ + memcpy(s->regs, vcpu->arch.apic->regs, sizeof(*s)); + return kvm_apic_state_fixup(vcpu, s, false); +} + +int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { struct kvm_lapic *apic = vcpu->arch.apic; + int r; + kvm_lapic_set_base(vcpu, vcpu->arch.apic_base); /* set SPIV separately to get count of SW disabled APICs right */ apic_set_spiv(apic, *((u32 *)(s->regs + APIC_SPIV))); + + r = kvm_apic_state_fixup(vcpu, s, true); + if (r) + return r; memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); - /* call kvm_apic_set_id() to put apic into apic_map */ - kvm_apic_set_id(apic, kvm_apic_id(apic)); + + recalculate_apic_map(vcpu->kvm); kvm_apic_set_version(vcpu); apic_update_ppr(apic); @@ -1974,6 +2090,8 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, kvm_rtc_eoi_tracking_restore_one(vcpu); vcpu->arch.apic_arb_prio = 0; + + return 0; } void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 891c6da7d4aa..f60d01c29d51 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -20,6 +20,7 @@ struct kvm_timer { u64 tscdeadline; u64 expired_tscdeadline; atomic_t pending; /* accumulated triggered timers */ + bool hv_timer_in_use; }; struct kvm_lapic { @@ -80,8 +81,8 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, u64 kvm_get_apic_base(struct kvm_vcpu *vcpu); int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info); -void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, - struct kvm_lapic_state *s); +int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s); +int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s); int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu); @@ -199,9 +200,15 @@ static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu) return lapic_in_kernel(vcpu) && test_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); } -static inline int kvm_apic_id(struct kvm_lapic *apic) +static inline u32 kvm_apic_id(struct kvm_lapic *apic) { - return (kvm_lapic_get_reg(apic, APIC_ID) >> 24) & 0xff; + /* To avoid a race between apic_base and following APIC_ID update when + * switching to x2apic_mode, the x2apic mode returns initial x2apic id. + */ + if (apic_x2apic_mode(apic)) + return apic->vcpu->vcpu_id; + + return kvm_lapic_get_reg(apic, APIC_ID) >> 24; } bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector); @@ -212,4 +219,8 @@ bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq, struct kvm_vcpu **dest_vcpu); int kvm_vector_to_index(u32 vector, u32 dest_vcpus, const unsigned long *bitmap, u32 bitmap_size); +void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu); +void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu); +void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu); +bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu); #endif diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index def97b3a392b..3d4cc8cc56a3 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -29,7 +29,8 @@ #include <linux/string.h> #include <linux/mm.h> #include <linux/highmem.h> -#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/export.h> #include <linux/swap.h> #include <linux/hugetlb.h> #include <linux/compiler.h> @@ -175,6 +176,7 @@ static u64 __read_mostly shadow_user_mask; static u64 __read_mostly shadow_accessed_mask; static u64 __read_mostly shadow_dirty_mask; static u64 __read_mostly shadow_mmio_mask; +static u64 __read_mostly shadow_present_mask; static void mmu_spte_set(u64 *sptep, u64 spte); static void mmu_free_roots(struct kvm_vcpu *vcpu); @@ -282,13 +284,14 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte) } void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, - u64 dirty_mask, u64 nx_mask, u64 x_mask) + u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask) { shadow_user_mask = user_mask; shadow_accessed_mask = accessed_mask; shadow_dirty_mask = dirty_mask; shadow_nx_mask = nx_mask; shadow_x_mask = x_mask; + shadow_present_mask = p_mask; } EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); @@ -304,7 +307,7 @@ static int is_nx(struct kvm_vcpu *vcpu) static int is_shadow_present_pte(u64 pte) { - return pte & PT_PRESENT_MASK && !is_mmio_spte(pte); + return (pte & 0xFFFFFFFFull) && !is_mmio_spte(pte); } static int is_large_pte(u64 pte) @@ -523,7 +526,7 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte) } /* Rules for using mmu_spte_update: - * Update the state bits, it means the mapped pfn is not changged. + * Update the state bits, it means the mapped pfn is not changed. * * Whenever we overwrite a writable spte with a read-only one we * should flush remote TLBs. Otherwise rmap_write_protect @@ -2245,10 +2248,9 @@ static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep, { u64 spte; - BUILD_BUG_ON(VMX_EPT_READABLE_MASK != PT_PRESENT_MASK || - VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK); + BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK); - spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK | + spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK | shadow_user_mask | shadow_x_mask | shadow_accessed_mask; mmu_spte_set(sptep, spte); @@ -2515,13 +2517,19 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, kvm_pfn_t pfn, bool speculative, bool can_unsync, bool host_writable) { - u64 spte; + u64 spte = 0; int ret = 0; if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access)) return 0; - spte = PT_PRESENT_MASK; + /* + * For the EPT case, shadow_present_mask is 0 if hardware + * supports exec-only page table entries. In that case, + * ACC_USER_MASK and shadow_user_mask are used to represent + * read access. See FNAME(gpte_access) in paging_tmpl.h. + */ + spte |= shadow_present_mask; if (!speculative) spte |= shadow_accessed_mask; @@ -3189,7 +3197,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) MMU_WARN_ON(VALID_PAGE(root)); if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { pdptr = vcpu->arch.mmu.get_pdptr(vcpu, i); - if (!is_present_gpte(pdptr)) { + if (!(pdptr & PT_PRESENT_MASK)) { vcpu->arch.mmu.pae_root[i] = 0; continue; } @@ -3914,9 +3922,7 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu, * clearer. */ smap = cr4_smap && u && !uf && !ff; - } else - /* Not really needed: no U/S accesses on ept */ - u = 1; + } fault = (ff && !x) || (uf && !u) || (wf && !w) || (smapf && smap); diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 66b33b96a31b..ddc56e91f2e4 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -93,11 +93,6 @@ static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) return kvm_mmu_load(vcpu); } -static inline int is_present_gpte(unsigned long pte) -{ - return pte & PT_PRESENT_MASK; -} - /* * Currently, we have two sorts of write-protection, a) the first one * write-protects guest page to sync the guest modification, b) another one is diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index bc019f70e0b6..a01105485315 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -131,7 +131,7 @@ static inline void FNAME(protect_clean_gpte)(unsigned *access, unsigned gpte) static inline int FNAME(is_present_gpte)(unsigned long pte) { #if PTTYPE != PTTYPE_EPT - return is_present_gpte(pte); + return pte & PT_PRESENT_MASK; #else return pte & 7; #endif @@ -181,13 +181,19 @@ no_present: return true; } +/* + * For PTTYPE_EPT, a page table can be executable but not readable + * on supported processors. Therefore, set_spte does not automatically + * set bit 0 if execute only is supported. Here, we repurpose ACC_USER_MASK + * to signify readability since it isn't used in the EPT case + */ static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte) { unsigned access; #if PTTYPE == PTTYPE_EPT access = ((gpte & VMX_EPT_WRITABLE_MASK) ? ACC_WRITE_MASK : 0) | ((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) | - ACC_USER_MASK; + ((gpte & VMX_EPT_READABLE_MASK) ? ACC_USER_MASK : 0); #else BUILD_BUG_ON(ACC_EXEC_MASK != PT_PRESENT_MASK); BUILD_BUG_ON(ACC_EXEC_MASK != 1); diff --git a/arch/x86/kvm/pmu_amd.c b/arch/x86/kvm/pmu_amd.c index 39b91127ef07..cd944435dfbd 100644 --- a/arch/x86/kvm/pmu_amd.c +++ b/arch/x86/kvm/pmu_amd.c @@ -23,8 +23,8 @@ static struct kvm_event_hw_type_mapping amd_event_mapping[] = { [0] = { 0x76, 0x00, PERF_COUNT_HW_CPU_CYCLES }, [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS }, - [2] = { 0x80, 0x00, PERF_COUNT_HW_CACHE_REFERENCES }, - [3] = { 0x81, 0x00, PERF_COUNT_HW_CACHE_MISSES }, + [2] = { 0x7d, 0x07, PERF_COUNT_HW_CACHE_REFERENCES }, + [3] = { 0x7e, 0x07, PERF_COUNT_HW_CACHE_MISSES }, [4] = { 0xc2, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, [5] = { 0xc3, 0x00, PERF_COUNT_HW_BRANCH_MISSES }, [6] = { 0xd0, 0x00, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND }, diff --git a/arch/x86/kvm/pmu_intel.c b/arch/x86/kvm/pmu_intel.c index ab38af4f4947..9d4a8504a95a 100644 --- a/arch/x86/kvm/pmu_intel.c +++ b/arch/x86/kvm/pmu_intel.c @@ -93,7 +93,7 @@ static unsigned intel_find_fixed_event(int idx) return intel_arch_events[fixed_pmc_events[idx]].event_type; } -/* check if a PMC is enabled by comparising it with globl_ctrl bits. */ +/* check if a PMC is enabled by comparing it with globl_ctrl bits. */ static bool intel_pmc_is_enabled(struct kvm_pmc *pmc) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 16ef31b87452..af523d84d102 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1577,7 +1577,7 @@ static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { /* - * Any change of EFLAGS.VM is accompained by a reload of SS + * Any change of EFLAGS.VM is accompanied by a reload of SS * (caused by either a task switch or an inter-privilege IRET), * so we do not need to update the CPL here. */ @@ -4940,6 +4940,12 @@ out: static void svm_handle_external_intr(struct kvm_vcpu *vcpu) { local_irq_enable(); + /* + * We must have an instruction with interrupts enabled, so + * the timer interrupt isn't delayed by the interrupt shadow. + */ + asm("nop"); + local_irq_disable(); } static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu) diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 8de925031b5c..0a6cc6754ec5 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -1348,6 +1348,21 @@ TRACE_EVENT(kvm_avic_unaccelerated_access, __entry->vec) ); +TRACE_EVENT(kvm_hv_timer_state, + TP_PROTO(unsigned int vcpu_id, unsigned int hv_timer_in_use), + TP_ARGS(vcpu_id, hv_timer_in_use), + TP_STRUCT__entry( + __field(unsigned int, vcpu_id) + __field(unsigned int, hv_timer_in_use) + ), + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->hv_timer_in_use = hv_timer_in_use; + ), + TP_printk("vcpu_id %x hv_timer %x\n", + __entry->vcpu_id, + __entry->hv_timer_in_use) +); #endif /* _TRACE_KVM_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 7758680db20b..5cede40e2552 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -110,6 +110,13 @@ module_param_named(pml, enable_pml, bool, S_IRUGO); #define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL +/* Guest_tsc -> host_tsc conversion requires 64-bit division. */ +static int __read_mostly cpu_preemption_timer_multi; +static bool __read_mostly enable_preemption_timer = 1; +#ifdef CONFIG_X86_64 +module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO); +#endif + #define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD) #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE) #define KVM_VM_CR0_ALWAYS_ON \ @@ -398,6 +405,12 @@ struct nested_vmx { /* The host-usable pointer to the above */ struct page *current_vmcs12_page; struct vmcs12 *current_vmcs12; + /* + * Cache of the guest's VMCS, existing outside of guest memory. + * Loaded from guest memory during VMPTRLD. Flushed to guest + * memory during VMXOFF, VMCLEAR, VMPTRLD. + */ + struct vmcs12 *cached_vmcs12; struct vmcs *current_shadow_vmcs; /* * Indicates if the shadow vmcs must be updated with the @@ -409,6 +422,7 @@ struct nested_vmx { struct list_head vmcs02_pool; int vmcs02_num; u64 vmcs01_tsc_offset; + bool change_vmcs01_virtual_x2apic_mode; /* L2 must run next, and mustn't decide to exit to L1. */ bool nested_run_pending; /* @@ -421,7 +435,8 @@ struct nested_vmx { struct pi_desc *pi_desc; bool pi_pending; u16 posted_intr_nv; - u64 msr_ia32_feature_control; + + unsigned long *msr_bitmap; struct hrtimer preemption_timer; bool preemption_timer_expired; @@ -597,11 +612,22 @@ struct vcpu_vmx { #define PML_ENTITY_NUM 512 struct page *pml_pg; + /* apic deadline value in host tsc */ + u64 hv_deadline_tsc; + u64 current_tsc_ratio; bool guest_pkru_valid; u32 guest_pkru; u32 host_pkru; + + /* + * Only bits masked by msr_ia32_feature_control_valid_bits can be set in + * msr_ia32_feature_control. FEATURE_CONTROL_LOCKED is always included + * in msr_ia32_feature_control_valid_bits. + */ + u64 msr_ia32_feature_control; + u64 msr_ia32_feature_control_valid_bits; }; enum segment_cache_field { @@ -841,7 +867,7 @@ static inline short vmcs_field_to_offset(unsigned long field) static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu) { - return to_vmx(vcpu)->nested.current_vmcs12; + return to_vmx(vcpu)->nested.cached_vmcs12; } static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr) @@ -901,7 +927,6 @@ static unsigned long *vmx_msr_bitmap_legacy; static unsigned long *vmx_msr_bitmap_longmode; static unsigned long *vmx_msr_bitmap_legacy_x2apic; static unsigned long *vmx_msr_bitmap_longmode_x2apic; -static unsigned long *vmx_msr_bitmap_nested; static unsigned long *vmx_vmread_bitmap; static unsigned long *vmx_vmwrite_bitmap; @@ -1056,6 +1081,58 @@ static inline bool cpu_has_vmx_virtual_intr_delivery(void) SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY; } +/* + * Comment's format: document - errata name - stepping - processor name. + * Refer from + * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp + */ +static u32 vmx_preemption_cpu_tfms[] = { +/* 323344.pdf - BA86 - D0 - Xeon 7500 Series */ +0x000206E6, +/* 323056.pdf - AAX65 - C2 - Xeon L3406 */ +/* 322814.pdf - AAT59 - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */ +/* 322911.pdf - AAU65 - C2 - i5-600, i3-500 Desktop and Pentium G6950 */ +0x00020652, +/* 322911.pdf - AAU65 - K0 - i5-600, i3-500 Desktop and Pentium G6950 */ +0x00020655, +/* 322373.pdf - AAO95 - B1 - Xeon 3400 Series */ +/* 322166.pdf - AAN92 - B1 - i7-800 and i5-700 Desktop */ +/* + * 320767.pdf - AAP86 - B1 - + * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile + */ +0x000106E5, +/* 321333.pdf - AAM126 - C0 - Xeon 3500 */ +0x000106A0, +/* 321333.pdf - AAM126 - C1 - Xeon 3500 */ +0x000106A1, +/* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */ +0x000106A4, + /* 321333.pdf - AAM126 - D0 - Xeon 3500 */ + /* 321324.pdf - AAK139 - D0 - Xeon 5500 */ + /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */ +0x000106A5, +}; + +static inline bool cpu_has_broken_vmx_preemption_timer(void) +{ + u32 eax = cpuid_eax(0x00000001), i; + + /* Clear the reserved bits */ + eax &= ~(0x3U << 14 | 0xfU << 28); + for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++) + if (eax == vmx_preemption_cpu_tfms[i]) + return true; + + return false; +} + +static inline bool cpu_has_vmx_preemption_timer(void) +{ + return vmcs_config.pin_based_exec_ctrl & + PIN_BASED_VMX_PREEMPTION_TIMER; +} + static inline bool cpu_has_vmx_posted_intr(void) { return IS_ENABLED(CONFIG_X86_LOCAL_APIC) && @@ -1603,6 +1680,11 @@ static __always_inline void vmcs_set_bits(unsigned long field, u32 mask) __vmcs_writel(field, __vmcs_readl(field) | mask); } +static inline void vm_entry_controls_reset_shadow(struct vcpu_vmx *vmx) +{ + vmx->vm_entry_controls_shadow = vmcs_read32(VM_ENTRY_CONTROLS); +} + static inline void vm_entry_controls_init(struct vcpu_vmx *vmx, u32 val) { vmcs_write32(VM_ENTRY_CONTROLS, val); @@ -1631,6 +1713,11 @@ static inline void vm_entry_controls_clearbit(struct vcpu_vmx *vmx, u32 val) vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) & ~val); } +static inline void vm_exit_controls_reset_shadow(struct vcpu_vmx *vmx) +{ + vmx->vm_exit_controls_shadow = vmcs_read32(VM_EXIT_CONTROLS); +} + static inline void vm_exit_controls_init(struct vcpu_vmx *vmx, u32 val) { vmcs_write32(VM_EXIT_CONTROLS, val); @@ -2113,6 +2200,12 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) new.control) != old.control); } +static void decache_tsc_multiplier(struct vcpu_vmx *vmx) +{ + vmx->current_tsc_ratio = vmx->vcpu.arch.tsc_scaling_ratio; + vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio); +} + /* * Switches to specified vcpu, until a matching vcpu_put(), but assumes * vcpu mutex is already taken. @@ -2121,22 +2214,14 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); + bool already_loaded = vmx->loaded_vmcs->cpu == cpu; if (!vmm_exclusive) kvm_cpu_vmxon(phys_addr); - else if (vmx->loaded_vmcs->cpu != cpu) + else if (!already_loaded) loaded_vmcs_clear(vmx->loaded_vmcs); - if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) { - per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs; - vmcs_load(vmx->loaded_vmcs->vmcs); - } - - if (vmx->loaded_vmcs->cpu != cpu) { - struct desc_ptr *gdt = this_cpu_ptr(&host_gdt); - unsigned long sysenter_esp; - - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + if (!already_loaded) { local_irq_disable(); crash_disable_local_vmclear(cpu); @@ -2151,6 +2236,18 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) &per_cpu(loaded_vmcss_on_cpu, cpu)); crash_enable_local_vmclear(cpu); local_irq_enable(); + } + + if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) { + per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs; + vmcs_load(vmx->loaded_vmcs->vmcs); + } + + if (!already_loaded) { + struct desc_ptr *gdt = this_cpu_ptr(&host_gdt); + unsigned long sysenter_esp; + + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); /* * Linux uses per-cpu TSS and GDT, so set these when switching @@ -2167,10 +2264,8 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) /* Setup TSC multiplier */ if (kvm_has_tsc_control && - vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio) { - vmx->current_tsc_ratio = vcpu->arch.tsc_scaling_ratio; - vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio); - } + vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio) + decache_tsc_multiplier(vmx); vmx_vcpu_pi_load(vcpu, cpu); vmx->host_pkru = read_pkru(); @@ -2419,7 +2514,7 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu) unsigned long *msr_bitmap; if (is_guest_mode(vcpu)) - msr_bitmap = vmx_msr_bitmap_nested; + msr_bitmap = to_vmx(vcpu)->nested.msr_bitmap; else if (cpu_has_secondary_exec_ctrls() && (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) & SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) { @@ -2707,8 +2802,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_WBINVD_EXITING | - SECONDARY_EXEC_XSAVES | - SECONDARY_EXEC_PCOMMIT; + SECONDARY_EXEC_XSAVES; if (enable_ept) { /* nested EPT: emulate EPT also to L1 */ @@ -2717,13 +2811,12 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) vmx->nested.nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT | VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT | VMX_EPT_INVEPT_BIT; + if (cpu_has_vmx_ept_execute_only()) + vmx->nested.nested_vmx_ept_caps |= + VMX_EPT_EXECUTE_ONLY_BIT; vmx->nested.nested_vmx_ept_caps &= vmx_capability.ept; - /* - * For nested guests, we don't do anything specific - * for single context invalidation. Hence, only advertise - * support for global context invalidation. - */ - vmx->nested.nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT; + vmx->nested.nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT | + VMX_EPT_EXTENT_CONTEXT_BIT; } else vmx->nested.nested_vmx_ept_caps = 0; @@ -2854,7 +2947,6 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) vmx->nested.nested_vmx_secondary_ctls_high); break; case MSR_IA32_VMX_EPT_VPID_CAP: - /* Currently, no nested vpid support */ *pdata = vmx->nested.nested_vmx_ept_caps | ((u64)vmx->nested.nested_vmx_vpid_caps << 32); break; @@ -2865,6 +2957,14 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) return 0; } +static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu, + uint64_t val) +{ + uint64_t valid_bits = to_vmx(vcpu)->msr_ia32_feature_control_valid_bits; + + return !(val & ~valid_bits); +} + /* * Reads an msr value (of 'msr_index') into 'pdata'. * Returns 0 on success, non-0 otherwise. @@ -2906,10 +3006,15 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; msr_info->data = vmcs_read64(GUEST_BNDCFGS); break; - case MSR_IA32_FEATURE_CONTROL: - if (!nested_vmx_allowed(vcpu)) + case MSR_IA32_MCG_EXT_CTL: + if (!msr_info->host_initiated && + !(to_vmx(vcpu)->msr_ia32_feature_control & + FEATURE_CONTROL_LMCE)) return 1; - msr_info->data = to_vmx(vcpu)->nested.msr_ia32_feature_control; + msr_info->data = vcpu->arch.mcg_ext_ctl; + break; + case MSR_IA32_FEATURE_CONTROL: + msr_info->data = to_vmx(vcpu)->msr_ia32_feature_control; break; case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: if (!nested_vmx_allowed(vcpu)) @@ -2999,12 +3104,20 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_TSC_ADJUST: ret = kvm_set_msr_common(vcpu, msr_info); break; + case MSR_IA32_MCG_EXT_CTL: + if ((!msr_info->host_initiated && + !(to_vmx(vcpu)->msr_ia32_feature_control & + FEATURE_CONTROL_LMCE)) || + (data & ~MCG_EXT_CTL_LMCE_EN)) + return 1; + vcpu->arch.mcg_ext_ctl = data; + break; case MSR_IA32_FEATURE_CONTROL: - if (!nested_vmx_allowed(vcpu) || - (to_vmx(vcpu)->nested.msr_ia32_feature_control & + if (!vmx_feature_control_msr_valid(vcpu, data) || + (to_vmx(vcpu)->msr_ia32_feature_control & FEATURE_CONTROL_LOCKED && !msr_info->host_initiated)) return 1; - vmx->nested.msr_ia32_feature_control = data; + vmx->msr_ia32_feature_control = data; if (msr_info->host_initiated && data == 0) vmx_leave_nested(vcpu); break; @@ -3270,7 +3383,6 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_SHADOW_VMCS | SECONDARY_EXEC_XSAVES | SECONDARY_EXEC_ENABLE_PML | - SECONDARY_EXEC_PCOMMIT | SECONDARY_EXEC_TSC_SCALING; if (adjust_vmx_controls(min2, opt2, MSR_IA32_VMX_PROCBASED_CTLS2, @@ -3299,25 +3411,27 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) vmx_capability.ept, vmx_capability.vpid); } - min = VM_EXIT_SAVE_DEBUG_CONTROLS; + min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT; #ifdef CONFIG_X86_64 min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; #endif opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT | - VM_EXIT_ACK_INTR_ON_EXIT | VM_EXIT_CLEAR_BNDCFGS; + VM_EXIT_CLEAR_BNDCFGS; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS, &_vmexit_control) < 0) return -EIO; min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; - opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR; + opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR | + PIN_BASED_VMX_PREEMPTION_TIMER; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, &_pin_based_exec_control) < 0) return -EIO; + if (cpu_has_broken_vmx_preemption_timer()) + _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; if (!(_cpu_based_2nd_exec_control & - SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) || - !(_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT)) + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)) _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR; min = VM_ENTRY_LOAD_DEBUG_CONTROLS; @@ -3366,7 +3480,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) /* * Some cpus support VM_ENTRY_(LOAD|SAVE)_IA32_PERF_GLOBAL_CTRL - * but due to arrata below it can't be used. Workaround is to use + * but due to errata below it can't be used. Workaround is to use * msr load mechanism to switch IA32_PERF_GLOBAL_CTRL. * * VM Exit May Incorrectly Clear IA32_PERF_GLOBAL_CTRL [34:32] @@ -4783,6 +4897,8 @@ static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) if (!kvm_vcpu_apicv_active(&vmx->vcpu)) pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR; + /* Enable the preemption timer dynamically */ + pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER; return pin_based_exec_ctrl; } @@ -4858,9 +4974,6 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) if (!enable_pml) exec_control &= ~SECONDARY_EXEC_ENABLE_PML; - /* Currently, we allow L1 guest to directly run pcommit instruction. */ - exec_control &= ~SECONDARY_EXEC_PCOMMIT; - return exec_control; } @@ -4901,12 +5014,14 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) /* Control */ vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx)); + vmx->hv_deadline_tsc = -1; vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx)); - if (cpu_has_secondary_exec_ctrls()) + if (cpu_has_secondary_exec_ctrls()) { vmcs_write32(SECONDARY_VM_EXEC_CONTROL, vmx_secondary_exec_control(vmx)); + } if (kvm_vcpu_apicv_active(&vmx->vcpu)) { vmcs_write64(EOI_EXIT_BITMAP0, 0); @@ -6020,12 +6135,14 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); trace_kvm_page_fault(gpa, exit_qualification); - /* It is a write fault? */ - error_code = exit_qualification & PFERR_WRITE_MASK; + /* it is a read fault? */ + error_code = (exit_qualification << 2) & PFERR_USER_MASK; + /* it is a write fault? */ + error_code |= exit_qualification & PFERR_WRITE_MASK; /* It is a fetch fault? */ error_code |= (exit_qualification << 2) & PFERR_FETCH_MASK; /* ept page table is present? */ - error_code |= (exit_qualification >> 3) & PFERR_PRESENT_MASK; + error_code |= (exit_qualification & 0x38) != 0; vcpu->arch.exit_qualification = exit_qualification; @@ -6252,13 +6369,6 @@ static __init int hardware_setup(void) if (!vmx_msr_bitmap_longmode_x2apic) goto out4; - if (nested) { - vmx_msr_bitmap_nested = - (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_nested) - goto out5; - } - vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); if (!vmx_vmread_bitmap) goto out6; @@ -6281,8 +6391,6 @@ static __init int hardware_setup(void) memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE); memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE); - if (nested) - memset(vmx_msr_bitmap_nested, 0xff, PAGE_SIZE); if (setup_vmcs_config(&vmcs_config) < 0) { r = -EIO; @@ -6359,9 +6467,6 @@ static __init int hardware_setup(void) for (msr = 0x800; msr <= 0x8ff; msr++) vmx_disable_intercept_msr_read_x2apic(msr); - /* According SDM, in x2apic mode, the whole id reg is used. But in - * KVM, it only use the highest eight bits. Need to intercept it */ - vmx_enable_intercept_msr_read_x2apic(0x802); /* TMCCT */ vmx_enable_intercept_msr_read_x2apic(0x839); /* TPR */ @@ -6372,10 +6477,12 @@ static __init int hardware_setup(void) vmx_disable_intercept_msr_write_x2apic(0x83f); if (enable_ept) { - kvm_mmu_set_mask_ptes(0ull, + kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK, (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull, (enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull, - 0ull, VMX_EPT_EXECUTABLE_MASK); + 0ull, VMX_EPT_EXECUTABLE_MASK, + cpu_has_vmx_ept_execute_only() ? + 0ull : VMX_EPT_READABLE_MASK); ept_set_mmio_spte_mask(); kvm_enable_tdp(); } else @@ -6397,8 +6504,21 @@ static __init int hardware_setup(void) kvm_x86_ops->enable_log_dirty_pt_masked = NULL; } + if (cpu_has_vmx_preemption_timer() && enable_preemption_timer) { + u64 vmx_msr; + + rdmsrl(MSR_IA32_VMX_MISC, vmx_msr); + cpu_preemption_timer_multi = + vmx_msr & VMX_MISC_PREEMPTION_TIMER_RATE_MASK; + } else { + kvm_x86_ops->set_hv_timer = NULL; + kvm_x86_ops->cancel_hv_timer = NULL; + } + kvm_set_posted_intr_wakeup_handler(wakeup_handler); + kvm_mce_cap_supported |= MCG_LMCE_P; + return alloc_kvm_area(); out8: @@ -6406,9 +6526,6 @@ out8: out7: free_page((unsigned long)vmx_vmread_bitmap); out6: - if (nested) - free_page((unsigned long)vmx_msr_bitmap_nested); -out5: free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); out4: free_page((unsigned long)vmx_msr_bitmap_longmode); @@ -6434,8 +6551,6 @@ static __exit void hardware_unsetup(void) free_page((unsigned long)vmx_io_bitmap_a); free_page((unsigned long)vmx_vmwrite_bitmap); free_page((unsigned long)vmx_vmread_bitmap); - if (nested) - free_page((unsigned long)vmx_msr_bitmap_nested); free_kvm_area(); } @@ -6866,16 +6981,27 @@ static int handle_vmon(struct kvm_vcpu *vcpu) return 1; } - if ((vmx->nested.msr_ia32_feature_control & VMXON_NEEDED_FEATURES) + if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES) != VMXON_NEEDED_FEATURES) { kvm_inject_gp(vcpu, 0); return 1; } + if (cpu_has_vmx_msr_bitmap()) { + vmx->nested.msr_bitmap = + (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx->nested.msr_bitmap) + goto out_msr_bitmap; + } + + vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL); + if (!vmx->nested.cached_vmcs12) + goto out_cached_vmcs12; + if (enable_shadow_vmcs) { shadow_vmcs = alloc_vmcs(); if (!shadow_vmcs) - return -ENOMEM; + goto out_shadow_vmcs; /* mark vmcs as shadow */ shadow_vmcs->revision_id |= (1u << 31); /* init shadow vmcs */ @@ -6895,6 +7021,15 @@ static int handle_vmon(struct kvm_vcpu *vcpu) skip_emulated_instruction(vcpu); nested_vmx_succeed(vcpu); return 1; + +out_shadow_vmcs: + kfree(vmx->nested.cached_vmcs12); + +out_cached_vmcs12: + free_page((unsigned long)vmx->nested.msr_bitmap); + +out_msr_bitmap: + return -ENOMEM; } /* @@ -6946,6 +7081,11 @@ static inline void nested_release_vmcs12(struct vcpu_vmx *vmx) vmcs_write64(VMCS_LINK_POINTER, -1ull); } vmx->nested.posted_intr_nv = -1; + + /* Flush VMCS12 to guest memory */ + memcpy(vmx->nested.current_vmcs12, vmx->nested.cached_vmcs12, + VMCS12_SIZE); + kunmap(vmx->nested.current_vmcs12_page); nested_release_page(vmx->nested.current_vmcs12_page); vmx->nested.current_vmptr = -1ull; @@ -6964,8 +7104,13 @@ static void free_nested(struct vcpu_vmx *vmx) vmx->nested.vmxon = false; free_vpid(vmx->nested.vpid02); nested_release_vmcs12(vmx); + if (vmx->nested.msr_bitmap) { + free_page((unsigned long)vmx->nested.msr_bitmap); + vmx->nested.msr_bitmap = NULL; + } if (enable_shadow_vmcs) free_vmcs(vmx->nested.current_shadow_vmcs); + kfree(vmx->nested.cached_vmcs12); /* Unpin physical memory we referred to in current vmcs02 */ if (vmx->nested.apic_access_page) { nested_release_page(vmx->nested.apic_access_page); @@ -7369,6 +7514,13 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) vmx->nested.current_vmptr = vmptr; vmx->nested.current_vmcs12 = new_vmcs12; vmx->nested.current_vmcs12_page = page; + /* + * Load VMCS12 from guest memory since it is not already + * cached. + */ + memcpy(vmx->nested.cached_vmcs12, + vmx->nested.current_vmcs12, VMCS12_SIZE); + if (enable_shadow_vmcs) { vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_SHADOW_VMCS); @@ -7462,12 +7614,16 @@ static int handle_invept(struct kvm_vcpu *vcpu) switch (type) { case VMX_EPT_EXTENT_GLOBAL: + /* + * TODO: track mappings and invalidate + * single context requests appropriately + */ + case VMX_EPT_EXTENT_CONTEXT: kvm_mmu_sync_roots(vcpu); kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); nested_vmx_succeed(vcpu); break; default: - /* Trap single context invalidation invept calls */ BUG_ON(1); break; } @@ -7564,10 +7720,9 @@ static int handle_pml_full(struct kvm_vcpu *vcpu) return 1; } -static int handle_pcommit(struct kvm_vcpu *vcpu) +static int handle_preemption_timer(struct kvm_vcpu *vcpu) { - /* we never catch pcommit instruct for L1 guest. */ - WARN_ON(1); + kvm_lapic_expired_hv_timer(vcpu); return 1; } @@ -7621,7 +7776,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_XSAVES] = handle_xsaves, [EXIT_REASON_XRSTORS] = handle_xrstors, [EXIT_REASON_PML_FULL] = handle_pml_full, - [EXIT_REASON_PCOMMIT] = handle_pcommit, + [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer, }; static const int kvm_vmx_max_exit_handlers = @@ -7930,8 +8085,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) * the XSS exit bitmap in vmcs12. */ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); - case EXIT_REASON_PCOMMIT: - return nested_cpu_has2(vmcs12, SECONDARY_EXEC_PCOMMIT); + case EXIT_REASON_PREEMPTION_TIMER: + return false; default: return true; } @@ -8274,6 +8429,12 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) { u32 sec_exec_control; + /* Postpone execution until vmcs01 is the current VMCS. */ + if (is_guest_mode(vcpu)) { + to_vmx(vcpu)->nested.change_vmcs01_virtual_x2apic_mode = true; + return; + } + /* * There is not point to enable virtualize x2apic without enable * apicv @@ -8317,7 +8478,7 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa) * the next L2->L1 exit. */ if (!is_guest_mode(vcpu) || - !nested_cpu_has2(vmx->nested.current_vmcs12, + !nested_cpu_has2(get_vmcs12(&vmx->vcpu), SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) vmcs_write64(APIC_ACCESS_ADDR, hpa); } @@ -8450,7 +8611,6 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) "push %[sp]\n\t" #endif "pushf\n\t" - "orl $0x200, (%%" _ASM_SP ")\n\t" __ASM_SIZE(push) " $%c[cs]\n\t" "call *%[entry]\n\t" : @@ -8463,8 +8623,7 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) [ss]"i"(__KERNEL_DS), [cs]"i"(__KERNEL_CS) ); - } else - local_irq_enable(); + } } static bool vmx_has_high_real_mode_segbase(void) @@ -8615,6 +8774,26 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) msrs[i].host); } +void vmx_arm_hv_timer(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + u64 tscl; + u32 delta_tsc; + + if (vmx->hv_deadline_tsc == -1) + return; + + tscl = rdtsc(); + if (vmx->hv_deadline_tsc > tscl) + /* sure to be 32 bit only because checked on set_hv_timer */ + delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >> + cpu_preemption_timer_multi); + else + delta_tsc = 0; + + vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc); +} + static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -8664,6 +8843,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) atomic_switch_perf_msrs(vmx); debugctlmsr = get_debugctlmsr(); + vmx_arm_hv_timer(vcpu); + vmx->__launched = vmx->loaded_vmcs->launched; asm( /* Store host registers */ @@ -8954,6 +9135,8 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) vmx->nested.current_vmptr = -1ull; vmx->nested.current_vmcs12 = NULL; + vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED; + return &vmx->vcpu; free_vmcs: @@ -9095,14 +9278,12 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) if (cpu_has_secondary_exec_ctrls()) vmcs_set_secondary_exec_control(secondary_exec_ctl); - if (static_cpu_has(X86_FEATURE_PCOMMIT) && nested) { - if (guest_cpuid_has_pcommit(vcpu)) - vmx->nested.nested_vmx_secondary_ctls_high |= - SECONDARY_EXEC_PCOMMIT; - else - vmx->nested.nested_vmx_secondary_ctls_high &= - ~SECONDARY_EXEC_PCOMMIT; - } + if (nested_vmx_allowed(vcpu)) + to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |= + FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; + else + to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &= + ~FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; } static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) @@ -9307,8 +9488,10 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, { int msr; struct page *page; - unsigned long *msr_bitmap; + unsigned long *msr_bitmap_l1; + unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.msr_bitmap; + /* This shortcut is ok because we support only x2APIC MSRs so far. */ if (!nested_cpu_has_virt_x2apic_mode(vmcs12)) return false; @@ -9317,63 +9500,37 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, WARN_ON(1); return false; } - msr_bitmap = (unsigned long *)kmap(page); - if (!msr_bitmap) { + msr_bitmap_l1 = (unsigned long *)kmap(page); + if (!msr_bitmap_l1) { nested_release_page_clean(page); WARN_ON(1); return false; } + memset(msr_bitmap_l0, 0xff, PAGE_SIZE); + if (nested_cpu_has_virt_x2apic_mode(vmcs12)) { if (nested_cpu_has_apic_reg_virt(vmcs12)) for (msr = 0x800; msr <= 0x8ff; msr++) nested_vmx_disable_intercept_for_msr( - msr_bitmap, - vmx_msr_bitmap_nested, + msr_bitmap_l1, msr_bitmap_l0, msr, MSR_TYPE_R); - /* TPR is allowed */ - nested_vmx_disable_intercept_for_msr(msr_bitmap, - vmx_msr_bitmap_nested, + + nested_vmx_disable_intercept_for_msr( + msr_bitmap_l1, msr_bitmap_l0, APIC_BASE_MSR + (APIC_TASKPRI >> 4), MSR_TYPE_R | MSR_TYPE_W); + if (nested_cpu_has_vid(vmcs12)) { - /* EOI and self-IPI are allowed */ nested_vmx_disable_intercept_for_msr( - msr_bitmap, - vmx_msr_bitmap_nested, + msr_bitmap_l1, msr_bitmap_l0, APIC_BASE_MSR + (APIC_EOI >> 4), MSR_TYPE_W); nested_vmx_disable_intercept_for_msr( - msr_bitmap, - vmx_msr_bitmap_nested, + msr_bitmap_l1, msr_bitmap_l0, APIC_BASE_MSR + (APIC_SELF_IPI >> 4), MSR_TYPE_W); } - } else { - /* - * Enable reading intercept of all the x2apic - * MSRs. We should not rely on vmcs12 to do any - * optimizations here, it may have been modified - * by L1. - */ - for (msr = 0x800; msr <= 0x8ff; msr++) - __vmx_enable_intercept_for_msr( - vmx_msr_bitmap_nested, - msr, - MSR_TYPE_R); - - __vmx_enable_intercept_for_msr( - vmx_msr_bitmap_nested, - APIC_BASE_MSR + (APIC_TASKPRI >> 4), - MSR_TYPE_W); - __vmx_enable_intercept_for_msr( - vmx_msr_bitmap_nested, - APIC_BASE_MSR + (APIC_EOI >> 4), - MSR_TYPE_W); - __vmx_enable_intercept_for_msr( - vmx_msr_bitmap_nested, - APIC_BASE_MSR + (APIC_SELF_IPI >> 4), - MSR_TYPE_W); } kunmap(page); nested_release_page_clean(page); @@ -9659,9 +9816,14 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs_write64(VMCS_LINK_POINTER, -1ull); exec_control = vmcs12->pin_based_vm_exec_control; - exec_control |= vmcs_config.pin_based_exec_ctrl; + + /* Preemption timer setting is only taken from vmcs01. */ exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; + exec_control |= vmcs_config.pin_based_exec_ctrl; + if (vmx->hv_deadline_tsc == -1) + exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; + /* Posted interrupts setting is only taken from vmcs12. */ if (nested_cpu_has_posted_intr(vmcs12)) { /* * Note that we use L0's vector here and in @@ -9715,8 +9877,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_RDTSCP | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | - SECONDARY_EXEC_APIC_REGISTER_VIRT | - SECONDARY_EXEC_PCOMMIT); + SECONDARY_EXEC_APIC_REGISTER_VIRT); if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) exec_control |= vmcs12->secondary_vm_exec_control; @@ -9788,10 +9949,10 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) } if (cpu_has_vmx_msr_bitmap() && - exec_control & CPU_BASED_USE_MSR_BITMAPS) { - nested_vmx_merge_msr_bitmap(vcpu, vmcs12); - /* MSR_BITMAP will be set by following vmx_set_efer. */ - } else + exec_control & CPU_BASED_USE_MSR_BITMAPS && + nested_vmx_merge_msr_bitmap(vcpu, vmcs12)) + ; /* MSR_BITMAP will be set by following vmx_set_efer. */ + else exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; /* @@ -9842,6 +10003,8 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset); else vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset); + if (kvm_has_tsc_control) + decache_tsc_multiplier(vmx); if (enable_vpid) { /* @@ -10580,8 +10743,8 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, vmcs12->vm_exit_intr_error_code, KVM_ISA_VMX); - vm_entry_controls_init(vmx, vmcs_read32(VM_ENTRY_CONTROLS)); - vm_exit_controls_init(vmx, vmcs_read32(VM_EXIT_CONTROLS)); + vm_entry_controls_reset_shadow(vmx); + vm_exit_controls_reset_shadow(vmx); vmx_segment_cache_clear(vmx); /* if no vmcs02 cache requested, remove the one we used */ @@ -10590,8 +10753,22 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, load_vmcs12_host_state(vcpu, vmcs12); - /* Update TSC_OFFSET if TSC was changed while L2 ran */ + /* Update any VMCS fields that might have changed while L2 ran */ vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset); + if (vmx->hv_deadline_tsc == -1) + vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL, + PIN_BASED_VMX_PREEMPTION_TIMER); + else + vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL, + PIN_BASED_VMX_PREEMPTION_TIMER); + if (kvm_has_tsc_control) + decache_tsc_multiplier(vmx); + + if (vmx->nested.change_vmcs01_virtual_x2apic_mode) { + vmx->nested.change_vmcs01_virtual_x2apic_mode = false; + vmx_set_virtual_x2apic_mode(vcpu, + vcpu->arch.apic_base & X2APIC_ENABLE); + } /* This is needed for same reason as it was needed in prepare_vmcs02 */ vmx->host_rsp = 0; @@ -10671,6 +10848,64 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu, return X86EMUL_CONTINUE; } +#ifdef CONFIG_X86_64 +/* (a << shift) / divisor, return 1 if overflow otherwise 0 */ +static inline int u64_shl_div_u64(u64 a, unsigned int shift, + u64 divisor, u64 *result) +{ + u64 low = a << shift, high = a >> (64 - shift); + + /* To avoid the overflow on divq */ + if (high >= divisor) + return 1; + + /* Low hold the result, high hold rem which is discarded */ + asm("divq %2\n\t" : "=a" (low), "=d" (high) : + "rm" (divisor), "0" (low), "1" (high)); + *result = low; + + return 0; +} + +static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + u64 tscl = rdtsc(); + u64 guest_tscl = kvm_read_l1_tsc(vcpu, tscl); + u64 delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl; + + /* Convert to host delta tsc if tsc scaling is enabled */ + if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio && + u64_shl_div_u64(delta_tsc, + kvm_tsc_scaling_ratio_frac_bits, + vcpu->arch.tsc_scaling_ratio, + &delta_tsc)) + return -ERANGE; + + /* + * If the delta tsc can't fit in the 32 bit after the multi shift, + * we can't use the preemption timer. + * It's possible that it fits on later vmentries, but checking + * on every vmentry is costly so we just use an hrtimer. + */ + if (delta_tsc >> (cpu_preemption_timer_multi + 32)) + return -ERANGE; + + vmx->hv_deadline_tsc = tscl + delta_tsc; + vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL, + PIN_BASED_VMX_PREEMPTION_TIMER); + return 0; +} + +static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + vmx->hv_deadline_tsc = -1; + vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL, + PIN_BASED_VMX_PREEMPTION_TIMER); +} +#endif + static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu) { if (ple_gap) @@ -10715,7 +10950,7 @@ static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm, * this case, return 1, otherwise, return 0. * */ -static int vmx_pre_block(struct kvm_vcpu *vcpu) +static int pi_pre_block(struct kvm_vcpu *vcpu) { unsigned long flags; unsigned int dest; @@ -10782,7 +11017,18 @@ static int vmx_pre_block(struct kvm_vcpu *vcpu) return 0; } -static void vmx_post_block(struct kvm_vcpu *vcpu) +static int vmx_pre_block(struct kvm_vcpu *vcpu) +{ + if (pi_pre_block(vcpu)) + return 1; + + if (kvm_lapic_hv_timer_in_use(vcpu)) + kvm_lapic_switch_to_sw_timer(vcpu); + + return 0; +} + +static void pi_post_block(struct kvm_vcpu *vcpu) { struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); struct pi_desc old, new; @@ -10824,6 +11070,14 @@ static void vmx_post_block(struct kvm_vcpu *vcpu) } } +static void vmx_post_block(struct kvm_vcpu *vcpu) +{ + if (kvm_x86_ops->set_hv_timer) + kvm_lapic_switch_to_hv_timer(vcpu); + + pi_post_block(vcpu); +} + /* * vmx_update_pi_irte - set IRTE for Posted-Interrupts * @@ -10868,7 +11122,7 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, * We will support full lowest-priority interrupt later. */ - kvm_set_msi_irq(e, &irq); + kvm_set_msi_irq(kvm, e, &irq); if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) { /* * Make sure the IRTE is in remapped mode if @@ -10913,6 +11167,16 @@ out: return ret; } +static void vmx_setup_mce(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.mcg_cap & MCG_LMCE_P) + to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |= + FEATURE_CONTROL_LMCE; + else + to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &= + ~FEATURE_CONTROL_LMCE; +} + static struct kvm_x86_ops vmx_x86_ops = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, @@ -11037,6 +11301,13 @@ static struct kvm_x86_ops vmx_x86_ops = { .pmu_ops = &intel_pmu_ops, .update_pi_irte = vmx_update_pi_irte, + +#ifdef CONFIG_X86_64 + .set_hv_timer = vmx_set_hv_timer, + .cancel_hv_timer = vmx_cancel_hv_timer, +#endif + + .setup_mce = vmx_setup_mce, }; static int __init vmx_init(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b2766723c951..699f8726539a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -36,7 +36,8 @@ #include <linux/kvm.h> #include <linux/fs.h> #include <linux/vmalloc.h> -#include <linux/module.h> +#include <linux/export.h> +#include <linux/moduleparam.h> #include <linux/mman.h> #include <linux/highmem.h> #include <linux/iommu.h> @@ -70,7 +71,8 @@ #define MAX_IO_MSRS 256 #define KVM_MAX_MCE_BANKS 32 -#define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P) +u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P; +EXPORT_SYMBOL_GPL(kvm_mce_cap_supported); #define emul_to_vcpu(ctxt) \ container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt) @@ -89,8 +91,12 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU +#define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \ + KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) + static void update_cr8_intercept(struct kvm_vcpu *vcpu); static void process_nmi(struct kvm_vcpu *vcpu); +static void enter_smm(struct kvm_vcpu *vcpu); static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); struct kvm_x86_ops *kvm_x86_ops __read_mostly; @@ -113,7 +119,8 @@ u8 __read_mostly kvm_tsc_scaling_ratio_frac_bits; EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits); u64 __read_mostly kvm_max_tsc_scaling_ratio; EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio); -static u64 __read_mostly kvm_default_tsc_scaling_ratio; +u64 __read_mostly kvm_default_tsc_scaling_ratio; +EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio); /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */ static u32 __read_mostly tsc_tolerance_ppm = 250; @@ -537,7 +544,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3) goto out; } for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { - if (is_present_gpte(pdpte[i]) && + if ((pdpte[i] & PT_PRESENT_MASK) && (pdpte[i] & vcpu->arch.mmu.guest_rsvd_check.rsvd_bits_mask[0][2])) { ret = 0; @@ -982,6 +989,7 @@ static u32 emulated_msrs[] = { MSR_IA32_MISC_ENABLE, MSR_IA32_MCG_STATUS, MSR_IA32_MCG_CTL, + MSR_IA32_MCG_EXT_CTL, MSR_IA32_SMBASE, }; @@ -1161,7 +1169,7 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) int version; int r; struct pvclock_wall_clock wc; - struct timespec boot; + struct timespec64 boot; if (!wall_clock) return; @@ -1184,13 +1192,13 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) * wall clock specified here. guest system time equals host * system time for us, thus we must fill in host boot time here. */ - getboottime(&boot); + getboottime64(&boot); if (kvm->arch.kvmclock_offset) { - struct timespec ts = ns_to_timespec(kvm->arch.kvmclock_offset); - boot = timespec_sub(boot, ts); + struct timespec64 ts = ns_to_timespec64(kvm->arch.kvmclock_offset); + boot = timespec64_sub(boot, ts); } - wc.sec = boot.tv_sec; + wc.sec = (u32)boot.tv_sec; /* overflow in 2106 guest time */ wc.nsec = boot.tv_nsec; wc.version = version; @@ -2615,6 +2623,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_TSC_CONTROL: r = kvm_has_tsc_control; break; + case KVM_CAP_X2APIC_API: + r = KVM_X2APIC_API_VALID_FLAGS; + break; default: r = 0; break; @@ -2677,11 +2688,9 @@ long kvm_arch_dev_ioctl(struct file *filp, break; } case KVM_X86_GET_MCE_CAP_SUPPORTED: { - u64 mce_cap; - - mce_cap = KVM_MCE_CAP_SUPPORTED; r = -EFAULT; - if (copy_to_user(argp, &mce_cap, sizeof mce_cap)) + if (copy_to_user(argp, &kvm_mce_cap_supported, + sizeof(kvm_mce_cap_supported))) goto out; r = 0; break; @@ -2733,12 +2742,17 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) rdtsc() - vcpu->arch.last_host_tsc; if (tsc_delta < 0) mark_tsc_unstable("KVM discovered backwards TSC"); + if (check_tsc_unstable()) { u64 offset = kvm_compute_tsc_offset(vcpu, vcpu->arch.last_guest_tsc); kvm_x86_ops->write_tsc_offset(vcpu, offset); vcpu->arch.tsc_catchup = 1; } + if (kvm_lapic_hv_timer_in_use(vcpu) && + kvm_x86_ops->set_hv_timer(vcpu, + kvm_get_lapic_tscdeadline_msr(vcpu))) + kvm_lapic_switch_to_sw_timer(vcpu); /* * On a host with synchronized TSC, there is no need to update * kvmclock on vcpu->cpu migration @@ -2766,15 +2780,17 @@ static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, if (vcpu->arch.apicv_active) kvm_x86_ops->sync_pir_to_irr(vcpu); - memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s); - - return 0; + return kvm_apic_get_state(vcpu, s); } static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { - kvm_apic_post_state_restore(vcpu, s); + int r; + + r = kvm_apic_set_state(vcpu, s); + if (r) + return r; update_cr8_intercept(vcpu); return 0; @@ -2859,7 +2875,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, r = -EINVAL; if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS) goto out; - if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000)) + if (mcg_cap & ~(kvm_mce_cap_supported | 0xff | 0xff0000)) goto out; r = 0; vcpu->arch.mcg_cap = mcg_cap; @@ -2869,6 +2885,9 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, /* Init IA32_MCi_CTL to all 1s */ for (bank = 0; bank < bank_num; bank++) vcpu->arch.mce_banks[bank*4] = ~(u64)0; + + if (kvm_x86_ops->setup_mce) + kvm_x86_ops->setup_mce(vcpu); out: return r; } @@ -3767,7 +3786,7 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, r = -EEXIST; if (irqchip_in_kernel(kvm)) goto split_irqchip_unlock; - if (atomic_read(&kvm->online_vcpus)) + if (kvm->created_vcpus) goto split_irqchip_unlock; r = kvm_setup_empty_irq_routing(kvm); if (r) @@ -3781,6 +3800,18 @@ split_irqchip_unlock: mutex_unlock(&kvm->lock); break; } + case KVM_CAP_X2APIC_API: + r = -EINVAL; + if (cap->args[0] & ~KVM_X2APIC_API_VALID_FLAGS) + break; + + if (cap->args[0] & KVM_X2APIC_API_USE_32BIT_IDS) + kvm->arch.x2apic_format = true; + if (cap->args[0] & KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) + kvm->arch.x2apic_broadcast_quirk_disabled = true; + + r = 0; + break; default: r = -EINVAL; break; @@ -3832,7 +3863,7 @@ long kvm_arch_vm_ioctl(struct file *filp, if (kvm->arch.vpic) goto create_irqchip_unlock; r = -EINVAL; - if (atomic_read(&kvm->online_vcpus)) + if (kvm->created_vcpus) goto create_irqchip_unlock; r = -ENOMEM; vpic = kvm_create_pic(kvm); @@ -3872,7 +3903,7 @@ long kvm_arch_vm_ioctl(struct file *filp, sizeof(struct kvm_pit_config))) goto out; create_pit: - mutex_lock(&kvm->slots_lock); + mutex_lock(&kvm->lock); r = -EEXIST; if (kvm->arch.vpit) goto create_pit_unlock; @@ -3881,7 +3912,7 @@ long kvm_arch_vm_ioctl(struct file *filp, if (kvm->arch.vpit) r = 0; create_pit_unlock: - mutex_unlock(&kvm->slots_lock); + mutex_unlock(&kvm->lock); break; case KVM_GET_IRQCHIP: { /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ @@ -3988,7 +4019,7 @@ long kvm_arch_vm_ioctl(struct file *filp, case KVM_SET_BOOT_CPU_ID: r = 0; mutex_lock(&kvm->lock); - if (atomic_read(&kvm->online_vcpus) != 0) + if (kvm->created_vcpus) r = -EBUSY; else kvm->arch.bsp_vcpu_id = arg; @@ -5296,13 +5327,8 @@ static void kvm_smm_changed(struct kvm_vcpu *vcpu) /* This is a good place to trace that we are exiting SMM. */ trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, false); - if (unlikely(vcpu->arch.smi_pending)) { - kvm_make_request(KVM_REQ_SMI, vcpu); - vcpu->arch.smi_pending = 0; - } else { - /* Process a latched INIT, if any. */ - kvm_make_request(KVM_REQ_EVENT, vcpu); - } + /* Process a latched INIT or SMI, if any. */ + kvm_make_request(KVM_REQ_EVENT, vcpu); } kvm_mmu_reset_context(vcpu); @@ -5552,9 +5578,10 @@ int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port) } EXPORT_SYMBOL_GPL(kvm_fast_pio_out); -static void tsc_bad(void *info) +static int kvmclock_cpu_down_prep(unsigned int cpu) { __this_cpu_write(cpu_tsc_khz, 0); + return 0; } static void tsc_khz_changed(void *data) @@ -5659,35 +5686,18 @@ static struct notifier_block kvmclock_cpufreq_notifier_block = { .notifier_call = kvmclock_cpufreq_notifier }; -static int kvmclock_cpu_notifier(struct notifier_block *nfb, - unsigned long action, void *hcpu) +static int kvmclock_cpu_online(unsigned int cpu) { - unsigned int cpu = (unsigned long)hcpu; - - switch (action) { - case CPU_ONLINE: - case CPU_DOWN_FAILED: - smp_call_function_single(cpu, tsc_khz_changed, NULL, 1); - break; - case CPU_DOWN_PREPARE: - smp_call_function_single(cpu, tsc_bad, NULL, 1); - break; - } - return NOTIFY_OK; + tsc_khz_changed(NULL); + return 0; } -static struct notifier_block kvmclock_cpu_notifier_block = { - .notifier_call = kvmclock_cpu_notifier, - .priority = -INT_MAX -}; - static void kvm_timer_init(void) { int cpu; max_tsc_khz = tsc_khz; - cpu_notifier_register_begin(); if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { #ifdef CONFIG_CPU_FREQ struct cpufreq_policy policy; @@ -5702,12 +5712,9 @@ static void kvm_timer_init(void) CPUFREQ_TRANSITION_NOTIFIER); } pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz); - for_each_online_cpu(cpu) - smp_call_function_single(cpu, tsc_khz_changed, NULL, 1); - - __register_hotcpu_notifier(&kvmclock_cpu_notifier_block); - cpu_notifier_register_done(); + cpuhp_setup_state(CPUHP_AP_X86_KVM_CLK_ONLINE, "AP_X86_KVM_CLK_ONLINE", + kvmclock_cpu_online, kvmclock_cpu_down_prep); } static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu); @@ -5867,8 +5874,8 @@ int kvm_arch_init(void *opaque) kvm_x86_ops = ops; kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, - PT_DIRTY_MASK, PT64_NX_MASK, 0); - + PT_DIRTY_MASK, PT64_NX_MASK, 0, + PT_PRESENT_MASK); kvm_timer_init(); perf_register_guest_info_callbacks(&kvm_guest_cbs); @@ -5896,7 +5903,7 @@ void kvm_arch_exit(void) if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); - unregister_hotcpu_notifier(&kvmclock_cpu_notifier_block); + cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE); #ifdef CONFIG_X86_64 pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier); #endif @@ -6102,7 +6109,10 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win) } /* try to inject new event if pending */ - if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) { + if (vcpu->arch.smi_pending && !is_smm(vcpu)) { + vcpu->arch.smi_pending = false; + enter_smm(vcpu); + } else if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) { --vcpu->arch.nmi_pending; vcpu->arch.nmi_injected = true; kvm_x86_ops->set_nmi(vcpu); @@ -6125,6 +6135,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win) kvm_x86_ops->set_irq(vcpu); } } + return 0; } @@ -6148,7 +6159,7 @@ static void process_nmi(struct kvm_vcpu *vcpu) #define put_smstate(type, buf, offset, val) \ *(type *)((buf) + (offset) - 0x7e00) = val -static u32 process_smi_get_segment_flags(struct kvm_segment *seg) +static u32 enter_smm_get_segment_flags(struct kvm_segment *seg) { u32 flags = 0; flags |= seg->g << 23; @@ -6162,7 +6173,7 @@ static u32 process_smi_get_segment_flags(struct kvm_segment *seg) return flags; } -static void process_smi_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n) +static void enter_smm_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n) { struct kvm_segment seg; int offset; @@ -6177,11 +6188,11 @@ static void process_smi_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n) put_smstate(u32, buf, offset + 8, seg.base); put_smstate(u32, buf, offset + 4, seg.limit); - put_smstate(u32, buf, offset, process_smi_get_segment_flags(&seg)); + put_smstate(u32, buf, offset, enter_smm_get_segment_flags(&seg)); } #ifdef CONFIG_X86_64 -static void process_smi_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n) +static void enter_smm_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n) { struct kvm_segment seg; int offset; @@ -6190,7 +6201,7 @@ static void process_smi_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n) kvm_get_segment(vcpu, &seg, n); offset = 0x7e00 + n * 16; - flags = process_smi_get_segment_flags(&seg) >> 8; + flags = enter_smm_get_segment_flags(&seg) >> 8; put_smstate(u16, buf, offset, seg.selector); put_smstate(u16, buf, offset + 2, flags); put_smstate(u32, buf, offset + 4, seg.limit); @@ -6198,7 +6209,7 @@ static void process_smi_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n) } #endif -static void process_smi_save_state_32(struct kvm_vcpu *vcpu, char *buf) +static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, char *buf) { struct desc_ptr dt; struct kvm_segment seg; @@ -6222,13 +6233,13 @@ static void process_smi_save_state_32(struct kvm_vcpu *vcpu, char *buf) put_smstate(u32, buf, 0x7fc4, seg.selector); put_smstate(u32, buf, 0x7f64, seg.base); put_smstate(u32, buf, 0x7f60, seg.limit); - put_smstate(u32, buf, 0x7f5c, process_smi_get_segment_flags(&seg)); + put_smstate(u32, buf, 0x7f5c, enter_smm_get_segment_flags(&seg)); kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR); put_smstate(u32, buf, 0x7fc0, seg.selector); put_smstate(u32, buf, 0x7f80, seg.base); put_smstate(u32, buf, 0x7f7c, seg.limit); - put_smstate(u32, buf, 0x7f78, process_smi_get_segment_flags(&seg)); + put_smstate(u32, buf, 0x7f78, enter_smm_get_segment_flags(&seg)); kvm_x86_ops->get_gdt(vcpu, &dt); put_smstate(u32, buf, 0x7f74, dt.address); @@ -6239,7 +6250,7 @@ static void process_smi_save_state_32(struct kvm_vcpu *vcpu, char *buf) put_smstate(u32, buf, 0x7f54, dt.size); for (i = 0; i < 6; i++) - process_smi_save_seg_32(vcpu, buf, i); + enter_smm_save_seg_32(vcpu, buf, i); put_smstate(u32, buf, 0x7f14, kvm_read_cr4(vcpu)); @@ -6248,7 +6259,7 @@ static void process_smi_save_state_32(struct kvm_vcpu *vcpu, char *buf) put_smstate(u32, buf, 0x7ef8, vcpu->arch.smbase); } -static void process_smi_save_state_64(struct kvm_vcpu *vcpu, char *buf) +static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf) { #ifdef CONFIG_X86_64 struct desc_ptr dt; @@ -6280,7 +6291,7 @@ static void process_smi_save_state_64(struct kvm_vcpu *vcpu, char *buf) kvm_get_segment(vcpu, &seg, VCPU_SREG_TR); put_smstate(u16, buf, 0x7e90, seg.selector); - put_smstate(u16, buf, 0x7e92, process_smi_get_segment_flags(&seg) >> 8); + put_smstate(u16, buf, 0x7e92, enter_smm_get_segment_flags(&seg) >> 8); put_smstate(u32, buf, 0x7e94, seg.limit); put_smstate(u64, buf, 0x7e98, seg.base); @@ -6290,7 +6301,7 @@ static void process_smi_save_state_64(struct kvm_vcpu *vcpu, char *buf) kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR); put_smstate(u16, buf, 0x7e70, seg.selector); - put_smstate(u16, buf, 0x7e72, process_smi_get_segment_flags(&seg) >> 8); + put_smstate(u16, buf, 0x7e72, enter_smm_get_segment_flags(&seg) >> 8); put_smstate(u32, buf, 0x7e74, seg.limit); put_smstate(u64, buf, 0x7e78, seg.base); @@ -6299,31 +6310,26 @@ static void process_smi_save_state_64(struct kvm_vcpu *vcpu, char *buf) put_smstate(u64, buf, 0x7e68, dt.address); for (i = 0; i < 6; i++) - process_smi_save_seg_64(vcpu, buf, i); + enter_smm_save_seg_64(vcpu, buf, i); #else WARN_ON_ONCE(1); #endif } -static void process_smi(struct kvm_vcpu *vcpu) +static void enter_smm(struct kvm_vcpu *vcpu) { struct kvm_segment cs, ds; struct desc_ptr dt; char buf[512]; u32 cr0; - if (is_smm(vcpu)) { - vcpu->arch.smi_pending = true; - return; - } - trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true); vcpu->arch.hflags |= HF_SMM_MASK; memset(buf, 0, 512); if (guest_cpuid_has_longmode(vcpu)) - process_smi_save_state_64(vcpu, buf); + enter_smm_save_state_64(vcpu, buf); else - process_smi_save_state_32(vcpu, buf); + enter_smm_save_state_32(vcpu, buf); kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf)); @@ -6379,6 +6385,12 @@ static void process_smi(struct kvm_vcpu *vcpu) kvm_mmu_reset_context(vcpu); } +static void process_smi(struct kvm_vcpu *vcpu) +{ + vcpu->arch.smi_pending = true; + kvm_make_request(KVM_REQ_EVENT, vcpu); +} + void kvm_make_scan_ioapic_request(struct kvm *kvm) { kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC); @@ -6573,8 +6585,18 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (inject_pending_event(vcpu, req_int_win) != 0) req_immediate_exit = true; - /* enable NMI/IRQ window open exits if needed */ else { + /* Enable NMI/IRQ window open exits if needed. + * + * SMIs have two cases: 1) they can be nested, and + * then there is nothing to do here because RSM will + * cause a vmexit anyway; 2) or the SMI can be pending + * because inject_pending_event has completed the + * injection of an IRQ or NMI from the previous vmexit, + * and then we request an immediate exit to inject the SMI. + */ + if (vcpu->arch.smi_pending && !is_smm(vcpu)) + req_immediate_exit = true; if (vcpu->arch.nmi_pending) kvm_x86_ops->enable_nmi_window(vcpu); if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win) @@ -6625,12 +6647,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_load_guest_xcr0(vcpu); - if (req_immediate_exit) + if (req_immediate_exit) { + kvm_make_request(KVM_REQ_EVENT, vcpu); smp_send_reschedule(vcpu->cpu); + } trace_kvm_entry(vcpu->vcpu_id); wait_lapic_expire(vcpu); - __kvm_guest_enter(); + guest_enter_irqoff(); if (unlikely(vcpu->arch.switch_db_regs)) { set_debugreg(0, 7); @@ -6681,16 +6705,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) ++vcpu->stat.exits; - /* - * We must have an instruction between local_irq_enable() and - * kvm_guest_exit(), so the timer interrupt isn't delayed by - * the interrupt shadow. The stat.exits increment will do nicely. - * But we need to prevent reordering, hence this barrier(): - */ - barrier(); - - kvm_guest_exit(); + guest_exit_irqoff(); + local_irq_enable(); preempt_enable(); vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); @@ -7427,6 +7444,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { vcpu->arch.hflags = 0; + vcpu->arch.smi_pending = 0; atomic_set(&vcpu->arch.nmi_queued, 0); vcpu->arch.nmi_pending = 0; vcpu->arch.nmi_injected = false; @@ -7619,11 +7637,6 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu) return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0; } -bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) -{ - return irqchip_in_kernel(vcpu->kvm) == lapic_in_kernel(vcpu); -} - struct static_key kvm_no_apic_vcpu __read_mostly; EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu); @@ -7890,7 +7903,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kfree(kvm->arch.vpic); kfree(kvm->arch.vioapic); kvm_free_vcpus(kvm); - kfree(rcu_dereference_check(kvm->arch.apic_map, 1)); + kvfree(rcu_dereference_check(kvm->arch.apic_map, 1)); kvm_mmu_uninit_vm(kvm); } @@ -8398,7 +8411,7 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, /* * When producer of consumer is unregistered, we change back to * remapped mode, so we can re-use the current implementation - * when the irq is masked/disabed or the consumer side (KVM + * when the irq is masked/disabled or the consumer side (KVM * int this case doesn't want to receive the interrupts. */ ret = kvm_x86_ops->update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 0); diff --git a/arch/x86/lib/cache-smp.c b/arch/x86/lib/cache-smp.c index a3c668875038..216a629a4a1a 100644 --- a/arch/x86/lib/cache-smp.c +++ b/arch/x86/lib/cache-smp.c @@ -1,5 +1,5 @@ #include <linux/smp.h> -#include <linux/module.h> +#include <linux/export.h> static void __wbinvd(void *dummy) { diff --git a/arch/x86/lib/cpu.c b/arch/x86/lib/cpu.c index aa417a97511c..d6f848d1211d 100644 --- a/arch/x86/lib/cpu.c +++ b/arch/x86/lib/cpu.c @@ -1,4 +1,5 @@ -#include <linux/module.h> +#include <linux/types.h> +#include <linux/export.h> unsigned int x86_family(unsigned int sig) { diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c index 9845371c5c36..9a7fe6a70491 100644 --- a/arch/x86/lib/csum-partial_64.c +++ b/arch/x86/lib/csum-partial_64.c @@ -6,7 +6,7 @@ */ #include <linux/compiler.h> -#include <linux/module.h> +#include <linux/export.h> #include <asm/checksum.h> static inline unsigned short from32to16(unsigned a) diff --git a/arch/x86/lib/csum-wrappers_64.c b/arch/x86/lib/csum-wrappers_64.c index b6fcb9a9ddbc..8bd53589ecfb 100644 --- a/arch/x86/lib/csum-wrappers_64.c +++ b/arch/x86/lib/csum-wrappers_64.c @@ -5,7 +5,7 @@ * Wrappers of assembly checksum functions for x86-64. */ #include <asm/checksum.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/uaccess.h> #include <asm/smap.h> diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index 2f07c291dcc8..073d1f1a620b 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -11,7 +11,7 @@ * we have to worry about. */ -#include <linux/module.h> +#include <linux/export.h> #include <linux/sched.h> #include <linux/timex.h> #include <linux/preempt.h> diff --git a/arch/x86/lib/hweight.S b/arch/x86/lib/hweight.S index 02de3d74d2c5..8a602a1e404a 100644 --- a/arch/x86/lib/hweight.S +++ b/arch/x86/lib/hweight.S @@ -35,6 +35,7 @@ ENDPROC(__sw_hweight32) ENTRY(__sw_hweight64) #ifdef CONFIG_X86_64 + pushq %rdi pushq %rdx movq %rdi, %rdx # w -> t @@ -60,6 +61,7 @@ ENTRY(__sw_hweight64) shrq $56, %rax # w = w_tmp >> 56 popq %rdx + popq %rdi ret #else /* CONFIG_X86_32 */ /* We're getting an u64 arg in (%eax,%edx): unsigned long hweight64(__u64 w) */ diff --git a/arch/x86/lib/kaslr.c b/arch/x86/lib/kaslr.c index f7dfeda83e5c..121f59c6ee54 100644 --- a/arch/x86/lib/kaslr.c +++ b/arch/x86/lib/kaslr.c @@ -19,7 +19,7 @@ #include <asm/cpufeature.h> #include <asm/setup.h> -#define debug_putstr(v) early_printk(v) +#define debug_putstr(v) early_printk("%s", v) #define has_cpuflag(f) boot_cpu_has(f) #define get_boot_seed() kaslr_offset() #endif diff --git a/arch/x86/lib/memcpy_32.c b/arch/x86/lib/memcpy_32.c index a404b4b75533..cad12634d6bd 100644 --- a/arch/x86/lib/memcpy_32.c +++ b/arch/x86/lib/memcpy_32.c @@ -1,5 +1,5 @@ #include <linux/string.h> -#include <linux/module.h> +#include <linux/export.h> #undef memcpy #undef memset diff --git a/arch/x86/lib/mmx_32.c b/arch/x86/lib/mmx_32.c index e5e3ed8dc079..c2311a678332 100644 --- a/arch/x86/lib/mmx_32.c +++ b/arch/x86/lib/mmx_32.c @@ -18,7 +18,7 @@ */ #include <linux/hardirq.h> #include <linux/string.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/sched.h> #include <linux/types.h> diff --git a/arch/x86/lib/msr-reg-export.c b/arch/x86/lib/msr-reg-export.c index 8d6ef78b5d01..ff29e8d39414 100644 --- a/arch/x86/lib/msr-reg-export.c +++ b/arch/x86/lib/msr-reg-export.c @@ -1,4 +1,4 @@ -#include <linux/module.h> +#include <linux/export.h> #include <asm/msr.h> EXPORT_SYMBOL(rdmsr_safe_regs); diff --git a/arch/x86/lib/msr-smp.c b/arch/x86/lib/msr-smp.c index 518532e6a3fa..ce68b6a9d7d1 100644 --- a/arch/x86/lib/msr-smp.c +++ b/arch/x86/lib/msr-smp.c @@ -1,4 +1,4 @@ -#include <linux/module.h> +#include <linux/export.h> #include <linux/preempt.h> #include <linux/smp.h> #include <asm/msr.h> diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c index 004c861b1648..d1dee753b949 100644 --- a/arch/x86/lib/msr.c +++ b/arch/x86/lib/msr.c @@ -1,4 +1,5 @@ -#include <linux/module.h> +#include <linux/export.h> +#include <linux/percpu.h> #include <linux/preempt.h> #include <asm/msr.h> #define CREATE_TRACE_POINTS diff --git a/arch/x86/lib/string_32.c b/arch/x86/lib/string_32.c index bd59090825db..dc0ad12f80bb 100644 --- a/arch/x86/lib/string_32.c +++ b/arch/x86/lib/string_32.c @@ -11,7 +11,7 @@ */ #include <linux/string.h> -#include <linux/module.h> +#include <linux/export.h> #ifdef __HAVE_ARCH_STRCPY char *strcpy(char *dest, const char *src) diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c index e342586db6e4..b4908789484e 100644 --- a/arch/x86/lib/usercopy.c +++ b/arch/x86/lib/usercopy.c @@ -5,7 +5,7 @@ */ #include <linux/highmem.h> -#include <linux/module.h> +#include <linux/export.h> #include <asm/word-at-a-time.h> #include <linux/sched.h> diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index b559d9238781..3bc7baf2a711 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -8,7 +8,7 @@ #include <linux/mm.h> #include <linux/highmem.h> #include <linux/blkdev.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/backing-dev.h> #include <linux/interrupt.h> #include <asm/uaccess.h> diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index 9f760cdcaf40..69873589c0ba 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -5,7 +5,7 @@ * Copyright 1997 Linus Torvalds * Copyright 2002 Andi Kleen <ak@suse.de> */ -#include <linux/module.h> +#include <linux/export.h> #include <linux/uaccess.h> /* diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index ec378cd7b71e..767be7c76034 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt @@ -1012,7 +1012,7 @@ GrpTable: Grp15 4: XSAVE 5: XRSTOR | lfence (11B) 6: XSAVEOPT | clwb (66) | mfence (11B) -7: clflush | clflushopt (66) | sfence (11B) | pcommit (66),(11B) +7: clflush | clflushopt (66) | sfence (11B) EndTable GrpTable: Grp16 diff --git a/arch/x86/mm/amdtopology.c b/arch/x86/mm/amdtopology.c index 2ca15b59fb3f..ba47524f56e8 100644 --- a/arch/x86/mm/amdtopology.c +++ b/arch/x86/mm/amdtopology.c @@ -9,7 +9,6 @@ #include <linux/kernel.h> #include <linux/init.h> #include <linux/string.h> -#include <linux/module.h> #include <linux/nodemask.h> #include <linux/memblock.h> #include <linux/bootmem.h> diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 9a17250bcbe0..ea9c49adaa1f 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -14,7 +14,7 @@ #include <linux/debugfs.h> #include <linux/mm.h> -#include <linux/module.h> +#include <linux/init.h> #include <linux/seq_file.h> #include <asm/pgtable.h> @@ -454,8 +454,4 @@ static int __init pt_dump_init(void) return 0; } - __initcall(pt_dump_init); -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>"); -MODULE_DESCRIPTION("Kernel debugging helper that dumps pagetables"); diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index a6d739258137..6d18b70ed5a9 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -1,5 +1,5 @@ #include <linux/highmem.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/swap.h> /* for totalram_pages */ #include <linux/bootmem.h> diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c index ec21796ac5fd..4473cb4f8b90 100644 --- a/arch/x86/mm/ident_map.c +++ b/arch/x86/mm/ident_map.c @@ -3,15 +3,17 @@ * included by both the compressed kernel and the regular kernel. */ -static void ident_pmd_init(unsigned long pmd_flag, pmd_t *pmd_page, +static void ident_pmd_init(struct x86_mapping_info *info, pmd_t *pmd_page, unsigned long addr, unsigned long end) { addr &= PMD_MASK; for (; addr < end; addr += PMD_SIZE) { pmd_t *pmd = pmd_page + pmd_index(addr); - if (!pmd_present(*pmd)) - set_pmd(pmd, __pmd(addr | pmd_flag)); + if (pmd_present(*pmd)) + continue; + + set_pmd(pmd, __pmd((addr - info->offset) | info->pmd_flag)); } } @@ -30,13 +32,13 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page, if (pud_present(*pud)) { pmd = pmd_offset(pud, 0); - ident_pmd_init(info->pmd_flag, pmd, addr, next); + ident_pmd_init(info, pmd, addr, next); continue; } pmd = (pmd_t *)info->alloc_pgt_page(info->context); if (!pmd) return -ENOMEM; - ident_pmd_init(info->pmd_flag, pmd, addr, next); + ident_pmd_init(info, pmd, addr, next); set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); } @@ -44,14 +46,15 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page, } int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, - unsigned long addr, unsigned long end) + unsigned long pstart, unsigned long pend) { + unsigned long addr = pstart + info->offset; + unsigned long end = pend + info->offset; unsigned long next; int result; - int off = info->kernel_mapping ? pgd_index(__PAGE_OFFSET) : 0; for (; addr < end; addr = next) { - pgd_t *pgd = pgd_page + pgd_index(addr) + off; + pgd_t *pgd = pgd_page + pgd_index(addr); pud_t *pud; next = (addr & PGDIR_MASK) + PGDIR_SIZE; diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index cc82830bc8c4..d28a2d741f9e 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -122,8 +122,18 @@ __ref void *alloc_low_pages(unsigned int num) return __va(pfn << PAGE_SHIFT); } -/* need 3 4k for initial PMD_SIZE, 3 4k for 0-ISA_END_ADDRESS */ -#define INIT_PGT_BUF_SIZE (6 * PAGE_SIZE) +/* + * By default need 3 4k for initial PMD_SIZE, 3 4k for 0-ISA_END_ADDRESS. + * With KASLR memory randomization, depending on the machine e820 memory + * and the PUD alignment. We may need twice more pages when KASLR memory + * randomization is enabled. + */ +#ifndef CONFIG_RANDOMIZE_MEMORY +#define INIT_PGD_PAGE_COUNT 6 +#else +#define INIT_PGD_PAGE_COUNT 12 +#endif +#define INIT_PGT_BUF_SIZE (INIT_PGD_PAGE_COUNT * PAGE_SIZE) RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE); void __init early_alloc_pgt_buf(void) { @@ -208,7 +218,7 @@ static int __meminit save_mr(struct map_range *mr, int nr_range, * adjust the page_size_mask for small range to go with * big page size instead small one if nearby are ram too. */ -static void __init_refok adjust_range_page_size_mask(struct map_range *mr, +static void __ref adjust_range_page_size_mask(struct map_range *mr, int nr_range) { int i; @@ -396,7 +406,7 @@ bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn) * This runs before bootmem is initialized and gets pages directly from * the physical memory. To access them they are temporarily mapped. */ -unsigned long __init_refok init_memory_mapping(unsigned long start, +unsigned long __ref init_memory_mapping(unsigned long start, unsigned long end) { struct map_range mr[NR_RANGE_MR]; @@ -700,13 +710,6 @@ void free_initmem(void) void __init free_initrd_mem(unsigned long start, unsigned long end) { /* - * Remember, initrd memory may contain microcode or other useful things. - * Before we lose initrd mem, we need to find a place to hold them - * now that normal virtual memory is enabled. - */ - save_microcode_in_initrd(); - - /* * end could be not aligned, and We can not align that, * decompresser could be confused by aligned initrd_end * We already reserve the end partial page before in diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 84df150ee77e..cf8059016ec8 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -5,7 +5,6 @@ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 */ -#include <linux/module.h> #include <linux/signal.h> #include <linux/sched.h> #include <linux/kernel.h> diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 53cc2256cf23..14b9dd71d9e8 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -27,7 +27,6 @@ #include <linux/pfn.h> #include <linux/poison.h> #include <linux/dma-mapping.h> -#include <linux/module.h> #include <linux/memory.h> #include <linux/memory_hotplug.h> #include <linux/memremap.h> diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index 9c0ff045fdd4..ada98b39b8ad 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -18,7 +18,7 @@ #include <asm/iomap.h> #include <asm/pat.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/highmem.h> static int is_io_mapping_possible(resource_size_t base, unsigned long size) diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index f0894910bdd7..7aaa2635862d 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -9,7 +9,6 @@ #include <linux/bootmem.h> #include <linux/init.h> #include <linux/io.h> -#include <linux/module.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/mmiotrace.h> diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index 26dccd6c0df1..bda8d5eef04d 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -77,7 +77,7 @@ static inline unsigned long get_padding(struct kaslr_memory_region *region) */ static inline bool kaslr_memory_enabled(void) { - return kaslr_enabled() && !config_enabled(CONFIG_KASAN); + return kaslr_enabled() && !IS_ENABLED(CONFIG_KASAN); } /* Initialize base and padding for each memory region randomized with KASLR */ @@ -97,7 +97,7 @@ void __init kernel_randomize_memory(void) * add padding if needed (especially for memory hotplug support). */ BUG_ON(kaslr_regions[0].base != &page_offset_base); - memory_tb = ((max_pfn << PAGE_SHIFT) >> TB_SHIFT) + + memory_tb = DIV_ROUND_UP(max_pfn << PAGE_SHIFT, 1UL << TB_SHIFT) + CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING; /* Adapt phyiscal memory region size based on available memory */ diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c index b4f2e7e9e907..4515bae36bbe 100644 --- a/arch/x86/mm/kmemcheck/kmemcheck.c +++ b/arch/x86/mm/kmemcheck/kmemcheck.c @@ -14,7 +14,6 @@ #include <linux/kernel.h> #include <linux/kmemcheck.h> #include <linux/mm.h> -#include <linux/module.h> #include <linux/page-flags.h> #include <linux/percpu.h> #include <linux/ptrace.h> diff --git a/arch/x86/mm/kmemcheck/shadow.c b/arch/x86/mm/kmemcheck/shadow.c index aec124214d97..c2638a7d2c10 100644 --- a/arch/x86/mm/kmemcheck/shadow.c +++ b/arch/x86/mm/kmemcheck/shadow.c @@ -1,5 +1,5 @@ #include <linux/kmemcheck.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/mm.h> #include <asm/page.h> diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index ddb2244b06a1..afc47f5c9531 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -11,7 +11,7 @@ #include <linux/rculist.h> #include <linux/spinlock.h> #include <linux/hash.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/kernel.h> #include <linux/uaccess.h> #include <linux/ptrace.h> diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index 0057a7accfb1..bef36622e408 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -24,7 +24,7 @@ #define DEBUG 1 -#include <linux/module.h> +#include <linux/moduleparam.h> #include <linux/debugfs.h> #include <linux/slab.h> #include <linux/uaccess.h> diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 968ac028c34e..fb682108f4dc 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -8,7 +8,6 @@ #include <linux/memblock.h> #include <linux/mmzone.h> #include <linux/ctype.h> -#include <linux/module.h> #include <linux/nodemask.h> #include <linux/sched.h> #include <linux/topology.h> diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 47b6436e41c2..6b7ce6279133 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -24,7 +24,7 @@ #include <linux/bootmem.h> #include <linux/memblock.h> -#include <linux/module.h> +#include <linux/init.h> #include "numa_internal.h" diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 849dc09fa4f0..e3353c97d086 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -917,11 +917,11 @@ static void populate_pte(struct cpa_data *cpa, } } -static int populate_pmd(struct cpa_data *cpa, - unsigned long start, unsigned long end, - unsigned num_pages, pud_t *pud, pgprot_t pgprot) +static long populate_pmd(struct cpa_data *cpa, + unsigned long start, unsigned long end, + unsigned num_pages, pud_t *pud, pgprot_t pgprot) { - unsigned int cur_pages = 0; + long cur_pages = 0; pmd_t *pmd; pgprot_t pmd_pgprot; @@ -991,12 +991,12 @@ static int populate_pmd(struct cpa_data *cpa, return num_pages; } -static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, - pgprot_t pgprot) +static long populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, + pgprot_t pgprot) { pud_t *pud; unsigned long end; - int cur_pages = 0; + long cur_pages = 0; pgprot_t pud_pgprot; end = start + (cpa->numpages << PAGE_SHIFT); @@ -1052,7 +1052,7 @@ static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, /* Map trailing leftover */ if (start < end) { - int tmp; + long tmp; pud = pud_offset(pgd, start); if (pud_none(*pud)) @@ -1078,7 +1078,7 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr) pgprot_t pgprot = __pgprot(_KERNPG_TABLE); pud_t *pud = NULL; /* shut up gcc */ pgd_t *pgd_entry; - int ret; + long ret; pgd_entry = cpa->pgd + pgd_index(addr); @@ -1327,7 +1327,8 @@ static int cpa_process_alias(struct cpa_data *cpa) static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) { - int ret, numpages = cpa->numpages; + unsigned long numpages = cpa->numpages; + int ret; while (numpages) { /* diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index db00e3e2f3dc..170cc4ff057b 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -11,7 +11,6 @@ #include <linux/bootmem.h> #include <linux/debugfs.h> #include <linux/kernel.h> -#include <linux/module.h> #include <linux/pfn_t.h> #include <linux/slab.h> #include <linux/mm.h> @@ -928,9 +927,10 @@ int track_pfn_copy(struct vm_area_struct *vma) } /* - * prot is passed in as a parameter for the new mapping. If the vma has a - * linear pfn mapping for the entire range reserve the entire vma range with - * single reserve_pfn_range call. + * prot is passed in as a parameter for the new mapping. If the vma has + * a linear pfn mapping for the entire range, or no vma is provided, + * reserve the entire pfn + size range with single reserve_pfn_range + * call. */ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, unsigned long pfn, unsigned long addr, unsigned long size) @@ -939,11 +939,12 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, enum page_cache_mode pcm; /* reserve the whole chunk starting from paddr */ - if (addr == vma->vm_start && size == (vma->vm_end - vma->vm_start)) { + if (!vma || (addr == vma->vm_start + && size == (vma->vm_end - vma->vm_start))) { int ret; ret = reserve_pfn_range(paddr, size, prot, 0); - if (!ret) + if (ret == 0 && vma) vma->vm_flags |= VM_PAT; return ret; } @@ -998,7 +999,7 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, resource_size_t paddr; unsigned long prot; - if (!(vma->vm_flags & VM_PAT)) + if (vma && !(vma->vm_flags & VM_PAT)) return; /* free the chunk starting from pfn or the whole chunk */ @@ -1012,7 +1013,8 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, size = vma->vm_end - vma->vm_start; } free_pfn_range(paddr, size); - vma->vm_flags &= ~VM_PAT; + if (vma) + vma->vm_flags &= ~VM_PAT; } /* diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c index 2f7702253ccf..de391b7bc19a 100644 --- a/arch/x86/mm/pat_rbtree.c +++ b/arch/x86/mm/pat_rbtree.c @@ -11,7 +11,6 @@ #include <linux/seq_file.h> #include <linux/debugfs.h> #include <linux/kernel.h> -#include <linux/module.h> #include <linux/rbtree_augmented.h> #include <linux/sched.h> #include <linux/gfp.h> diff --git a/arch/x86/mm/pf_in.c b/arch/x86/mm/pf_in.c index 9f0614daea85..a235869532bc 100644 --- a/arch/x86/mm/pf_in.c +++ b/arch/x86/mm/pf_in.c @@ -26,7 +26,6 @@ * Bjorn Steinbrink (B.Steinbrink@gmx.de), 2007 */ -#include <linux/module.h> #include <linux/ptrace.h> /* struct pt_regs */ #include "pf_in.h" diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index e67ae0e6c59d..9adce776852b 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -8,7 +8,6 @@ #include <linux/highmem.h> #include <linux/pagemap.h> #include <linux/spinlock.h> -#include <linux/module.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> diff --git a/arch/x86/mm/physaddr.c b/arch/x86/mm/physaddr.c index e666cbbb9261..cfc3b9121ce4 100644 --- a/arch/x86/mm/physaddr.c +++ b/arch/x86/mm/physaddr.c @@ -1,6 +1,6 @@ #include <linux/bootmem.h> #include <linux/mmdebug.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/mm.h> #include <asm/page.h> diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c index b1ecff460a46..35fe69529bc1 100644 --- a/arch/x86/mm/srat.c +++ b/arch/x86/mm/srat.c @@ -13,7 +13,7 @@ #include <linux/acpi.h> #include <linux/mmzone.h> #include <linux/bitmap.h> -#include <linux/module.h> +#include <linux/init.h> #include <linux/topology.h> #include <linux/mm.h> #include <asm/proto.h> diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 5643fd0b1a7d..4dbe65622810 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -4,7 +4,7 @@ #include <linux/spinlock.h> #include <linux/smp.h> #include <linux/interrupt.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/cpu.h> #include <asm/tlbflush.h> diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 8196054fedb0..7b6a9d14c8c0 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -133,7 +133,7 @@ static void pcibios_fixup_device_resources(struct pci_dev *dev) if (pci_probe & PCI_NOASSIGN_BARS) { /* * If the BIOS did not assign the BAR, zero out the - * resource so the kernel doesn't attmept to assign + * resource so the kernel doesn't attempt to assign * it later on in pci_assign_unassigned_resources */ for (bar = 0; bar <= PCI_STD_RESOURCE_END; bar++) { diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index 837ea36a837d..6d52b94f4bb9 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -553,15 +553,21 @@ static void twinhead_reserve_killing_zone(struct pci_dev *dev) DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x27B9, twinhead_reserve_killing_zone); /* - * Broadwell EP Home Agent BARs erroneously return non-zero values when read. + * Device [8086:2fc0] + * Erratum HSE43 + * CONFIG_TDP_NOMINAL CSR Implemented at Incorrect Offset + * http://www.intel.com/content/www/us/en/processors/xeon/xeon-e5-v3-spec-update.html * - * See http://www.intel.com/content/www/us/en/processors/xeon/xeon-e5-v4-spec-update.html - * entry BDF2. + * Devices [8086:6f60,6fa0,6fc0] + * Erratum BDF2 + * PCI BARs in the Home Agent Will Return Non-Zero Values During Enumeration + * http://www.intel.com/content/www/us/en/processors/xeon/xeon-e5-v4-spec-update.html */ -static void pci_bdwep_bar(struct pci_dev *dev) +static void pci_invalid_bar(struct pci_dev *dev) { dev->non_compliant_bars = 1; } -DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6f60, pci_bdwep_bar); -DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fa0, pci_bdwep_bar); -DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fc0, pci_bdwep_bar); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2fc0, pci_invalid_bar); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6f60, pci_invalid_bar); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fa0, pci_invalid_bar); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fc0, pci_invalid_bar); diff --git a/arch/x86/pci/sta2x11-fixup.c b/arch/x86/pci/sta2x11-fixup.c index 5ceda85b8687..052c1cb76305 100644 --- a/arch/x86/pci/sta2x11-fixup.c +++ b/arch/x86/pci/sta2x11-fixup.c @@ -169,7 +169,7 @@ static void *sta2x11_swiotlb_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flags, - struct dma_attrs *attrs) + unsigned long attrs) { void *vaddr; diff --git a/arch/x86/pci/vmd.c b/arch/x86/pci/vmd.c index 613cac7395c4..7948be342ee9 100644 --- a/arch/x86/pci/vmd.c +++ b/arch/x86/pci/vmd.c @@ -41,6 +41,7 @@ static DEFINE_RAW_SPINLOCK(list_lock); * @node: list item for parent traversal. * @rcu: RCU callback item for freeing. * @irq: back pointer to parent. + * @enabled: true if driver enabled IRQ * @virq: the virtual IRQ value provided to the requesting driver. * * Every MSI/MSI-X IRQ requested for a device in a VMD domain will be mapped to @@ -50,6 +51,7 @@ struct vmd_irq { struct list_head node; struct rcu_head rcu; struct vmd_irq_list *irq; + bool enabled; unsigned int virq; }; @@ -119,10 +121,13 @@ static void vmd_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) static void vmd_irq_enable(struct irq_data *data) { struct vmd_irq *vmdirq = data->chip_data; + unsigned long flags; - raw_spin_lock(&list_lock); + raw_spin_lock_irqsave(&list_lock, flags); + WARN_ON(vmdirq->enabled); list_add_tail_rcu(&vmdirq->node, &vmdirq->irq->irq_list); - raw_spin_unlock(&list_lock); + vmdirq->enabled = true; + raw_spin_unlock_irqrestore(&list_lock, flags); data->chip->irq_unmask(data); } @@ -130,12 +135,16 @@ static void vmd_irq_enable(struct irq_data *data) static void vmd_irq_disable(struct irq_data *data) { struct vmd_irq *vmdirq = data->chip_data; + unsigned long flags; data->chip->irq_mask(data); - raw_spin_lock(&list_lock); - list_del_rcu(&vmdirq->node); - raw_spin_unlock(&list_lock); + raw_spin_lock_irqsave(&list_lock, flags); + if (vmdirq->enabled) { + list_del_rcu(&vmdirq->node); + vmdirq->enabled = false; + } + raw_spin_unlock_irqrestore(&list_lock, flags); } /* @@ -166,16 +175,20 @@ static irq_hw_number_t vmd_get_hwirq(struct msi_domain_info *info, * XXX: We can be even smarter selecting the best IRQ once we solve the * affinity problem. */ -static struct vmd_irq_list *vmd_next_irq(struct vmd_dev *vmd) +static struct vmd_irq_list *vmd_next_irq(struct vmd_dev *vmd, struct msi_desc *desc) { - int i, best = 0; + int i, best = 1; + unsigned long flags; + + if (!desc->msi_attrib.is_msix || vmd->msix_count == 1) + return &vmd->irqs[0]; - raw_spin_lock(&list_lock); + raw_spin_lock_irqsave(&list_lock, flags); for (i = 1; i < vmd->msix_count; i++) if (vmd->irqs[i].count < vmd->irqs[best].count) best = i; vmd->irqs[best].count++; - raw_spin_unlock(&list_lock); + raw_spin_unlock_irqrestore(&list_lock, flags); return &vmd->irqs[best]; } @@ -184,14 +197,15 @@ static int vmd_msi_init(struct irq_domain *domain, struct msi_domain_info *info, unsigned int virq, irq_hw_number_t hwirq, msi_alloc_info_t *arg) { - struct vmd_dev *vmd = vmd_from_bus(msi_desc_to_pci_dev(arg->desc)->bus); + struct msi_desc *desc = arg->desc; + struct vmd_dev *vmd = vmd_from_bus(msi_desc_to_pci_dev(desc)->bus); struct vmd_irq *vmdirq = kzalloc(sizeof(*vmdirq), GFP_KERNEL); if (!vmdirq) return -ENOMEM; INIT_LIST_HEAD(&vmdirq->node); - vmdirq->irq = vmd_next_irq(vmd); + vmdirq->irq = vmd_next_irq(vmd, desc); vmdirq->virq = virq; irq_domain_set_info(domain, virq, vmdirq->irq->vmd_vector, info->chip, @@ -203,11 +217,12 @@ static void vmd_msi_free(struct irq_domain *domain, struct msi_domain_info *info, unsigned int virq) { struct vmd_irq *vmdirq = irq_get_chip_data(virq); + unsigned long flags; /* XXX: Potential optimization to rebalance */ - raw_spin_lock(&list_lock); + raw_spin_lock_irqsave(&list_lock, flags); vmdirq->irq->count--; - raw_spin_unlock(&list_lock); + raw_spin_unlock_irqrestore(&list_lock, flags); kfree_rcu(vmdirq, rcu); } @@ -261,18 +276,18 @@ static struct device *to_vmd_dev(struct device *dev) static struct dma_map_ops *vmd_dma_ops(struct device *dev) { - return to_vmd_dev(dev)->archdata.dma_ops; + return get_dma_ops(to_vmd_dev(dev)); } static void *vmd_alloc(struct device *dev, size_t size, dma_addr_t *addr, - gfp_t flag, struct dma_attrs *attrs) + gfp_t flag, unsigned long attrs) { return vmd_dma_ops(dev)->alloc(to_vmd_dev(dev), size, addr, flag, attrs); } static void vmd_free(struct device *dev, size_t size, void *vaddr, - dma_addr_t addr, struct dma_attrs *attrs) + dma_addr_t addr, unsigned long attrs) { return vmd_dma_ops(dev)->free(to_vmd_dev(dev), size, vaddr, addr, attrs); @@ -280,7 +295,7 @@ static void vmd_free(struct device *dev, size_t size, void *vaddr, static int vmd_mmap(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, dma_addr_t addr, size_t size, - struct dma_attrs *attrs) + unsigned long attrs) { return vmd_dma_ops(dev)->mmap(to_vmd_dev(dev), vma, cpu_addr, addr, size, attrs); @@ -288,7 +303,7 @@ static int vmd_mmap(struct device *dev, struct vm_area_struct *vma, static int vmd_get_sgtable(struct device *dev, struct sg_table *sgt, void *cpu_addr, dma_addr_t addr, size_t size, - struct dma_attrs *attrs) + unsigned long attrs) { return vmd_dma_ops(dev)->get_sgtable(to_vmd_dev(dev), sgt, cpu_addr, addr, size, attrs); @@ -297,26 +312,26 @@ static int vmd_get_sgtable(struct device *dev, struct sg_table *sgt, static dma_addr_t vmd_map_page(struct device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction dir, - struct dma_attrs *attrs) + unsigned long attrs) { return vmd_dma_ops(dev)->map_page(to_vmd_dev(dev), page, offset, size, dir, attrs); } static void vmd_unmap_page(struct device *dev, dma_addr_t addr, size_t size, - enum dma_data_direction dir, struct dma_attrs *attrs) + enum dma_data_direction dir, unsigned long attrs) { vmd_dma_ops(dev)->unmap_page(to_vmd_dev(dev), addr, size, dir, attrs); } static int vmd_map_sg(struct device *dev, struct scatterlist *sg, int nents, - enum dma_data_direction dir, struct dma_attrs *attrs) + enum dma_data_direction dir, unsigned long attrs) { return vmd_dma_ops(dev)->map_sg(to_vmd_dev(dev), sg, nents, dir, attrs); } static void vmd_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, - enum dma_data_direction dir, struct dma_attrs *attrs) + enum dma_data_direction dir, unsigned long attrs) { vmd_dma_ops(dev)->unmap_sg(to_vmd_dev(dev), sg, nents, dir, attrs); } @@ -367,7 +382,7 @@ static void vmd_teardown_dma_ops(struct vmd_dev *vmd) { struct dma_domain *domain = &vmd->dma_domain; - if (vmd->dev->dev.archdata.dma_ops) + if (get_dma_ops(&vmd->dev->dev)) del_dma_domain(domain); } @@ -379,7 +394,7 @@ static void vmd_teardown_dma_ops(struct vmd_dev *vmd) static void vmd_setup_dma_ops(struct vmd_dev *vmd) { - const struct dma_map_ops *source = vmd->dev->dev.archdata.dma_ops; + const struct dma_map_ops *source = get_dma_ops(&vmd->dev->dev); struct dma_map_ops *dest = &vmd->dma_ops; struct dma_domain *domain = &vmd->dma_domain; @@ -594,7 +609,7 @@ static int vmd_enable_domain(struct vmd_dev *vmd) sd->node = pcibus_to_node(vmd->dev->bus); vmd->irq_domain = pci_msi_create_irq_domain(NULL, &vmd_msi_domain_info, - NULL); + x86_vector_domain); if (!vmd->irq_domain) return -ENODEV; diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 99ddab79215e..3a483cb5ac81 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -9,7 +9,7 @@ * Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> * Stefano Stabellini <stefano.stabellini@eu.citrix.com> */ -#include <linux/module.h> +#include <linux/export.h> #include <linux/init.h> #include <linux/pci.h> #include <linux/acpi.h> diff --git a/arch/x86/platform/ce4100/ce4100.c b/arch/x86/platform/ce4100/ce4100.c index 701fd5843c87..b27bccd4390f 100644 --- a/arch/x86/platform/ce4100/ce4100.c +++ b/arch/x86/platform/ce4100/ce4100.c @@ -11,11 +11,9 @@ #include <linux/init.h> #include <linux/kernel.h> #include <linux/irq.h> -#include <linux/module.h> #include <linux/reboot.h> #include <linux/serial_reg.h> #include <linux/serial_8250.h> -#include <linux/reboot.h> #include <asm/ce4100.h> #include <asm/prom.h> diff --git a/arch/x86/platform/efi/early_printk.c b/arch/x86/platform/efi/early_printk.c index 524142117296..5fdacb322ceb 100644 --- a/arch/x86/platform/efi/early_printk.c +++ b/arch/x86/platform/efi/early_printk.c @@ -44,7 +44,7 @@ early_initcall(early_efi_map_fb); * In case earlyprintk=efi,keep we have the whole framebuffer mapped already * so just return the offset efi_fb + start. */ -static __init_refok void *early_efi_map(unsigned long start, unsigned long len) +static __ref void *early_efi_map(unsigned long start, unsigned long len) { unsigned long base; @@ -56,7 +56,7 @@ static __init_refok void *early_efi_map(unsigned long start, unsigned long len) return early_ioremap(base + start, len); } -static __init_refok void early_efi_unmap(void *addr, unsigned long len) +static __ref void early_efi_unmap(void *addr, unsigned long len) { if (!efi_fb) early_iounmap(addr, len); diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 17c8bbd4e2f0..1fbb408e2e72 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -51,7 +51,6 @@ #include <asm/cacheflush.h> #include <asm/tlbflush.h> #include <asm/x86_init.h> -#include <asm/rtc.h> #include <asm/uv/uv.h> static struct efi efi_phys __initdata; diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 3e12c44f88a2..8dd3784eb075 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -24,7 +24,8 @@ #include <linux/spinlock.h> #include <linux/bootmem.h> #include <linux/ioport.h> -#include <linux/module.h> +#include <linux/init.h> +#include <linux/mc146818rtc.h> #include <linux/efi.h> #include <linux/uaccess.h> #include <linux/io.h> @@ -244,7 +245,7 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) * text and allocate a new stack because we can't rely on the * stack pointer being < 4GB. */ - if (!IS_ENABLED(CONFIG_EFI_MIXED)) + if (!IS_ENABLED(CONFIG_EFI_MIXED) || efi_is_native()) return 0; /* diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index 4480c06cade7..89d1146f5a6f 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -254,6 +254,7 @@ void __init efi_free_boot_services(void) for_each_efi_memory_desc(md) { unsigned long long start = md->phys_addr; unsigned long long size = md->num_pages << EFI_PAGE_SHIFT; + size_t rm_size; if (md->type != EFI_BOOT_SERVICES_CODE && md->type != EFI_BOOT_SERVICES_DATA) @@ -263,6 +264,26 @@ void __init efi_free_boot_services(void) if (md->attribute & EFI_MEMORY_RUNTIME) continue; + /* + * Nasty quirk: if all sub-1MB memory is used for boot + * services, we can get here without having allocated the + * real mode trampoline. It's too late to hand boot services + * memory back to the memblock allocator, so instead + * try to manually allocate the trampoline if needed. + * + * I've seen this on a Dell XPS 13 9350 with firmware + * 1.4.4 with SGX enabled booting Linux via Fedora 24's + * grub2-efi on a hard disk. (And no, I don't know why + * this happened, but Linux should still try to boot rather + * panicing early.) + */ + rm_size = real_mode_size_needed(); + if (rm_size && (start + rm_size) < (1<<20) && size >= rm_size) { + set_real_mode_mem(start, rm_size); + start += rm_size; + size -= rm_size; + } + free_bootmem_late(start, size); } diff --git a/arch/x86/platform/intel-mid/intel-mid.c b/arch/x86/platform/intel-mid/intel-mid.c index abbf49c6e9d3..ce119d2ba0d0 100644 --- a/arch/x86/platform/intel-mid/intel-mid.c +++ b/arch/x86/platform/intel-mid/intel-mid.c @@ -20,7 +20,7 @@ #include <linux/scatterlist.h> #include <linux/sfi.h> #include <linux/irq.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/notifier.h> #include <asm/setup.h> diff --git a/arch/x86/platform/intel-mid/intel_mid_vrtc.c b/arch/x86/platform/intel-mid/intel_mid_vrtc.c index ee40fcb6e54d..58024862a7eb 100644 --- a/arch/x86/platform/intel-mid/intel_mid_vrtc.c +++ b/arch/x86/platform/intel-mid/intel_mid_vrtc.c @@ -22,6 +22,7 @@ #include <linux/init.h> #include <linux/sfi.h> #include <linux/platform_device.h> +#include <linux/mc146818rtc.h> #include <asm/intel-mid.h> #include <asm/intel_mid_vrtc.h> diff --git a/arch/x86/platform/intel-mid/pwr.c b/arch/x86/platform/intel-mid/pwr.c index 5bc90dd102d4..c901a3423772 100644 --- a/arch/x86/platform/intel-mid/pwr.c +++ b/arch/x86/platform/intel-mid/pwr.c @@ -21,10 +21,9 @@ #include <linux/delay.h> #include <linux/errno.h> -#include <linux/init.h> #include <linux/interrupt.h> #include <linux/kernel.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/mutex.h> #include <linux/pci.h> @@ -407,7 +406,6 @@ static const struct pci_device_id mid_pwr_pci_ids[] = { { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_TANGIER), (kernel_ulong_t)&mid_info }, {} }; -MODULE_DEVICE_TABLE(pci, mid_pwr_pci_ids); static struct pci_driver mid_pwr_pci_driver = { .name = "intel_mid_pwr", diff --git a/arch/x86/platform/intel-mid/sfi.c b/arch/x86/platform/intel-mid/sfi.c index 1555672d436f..051d264fce2e 100644 --- a/arch/x86/platform/intel-mid/sfi.c +++ b/arch/x86/platform/intel-mid/sfi.c @@ -24,7 +24,7 @@ #include <linux/input.h> #include <linux/platform_device.h> #include <linux/irq.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/notifier.h> #include <linux/mmc/core.h> #include <linux/mmc/card.h> diff --git a/arch/x86/platform/olpc/olpc.c b/arch/x86/platform/olpc/olpc.c index 27376081ddec..7c3077e58fa0 100644 --- a/arch/x86/platform/olpc/olpc.c +++ b/arch/x86/platform/olpc/olpc.c @@ -12,7 +12,7 @@ #include <linux/kernel.h> #include <linux/init.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/delay.h> #include <linux/io.h> #include <linux/string.h> diff --git a/arch/x86/platform/olpc/olpc_ofw.c b/arch/x86/platform/olpc/olpc_ofw.c index e7604f62870d..f1aab8cdb33f 100644 --- a/arch/x86/platform/olpc/olpc_ofw.c +++ b/arch/x86/platform/olpc/olpc_ofw.c @@ -1,9 +1,12 @@ #include <linux/kernel.h> -#include <linux/module.h> +#include <linux/export.h> +#include <linux/spinlock_types.h> #include <linux/init.h> #include <asm/page.h> #include <asm/setup.h> #include <asm/io.h> +#include <asm/cpufeature.h> +#include <asm/special_insns.h> #include <asm/pgtable.h> #include <asm/olpc_ofw.h> diff --git a/arch/x86/platform/ts5500/ts5500.c b/arch/x86/platform/ts5500/ts5500.c index baf16e72e668..fd39301f25ac 100644 --- a/arch/x86/platform/ts5500/ts5500.c +++ b/arch/x86/platform/ts5500/ts5500.c @@ -23,7 +23,7 @@ #include <linux/io.h> #include <linux/kernel.h> #include <linux/leds.h> -#include <linux/module.h> +#include <linux/init.h> #include <linux/platform_data/gpio-ts5500.h> #include <linux/platform_data/max197.h> #include <linux/platform_device.h> @@ -345,7 +345,3 @@ error: return err; } device_initcall(ts5500_init); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Savoir-faire Linux Inc. <kernel@savoirfairelinux.com>"); -MODULE_DESCRIPTION("Technologic Systems TS-5500 platform driver"); diff --git a/arch/x86/platform/uv/bios_uv.c b/arch/x86/platform/uv/bios_uv.c index 66b2166ea4a1..23f2f3e41c7f 100644 --- a/arch/x86/platform/uv/bios_uv.c +++ b/arch/x86/platform/uv/bios_uv.c @@ -187,7 +187,8 @@ EXPORT_SYMBOL_GPL(uv_bios_set_legacy_vga_target); void uv_bios_init(void) { uv_systab = NULL; - if ((efi.uv_systab == EFI_INVALID_TABLE_ADDR) || !efi.uv_systab) { + if ((efi.uv_systab == EFI_INVALID_TABLE_ADDR) || + !efi.uv_systab || efi_runtime_disabled()) { pr_crit("UV: UVsystab: missing\n"); return; } @@ -199,12 +200,14 @@ void uv_bios_init(void) return; } + /* Starting with UV4 the UV systab size is variable */ if (uv_systab->revision >= UV_SYSTAB_VERSION_UV4) { + int size = uv_systab->size; + iounmap(uv_systab); - uv_systab = ioremap(efi.uv_systab, uv_systab->size); + uv_systab = ioremap(efi.uv_systab, size); if (!uv_systab) { - pr_err("UV: UVsystab: ioremap(%d) failed!\n", - uv_systab->size); + pr_err("UV: UVsystab: ioremap(%d) failed!\n", size); return; } } diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c index e1c24631afbb..776c6592136c 100644 --- a/arch/x86/platform/uv/uv_irq.c +++ b/arch/x86/platform/uv/uv_irq.c @@ -8,7 +8,7 @@ * Copyright (C) 2008 Silicon Graphics, Inc. All rights reserved. */ -#include <linux/module.h> +#include <linux/export.h> #include <linux/rbtree.h> #include <linux/slab.h> #include <linux/irq.h> diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c index 8dd80050d705..cd5173a2733f 100644 --- a/arch/x86/platform/uv/uv_nmi.c +++ b/arch/x86/platform/uv/uv_nmi.c @@ -24,7 +24,7 @@ #include <linux/kdb.h> #include <linux/kexec.h> #include <linux/kgdb.h> -#include <linux/module.h> +#include <linux/moduleparam.h> #include <linux/nmi.h> #include <linux/sched.h> #include <linux/slab.h> diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index f2b5e6a5cf95..9634557a5444 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -37,11 +37,11 @@ unsigned long jump_address_phys; */ unsigned long restore_cr3 __visible; -pgd_t *temp_level4_pgt __visible; +unsigned long temp_level4_pgt __visible; unsigned long relocated_restore_code __visible; -static int set_up_temporary_text_mapping(void) +static int set_up_temporary_text_mapping(pgd_t *pgd) { pmd_t *pmd; pud_t *pud; @@ -71,7 +71,7 @@ static int set_up_temporary_text_mapping(void) __pmd((jump_address_phys & PMD_MASK) | __PAGE_KERNEL_LARGE_EXEC)); set_pud(pud + pud_index(restore_jump_address), __pud(__pa(pmd) | _KERNPG_TABLE)); - set_pgd(temp_level4_pgt + pgd_index(restore_jump_address), + set_pgd(pgd + pgd_index(restore_jump_address), __pgd(__pa(pud) | _KERNPG_TABLE)); return 0; @@ -87,18 +87,19 @@ static int set_up_temporary_mappings(void) struct x86_mapping_info info = { .alloc_pgt_page = alloc_pgt_page, .pmd_flag = __PAGE_KERNEL_LARGE_EXEC, - .kernel_mapping = true, + .offset = __PAGE_OFFSET, }; unsigned long mstart, mend; + pgd_t *pgd; int result; int i; - temp_level4_pgt = (pgd_t *)get_safe_page(GFP_ATOMIC); - if (!temp_level4_pgt) + pgd = (pgd_t *)get_safe_page(GFP_ATOMIC); + if (!pgd) return -ENOMEM; /* Prepare a temporary mapping for the kernel text */ - result = set_up_temporary_text_mapping(); + result = set_up_temporary_text_mapping(pgd); if (result) return result; @@ -107,13 +108,12 @@ static int set_up_temporary_mappings(void) mstart = pfn_mapped[i].start << PAGE_SHIFT; mend = pfn_mapped[i].end << PAGE_SHIFT; - result = kernel_ident_mapping_init(&info, temp_level4_pgt, - mstart, mend); - + result = kernel_ident_mapping_init(&info, pgd, mstart, mend); if (result) return result; } + temp_level4_pgt = __pa(pgd); return 0; } diff --git a/arch/x86/power/hibernate_asm_64.S b/arch/x86/power/hibernate_asm_64.S index 3177c2bc26f6..ce8da3a0412c 100644 --- a/arch/x86/power/hibernate_asm_64.S +++ b/arch/x86/power/hibernate_asm_64.S @@ -24,7 +24,6 @@ #include <asm/frame.h> ENTRY(swsusp_arch_suspend) - FRAME_BEGIN movq $saved_context, %rax movq %rsp, pt_regs_sp(%rax) movq %rbp, pt_regs_bp(%rax) @@ -48,6 +47,7 @@ ENTRY(swsusp_arch_suspend) movq %cr3, %rax movq %rax, restore_cr3(%rip) + FRAME_BEGIN call swsusp_save FRAME_END ret @@ -72,8 +72,6 @@ ENTRY(restore_image) /* code below has been relocated to a safe page */ ENTRY(core_restore_code) /* switch to temporary page tables */ - movq $__PAGE_OFFSET, %rcx - subq %rcx, %rax movq %rax, %cr3 /* flush TLB */ movq %rbx, %rcx @@ -104,7 +102,6 @@ ENTRY(core_restore_code) /* code below belongs to the image kernel */ .align PAGE_SIZE ENTRY(restore_registers) - FRAME_BEGIN /* go back to the original page tables */ movq %r9, %cr3 @@ -145,6 +142,5 @@ ENTRY(restore_registers) /* tell the hibernation core that we've just restored the memory */ movq %rax, in_suspend(%rip) - FRAME_END ret ENDPROC(restore_registers) diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile index 12734a96df47..ac58c1616408 100644 --- a/arch/x86/purgatory/Makefile +++ b/arch/x86/purgatory/Makefile @@ -8,6 +8,8 @@ PURGATORY_OBJS = $(addprefix $(obj)/,$(purgatory-y)) LDFLAGS_purgatory.ro := -e purgatory_start -r --no-undefined -nostdlib -z nodefaultlib targets += purgatory.ro +KCOV_INSTRUMENT := n + # Default KBUILD_CFLAGS can have -pg option set when FTRACE is enabled. That # in turn leaves some undefined symbols like __fentry__ in purgatory and not # sure how to relocate those. Like kexec-tools, use custom flags. diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c index 705e3fffb4a1..5db706f14111 100644 --- a/arch/x86/realmode/init.c +++ b/arch/x86/realmode/init.c @@ -1,9 +1,11 @@ #include <linux/io.h> +#include <linux/slab.h> #include <linux/memblock.h> #include <asm/cacheflush.h> #include <asm/pgtable.h> #include <asm/realmode.h> +#include <asm/tlbflush.h> struct real_mode_header *real_mode_header; u32 *trampoline_cr4_features; @@ -11,25 +13,37 @@ u32 *trampoline_cr4_features; /* Hold the pgd entry used on booting additional CPUs */ pgd_t trampoline_pgd_entry; +void __init set_real_mode_mem(phys_addr_t mem, size_t size) +{ + void *base = __va(mem); + + real_mode_header = (struct real_mode_header *) base; + printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n", + base, (unsigned long long)mem, size); +} + void __init reserve_real_mode(void) { phys_addr_t mem; - unsigned char *base; - size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob); + size_t size = real_mode_size_needed(); + + if (!size) + return; + + WARN_ON(slab_is_available()); /* Has to be under 1M so we can execute real-mode AP code. */ mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE); - if (!mem) - panic("Cannot allocate trampoline\n"); + if (!mem) { + pr_info("No sub-1M memory is available for the trampoline\n"); + return; + } - base = __va(mem); memblock_reserve(mem, size); - real_mode_header = (struct real_mode_header *) base; - printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n", - base, (unsigned long long)mem, size); + set_real_mode_mem(mem, size); } -void __init setup_real_mode(void) +static void __init setup_real_mode(void) { u16 real_mode_seg; const u32 *rel; @@ -84,7 +98,7 @@ void __init setup_real_mode(void) trampoline_header->start = (u64) secondary_startup_64; trampoline_cr4_features = &trampoline_header->cr4; - *trampoline_cr4_features = __read_cr4(); + *trampoline_cr4_features = mmu_cr4_features; trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd); trampoline_pgd[0] = trampoline_pgd_entry.pgd; @@ -100,7 +114,7 @@ void __init setup_real_mode(void) * need to mark it executable at do_pre_smp_initcalls() at least, * thus run it as a early_initcall(). */ -static int __init set_real_mode_permissions(void) +static void __init set_real_mode_permissions(void) { unsigned char *base = (unsigned char *) real_mode_header; size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob); @@ -119,7 +133,16 @@ static int __init set_real_mode_permissions(void) set_memory_nx((unsigned long) base, size >> PAGE_SHIFT); set_memory_ro((unsigned long) base, ro_size >> PAGE_SHIFT); set_memory_x((unsigned long) text_start, text_size >> PAGE_SHIFT); +} + +static int __init init_real_mode(void) +{ + if (!real_mode_header) + panic("Real mode trampoline was not allocated"); + + setup_real_mode(); + set_real_mode_permissions(); return 0; } -early_initcall(set_real_mode_permissions); +early_initcall(init_real_mode); diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile index c556c5ae8de5..25012abc3409 100644 --- a/arch/x86/realmode/rm/Makefile +++ b/arch/x86/realmode/rm/Makefile @@ -48,7 +48,7 @@ targets += realmode.lds $(obj)/realmode.lds: $(obj)/pasyms.h LDFLAGS_realmode.elf := --emit-relocs -T -CPPFLAGS_realmode.lds += -P -C -I$(obj) +CPPFLAGS_realmode.lds += -P -C -I$(objtree)/$(obj) targets += realmode.elf $(obj)/realmode.elf: $(obj)/realmode.lds $(REALMODE_OBJS) FORCE diff --git a/arch/x86/um/delay.c b/arch/x86/um/delay.c index f3fe1a688f7e..a8fb7ca4822b 100644 --- a/arch/x86/um/delay.c +++ b/arch/x86/um/delay.c @@ -7,7 +7,7 @@ * published by the Free Software Foundation. */ -#include <linux/module.h> +#include <linux/export.h> #include <linux/kernel.h> #include <linux/delay.h> #include <asm/param.h> diff --git a/arch/x86/um/ptrace_32.c b/arch/x86/um/ptrace_32.c index ebd4dd6ef73b..a7ef7b131e25 100644 --- a/arch/x86/um/ptrace_32.c +++ b/arch/x86/um/ptrace_32.c @@ -84,7 +84,10 @@ int putreg(struct task_struct *child, int regno, unsigned long value) case EAX: case EIP: case UESP: + break; case ORIG_EAX: + /* Update the syscall number. */ + UPT_SYSCALL_NR(&child->thread.regs.regs) = value; break; case FS: if (value && (value & 3) != 3) diff --git a/arch/x86/um/ptrace_64.c b/arch/x86/um/ptrace_64.c index faab418876ce..0b5c184dd5b3 100644 --- a/arch/x86/um/ptrace_64.c +++ b/arch/x86/um/ptrace_64.c @@ -78,7 +78,11 @@ int putreg(struct task_struct *child, int regno, unsigned long value) case RSI: case RDI: case RBP: + break; + case ORIG_RAX: + /* Update the syscall number. */ + UPT_SYSCALL_NR(&child->thread.regs.regs) = value; break; case FS: diff --git a/arch/x86/um/vdso/Makefile b/arch/x86/um/vdso/Makefile index 6c803ca49b5d..d72dec406ccb 100644 --- a/arch/x86/um/vdso/Makefile +++ b/arch/x86/um/vdso/Makefile @@ -2,6 +2,9 @@ # Building vDSO images for x86. # +# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in. +KCOV_INSTRUMENT := n + VDSO64-y := y vdso-install-$(VDSO64-y) += vdso.so diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c index c8377fb26cdf..1daff5545c0a 100644 --- a/arch/x86/xen/debugfs.c +++ b/arch/x86/xen/debugfs.c @@ -1,7 +1,6 @@ #include <linux/init.h> #include <linux/debugfs.h> #include <linux/slab.h> -#include <linux/module.h> #include "debugfs.h" diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 69b4b6d29738..b86ebb1a9a7f 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -23,7 +23,7 @@ #include <linux/sched.h> #include <linux/kprobes.h> #include <linux/bootmem.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/mm.h> #include <linux/page-flags.h> #include <linux/highmem.h> @@ -34,9 +34,7 @@ #include <linux/edd.h> #include <linux/frame.h> -#ifdef CONFIG_KEXEC_CORE #include <linux/kexec.h> -#endif #include <xen/xen.h> #include <xen/events.h> @@ -120,7 +118,7 @@ DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu); DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); /* Linux <-> Xen vCPU id mapping */ -DEFINE_PER_CPU(int, xen_vcpu_id) = -1; +DEFINE_PER_CPU(uint32_t, xen_vcpu_id); EXPORT_PER_CPU_SYMBOL(xen_vcpu_id); enum xen_domain_type xen_domain_type = XEN_NATIVE; @@ -1334,7 +1332,8 @@ static void xen_crash_shutdown(struct pt_regs *regs) static int xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr) { - xen_reboot(SHUTDOWN_crash); + if (!kexec_crash_loaded()) + xen_reboot(SHUTDOWN_crash); return NOTIFY_DONE; } diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 67433714b791..7d5afdb417cc 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -43,7 +43,8 @@ #include <linux/debugfs.h> #include <linux/bug.h> #include <linux/vmalloc.h> -#include <linux/module.h> +#include <linux/export.h> +#include <linux/init.h> #include <linux/gfp.h> #include <linux/memblock.h> #include <linux/seq_file.h> diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index dd2a49a8aacc..37129db76d33 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -60,7 +60,7 @@ */ #include <linux/init.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/list.h> #include <linux/hash.h> #include <linux/sched.h> diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c index 9586ff32810c..d37a0c7f82cb 100644 --- a/arch/x86/xen/platform-pci-unplug.c +++ b/arch/x86/xen/platform-pci-unplug.c @@ -21,7 +21,7 @@ #include <linux/init.h> #include <linux/io.h> -#include <linux/module.h> +#include <linux/export.h> #include <xen/platform_pci.h> #include "xen-ops.h" diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index e345891450c3..176425233e4d 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -4,7 +4,7 @@ * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 */ -#include <linux/module.h> +#include <linux/init.h> #include <linux/sched.h> #include <linux/mm.h> #include <linux/pm.h> |