diff options
Diffstat (limited to 'arch/x86')
81 files changed, 1121 insertions, 1179 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f1dbb4ee19d7..6d4774f203d0 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -63,7 +63,7 @@ config X86 select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_REFCOUNT select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64 - select ARCH_HAS_UACCESS_MCSAFE if X86_64 + select ARCH_HAS_UACCESS_MCSAFE if X86_64 && X86_MCE select ARCH_HAS_SET_MEMORY select ARCH_HAS_SG_CHAIN select ARCH_HAS_STRICT_KERNEL_RWX @@ -180,7 +180,7 @@ config X86 select HAVE_PERF_USER_STACK_DUMP select HAVE_RCU_TABLE_FREE select HAVE_REGS_AND_STACK_ACCESS_API - select HAVE_RELIABLE_STACKTRACE if X86_64 && UNWINDER_FRAME_POINTER && STACK_VALIDATION + select HAVE_RELIABLE_STACKTRACE if X86_64 && (UNWINDER_FRAME_POINTER || UNWINDER_ORC) && STACK_VALIDATION select HAVE_STACKPROTECTOR if CC_HAS_SANE_STACKPROTECTOR select HAVE_STACK_VALIDATION if X86_64 select HAVE_RSEQ diff --git a/arch/x86/Makefile b/arch/x86/Makefile index a08e82856563..7e3c07d6ad42 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -80,11 +80,6 @@ ifeq ($(CONFIG_X86_32),y) # alignment instructions. KBUILD_CFLAGS += $(call cc-option,$(cc_stack_align4)) - # Disable unit-at-a-time mode on pre-gcc-4.0 compilers, it makes gcc use - # a lot more stack due to the lack of sharing of stacklots: - KBUILD_CFLAGS += $(call cc-ifversion, -lt, 0400, \ - $(call cc-option,-fno-unit-at-a-time)) - # CPU-specific tuning. Anything which can be shared with UML should go here. include arch/x86/Makefile_32.cpu KBUILD_CFLAGS += $(cflags-y) diff --git a/arch/x86/boot/bitops.h b/arch/x86/boot/bitops.h index 0d41d68131cc..2e1382486e91 100644 --- a/arch/x86/boot/bitops.h +++ b/arch/x86/boot/bitops.h @@ -17,6 +17,7 @@ #define _LINUX_BITOPS_H /* Inhibit inclusion of <linux/bitops.h> */ #include <linux/types.h> +#include <asm/asm.h> static inline bool constant_test_bit(int nr, const void *addr) { @@ -28,7 +29,7 @@ static inline bool variable_test_bit(int nr, const void *addr) bool v; const u32 *p = (const u32 *)addr; - asm("btl %2,%1; setc %0" : "=qm" (v) : "m" (*p), "Ir" (nr)); + asm("btl %2,%1" CC_SET(c) : CC_OUT(c) (v) : "m" (*p), "Ir" (nr)); return v; } diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index fa42f895fdde..169c2feda14a 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -106,9 +106,13 @@ define cmd_check_data_rel done endef +# We need to run two commands under "if_changed", so merge them into a +# single invocation. +quiet_cmd_check-and-link-vmlinux = LD $@ + cmd_check-and-link-vmlinux = $(cmd_check_data_rel); $(cmd_ld) + $(obj)/vmlinux: $(vmlinux-objs-y) FORCE - $(call if_changed,check_data_rel) - $(call if_changed,ld) + $(call if_changed,check-and-link-vmlinux) OBJCOPYFLAGS_vmlinux.bin := -R .comment -S $(obj)/vmlinux.bin: vmlinux FORCE diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index e98522ea6f09..1458b1700fc7 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -34,74 +34,13 @@ static void setup_boot_services##bits(struct efi_config *c) \ \ table = (typeof(table))sys_table; \ \ - c->runtime_services = table->runtime; \ - c->boot_services = table->boottime; \ - c->text_output = table->con_out; \ + c->runtime_services = table->runtime; \ + c->boot_services = table->boottime; \ + c->text_output = table->con_out; \ } BOOT_SERVICES(32); BOOT_SERVICES(64); -static inline efi_status_t __open_volume32(void *__image, void **__fh) -{ - efi_file_io_interface_t *io; - efi_loaded_image_32_t *image = __image; - efi_file_handle_32_t *fh; - efi_guid_t fs_proto = EFI_FILE_SYSTEM_GUID; - efi_status_t status; - void *handle = (void *)(unsigned long)image->device_handle; - unsigned long func; - - status = efi_call_early(handle_protocol, handle, - &fs_proto, (void **)&io); - if (status != EFI_SUCCESS) { - efi_printk(sys_table, "Failed to handle fs_proto\n"); - return status; - } - - func = (unsigned long)io->open_volume; - status = efi_early->call(func, io, &fh); - if (status != EFI_SUCCESS) - efi_printk(sys_table, "Failed to open volume\n"); - - *__fh = fh; - return status; -} - -static inline efi_status_t __open_volume64(void *__image, void **__fh) -{ - efi_file_io_interface_t *io; - efi_loaded_image_64_t *image = __image; - efi_file_handle_64_t *fh; - efi_guid_t fs_proto = EFI_FILE_SYSTEM_GUID; - efi_status_t status; - void *handle = (void *)(unsigned long)image->device_handle; - unsigned long func; - - status = efi_call_early(handle_protocol, handle, - &fs_proto, (void **)&io); - if (status != EFI_SUCCESS) { - efi_printk(sys_table, "Failed to handle fs_proto\n"); - return status; - } - - func = (unsigned long)io->open_volume; - status = efi_early->call(func, io, &fh); - if (status != EFI_SUCCESS) - efi_printk(sys_table, "Failed to open volume\n"); - - *__fh = fh; - return status; -} - -efi_status_t -efi_open_volume(efi_system_table_t *sys_table, void *__image, void **__fh) -{ - if (efi_early->is64) - return __open_volume64(__image, __fh); - - return __open_volume32(__image, __fh); -} - void efi_char16_printk(efi_system_table_t *table, efi_char16_t *str) { efi_call_proto(efi_simple_text_output_protocol, output_string, @@ -109,7 +48,7 @@ void efi_char16_printk(efi_system_table_t *table, efi_char16_t *str) } static efi_status_t -__setup_efi_pci(efi_pci_io_protocol_t *pci, struct pci_setup_rom **__rom) +preserve_pci_rom_image(efi_pci_io_protocol_t *pci, struct pci_setup_rom **__rom) { struct pci_setup_rom *rom = NULL; efi_status_t status; @@ -134,16 +73,16 @@ __setup_efi_pci(efi_pci_io_protocol_t *pci, struct pci_setup_rom **__rom) status = efi_call_early(allocate_pool, EFI_LOADER_DATA, size, &rom); if (status != EFI_SUCCESS) { - efi_printk(sys_table, "Failed to alloc mem for rom\n"); + efi_printk(sys_table, "Failed to allocate memory for 'rom'\n"); return status; } memset(rom, 0, sizeof(*rom)); - rom->data.type = SETUP_PCI; - rom->data.len = size - sizeof(struct setup_data); - rom->data.next = 0; - rom->pcilen = pci->romsize; + rom->data.type = SETUP_PCI; + rom->data.len = size - sizeof(struct setup_data); + rom->data.next = 0; + rom->pcilen = pci->romsize; *__rom = rom; status = efi_call_proto(efi_pci_io_protocol, pci.read, pci, @@ -179,96 +118,6 @@ free_struct: return status; } -static void -setup_efi_pci32(struct boot_params *params, void **pci_handle, - unsigned long size) -{ - efi_pci_io_protocol_t *pci = NULL; - efi_guid_t pci_proto = EFI_PCI_IO_PROTOCOL_GUID; - u32 *handles = (u32 *)(unsigned long)pci_handle; - efi_status_t status; - unsigned long nr_pci; - struct setup_data *data; - int i; - - data = (struct setup_data *)(unsigned long)params->hdr.setup_data; - - while (data && data->next) - data = (struct setup_data *)(unsigned long)data->next; - - nr_pci = size / sizeof(u32); - for (i = 0; i < nr_pci; i++) { - struct pci_setup_rom *rom = NULL; - u32 h = handles[i]; - - status = efi_call_early(handle_protocol, h, - &pci_proto, (void **)&pci); - - if (status != EFI_SUCCESS) - continue; - - if (!pci) - continue; - - status = __setup_efi_pci(pci, &rom); - if (status != EFI_SUCCESS) - continue; - - if (data) - data->next = (unsigned long)rom; - else - params->hdr.setup_data = (unsigned long)rom; - - data = (struct setup_data *)rom; - - } -} - -static void -setup_efi_pci64(struct boot_params *params, void **pci_handle, - unsigned long size) -{ - efi_pci_io_protocol_t *pci = NULL; - efi_guid_t pci_proto = EFI_PCI_IO_PROTOCOL_GUID; - u64 *handles = (u64 *)(unsigned long)pci_handle; - efi_status_t status; - unsigned long nr_pci; - struct setup_data *data; - int i; - - data = (struct setup_data *)(unsigned long)params->hdr.setup_data; - - while (data && data->next) - data = (struct setup_data *)(unsigned long)data->next; - - nr_pci = size / sizeof(u64); - for (i = 0; i < nr_pci; i++) { - struct pci_setup_rom *rom = NULL; - u64 h = handles[i]; - - status = efi_call_early(handle_protocol, h, - &pci_proto, (void **)&pci); - - if (status != EFI_SUCCESS) - continue; - - if (!pci) - continue; - - status = __setup_efi_pci(pci, &rom); - if (status != EFI_SUCCESS) - continue; - - if (data) - data->next = (unsigned long)rom; - else - params->hdr.setup_data = (unsigned long)rom; - - data = (struct setup_data *)rom; - - } -} - /* * There's no way to return an informative status from this function, * because any analysis (and printing of error messages) needs to be @@ -284,6 +133,9 @@ static void setup_efi_pci(struct boot_params *params) void **pci_handle = NULL; efi_guid_t pci_proto = EFI_PCI_IO_PROTOCOL_GUID; unsigned long size = 0; + unsigned long nr_pci; + struct setup_data *data; + int i; status = efi_call_early(locate_handle, EFI_LOCATE_BY_PROTOCOL, @@ -295,7 +147,7 @@ static void setup_efi_pci(struct boot_params *params) size, (void **)&pci_handle); if (status != EFI_SUCCESS) { - efi_printk(sys_table, "Failed to alloc mem for pci_handle\n"); + efi_printk(sys_table, "Failed to allocate memory for 'pci_handle'\n"); return; } @@ -307,10 +159,34 @@ static void setup_efi_pci(struct boot_params *params) if (status != EFI_SUCCESS) goto free_handle; - if (efi_early->is64) - setup_efi_pci64(params, pci_handle, size); - else - setup_efi_pci32(params, pci_handle, size); + data = (struct setup_data *)(unsigned long)params->hdr.setup_data; + + while (data && data->next) + data = (struct setup_data *)(unsigned long)data->next; + + nr_pci = size / (efi_is_64bit() ? sizeof(u64) : sizeof(u32)); + for (i = 0; i < nr_pci; i++) { + efi_pci_io_protocol_t *pci = NULL; + struct pci_setup_rom *rom; + + status = efi_call_early(handle_protocol, + efi_is_64bit() ? ((u64 *)pci_handle)[i] + : ((u32 *)pci_handle)[i], + &pci_proto, (void **)&pci); + if (status != EFI_SUCCESS || !pci) + continue; + + status = preserve_pci_rom_image(pci, &rom); + if (status != EFI_SUCCESS) + continue; + + if (data) + data->next = (unsigned long)rom; + else + params->hdr.setup_data = (unsigned long)rom; + + data = (struct setup_data *)rom; + } free_handle: efi_call_early(free_pool, pci_handle); @@ -341,8 +217,7 @@ static void retrieve_apple_device_properties(struct boot_params *boot_params) status = efi_call_early(allocate_pool, EFI_LOADER_DATA, size + sizeof(struct setup_data), &new); if (status != EFI_SUCCESS) { - efi_printk(sys_table, - "Failed to alloc mem for properties\n"); + efi_printk(sys_table, "Failed to allocate memory for 'properties'\n"); return; } @@ -358,9 +233,9 @@ static void retrieve_apple_device_properties(struct boot_params *boot_params) new->next = 0; data = (struct setup_data *)(unsigned long)boot_params->hdr.setup_data; - if (!data) + if (!data) { boot_params->hdr.setup_data = (unsigned long)new; - else { + } else { while (data->next) data = (struct setup_data *)(unsigned long)data->next; data->next = (unsigned long)new; @@ -380,81 +255,55 @@ static void setup_quirks(struct boot_params *boot_params) } } +/* + * See if we have Universal Graphics Adapter (UGA) protocol + */ static efi_status_t -setup_uga32(void **uga_handle, unsigned long size, u32 *width, u32 *height) +setup_uga(struct screen_info *si, efi_guid_t *uga_proto, unsigned long size) { - struct efi_uga_draw_protocol *uga = NULL, *first_uga; - efi_guid_t uga_proto = EFI_UGA_PROTOCOL_GUID; + efi_status_t status; + u32 width, height; + void **uga_handle = NULL; + efi_uga_draw_protocol_t *uga = NULL, *first_uga; unsigned long nr_ugas; - u32 *handles = (u32 *)uga_handle; - efi_status_t status = EFI_INVALID_PARAMETER; int i; - first_uga = NULL; - nr_ugas = size / sizeof(u32); - for (i = 0; i < nr_ugas; i++) { - efi_guid_t pciio_proto = EFI_PCI_IO_PROTOCOL_GUID; - u32 w, h, depth, refresh; - void *pciio; - u32 handle = handles[i]; - - status = efi_call_early(handle_protocol, handle, - &uga_proto, (void **)&uga); - if (status != EFI_SUCCESS) - continue; - - efi_call_early(handle_protocol, handle, &pciio_proto, &pciio); - - status = efi_early->call((unsigned long)uga->get_mode, uga, - &w, &h, &depth, &refresh); - if (status == EFI_SUCCESS && (!first_uga || pciio)) { - *width = w; - *height = h; - - /* - * Once we've found a UGA supporting PCIIO, - * don't bother looking any further. - */ - if (pciio) - break; - - first_uga = uga; - } - } + status = efi_call_early(allocate_pool, EFI_LOADER_DATA, + size, (void **)&uga_handle); + if (status != EFI_SUCCESS) + return status; - return status; -} + status = efi_call_early(locate_handle, + EFI_LOCATE_BY_PROTOCOL, + uga_proto, NULL, &size, uga_handle); + if (status != EFI_SUCCESS) + goto free_handle; -static efi_status_t -setup_uga64(void **uga_handle, unsigned long size, u32 *width, u32 *height) -{ - struct efi_uga_draw_protocol *uga = NULL, *first_uga; - efi_guid_t uga_proto = EFI_UGA_PROTOCOL_GUID; - unsigned long nr_ugas; - u64 *handles = (u64 *)uga_handle; - efi_status_t status = EFI_INVALID_PARAMETER; - int i; + height = 0; + width = 0; first_uga = NULL; - nr_ugas = size / sizeof(u64); + nr_ugas = size / (efi_is_64bit() ? sizeof(u64) : sizeof(u32)); for (i = 0; i < nr_ugas; i++) { efi_guid_t pciio_proto = EFI_PCI_IO_PROTOCOL_GUID; u32 w, h, depth, refresh; void *pciio; - u64 handle = handles[i]; + unsigned long handle = efi_is_64bit() ? ((u64 *)uga_handle)[i] + : ((u32 *)uga_handle)[i]; status = efi_call_early(handle_protocol, handle, - &uga_proto, (void **)&uga); + uga_proto, (void **)&uga); if (status != EFI_SUCCESS) continue; + pciio = NULL; efi_call_early(handle_protocol, handle, &pciio_proto, &pciio); - status = efi_early->call((unsigned long)uga->get_mode, uga, - &w, &h, &depth, &refresh); + status = efi_call_proto(efi_uga_draw_protocol, get_mode, uga, + &w, &h, &depth, &refresh); if (status == EFI_SUCCESS && (!first_uga || pciio)) { - *width = w; - *height = h; + width = w; + height = h; /* * Once we've found a UGA supporting PCIIO, @@ -467,59 +316,28 @@ setup_uga64(void **uga_handle, unsigned long size, u32 *width, u32 *height) } } - return status; -} - -/* - * See if we have Universal Graphics Adapter (UGA) protocol - */ -static efi_status_t setup_uga(struct screen_info *si, efi_guid_t *uga_proto, - unsigned long size) -{ - efi_status_t status; - u32 width, height; - void **uga_handle = NULL; - - status = efi_call_early(allocate_pool, EFI_LOADER_DATA, - size, (void **)&uga_handle); - if (status != EFI_SUCCESS) - return status; - - status = efi_call_early(locate_handle, - EFI_LOCATE_BY_PROTOCOL, - uga_proto, NULL, &size, uga_handle); - if (status != EFI_SUCCESS) - goto free_handle; - - height = 0; - width = 0; - - if (efi_early->is64) - status = setup_uga64(uga_handle, size, &width, &height); - else - status = setup_uga32(uga_handle, size, &width, &height); - if (!width && !height) goto free_handle; /* EFI framebuffer */ - si->orig_video_isVGA = VIDEO_TYPE_EFI; + si->orig_video_isVGA = VIDEO_TYPE_EFI; - si->lfb_depth = 32; - si->lfb_width = width; - si->lfb_height = height; + si->lfb_depth = 32; + si->lfb_width = width; + si->lfb_height = height; - si->red_size = 8; - si->red_pos = 16; - si->green_size = 8; - si->green_pos = 8; - si->blue_size = 8; - si->blue_pos = 0; - si->rsvd_size = 8; - si->rsvd_pos = 24; + si->red_size = 8; + si->red_pos = 16; + si->green_size = 8; + si->green_pos = 8; + si->blue_size = 8; + si->blue_pos = 0; + si->rsvd_size = 8; + si->rsvd_pos = 24; free_handle: efi_call_early(free_pool, uga_handle); + return status; } @@ -586,7 +404,7 @@ struct boot_params *make_boot_params(struct efi_config *c) if (sys_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) return NULL; - if (efi_early->is64) + if (efi_is_64bit()) setup_boot_services64(efi_early); else setup_boot_services32(efi_early); @@ -601,7 +419,7 @@ struct boot_params *make_boot_params(struct efi_config *c) status = efi_low_alloc(sys_table, 0x4000, 1, (unsigned long *)&boot_params); if (status != EFI_SUCCESS) { - efi_printk(sys_table, "Failed to alloc lowmem for boot params\n"); + efi_printk(sys_table, "Failed to allocate lowmem for boot params\n"); return NULL; } @@ -617,9 +435,9 @@ struct boot_params *make_boot_params(struct efi_config *c) * Fill out some of the header fields ourselves because the * EFI firmware loader doesn't load the first sector. */ - hdr->root_flags = 1; - hdr->vid_mode = 0xffff; - hdr->boot_flag = 0xAA55; + hdr->root_flags = 1; + hdr->vid_mode = 0xffff; + hdr->boot_flag = 0xAA55; hdr->type_of_loader = 0x21; @@ -627,6 +445,7 @@ struct boot_params *make_boot_params(struct efi_config *c) cmdline_ptr = efi_convert_cmdline(sys_table, image, &options_size); if (!cmdline_ptr) goto fail; + hdr->cmd_line_ptr = (unsigned long)cmdline_ptr; /* Fill in upper bits of command line address, NOP on 32 bit */ boot_params->ext_cmd_line_ptr = (u64)(unsigned long)cmdline_ptr >> 32; @@ -663,10 +482,12 @@ struct boot_params *make_boot_params(struct efi_config *c) boot_params->ext_ramdisk_size = (u64)ramdisk_size >> 32; return boot_params; + fail2: efi_free(sys_table, options_size, hdr->cmd_line_ptr); fail: efi_free(sys_table, 0x4000, (unsigned long)boot_params); + return NULL; } @@ -678,7 +499,7 @@ static void add_e820ext(struct boot_params *params, unsigned long size; e820ext->type = SETUP_E820_EXT; - e820ext->len = nr_entries * sizeof(struct boot_e820_entry); + e820ext->len = nr_entries * sizeof(struct boot_e820_entry); e820ext->next = 0; data = (struct setup_data *)(unsigned long)params->hdr.setup_data; @@ -692,8 +513,8 @@ static void add_e820ext(struct boot_params *params, params->hdr.setup_data = (unsigned long)e820ext; } -static efi_status_t setup_e820(struct boot_params *params, - struct setup_data *e820ext, u32 e820ext_size) +static efi_status_t +setup_e820(struct boot_params *params, struct setup_data *e820ext, u32 e820ext_size) { struct boot_e820_entry *entry = params->e820_table; struct efi_info *efi = ¶ms->efi_info; @@ -814,11 +635,10 @@ static efi_status_t alloc_e820ext(u32 nr_desc, struct setup_data **e820ext, } struct exit_boot_struct { - struct boot_params *boot_params; - struct efi_info *efi; - struct setup_data *e820ext; - __u32 e820ext_size; - bool is64; + struct boot_params *boot_params; + struct efi_info *efi; + struct setup_data *e820ext; + __u32 e820ext_size; }; static efi_status_t exit_boot_func(efi_system_table_t *sys_table_arg, @@ -845,25 +665,25 @@ static efi_status_t exit_boot_func(efi_system_table_t *sys_table_arg, first = false; } - signature = p->is64 ? EFI64_LOADER_SIGNATURE : EFI32_LOADER_SIGNATURE; + signature = efi_is_64bit() ? EFI64_LOADER_SIGNATURE + : EFI32_LOADER_SIGNATURE; memcpy(&p->efi->efi_loader_signature, signature, sizeof(__u32)); - p->efi->efi_systab = (unsigned long)sys_table_arg; - p->efi->efi_memdesc_size = *map->desc_size; - p->efi->efi_memdesc_version = *map->desc_ver; - p->efi->efi_memmap = (unsigned long)*map->map; - p->efi->efi_memmap_size = *map->map_size; + p->efi->efi_systab = (unsigned long)sys_table_arg; + p->efi->efi_memdesc_size = *map->desc_size; + p->efi->efi_memdesc_version = *map->desc_ver; + p->efi->efi_memmap = (unsigned long)*map->map; + p->efi->efi_memmap_size = *map->map_size; #ifdef CONFIG_X86_64 - p->efi->efi_systab_hi = (unsigned long)sys_table_arg >> 32; - p->efi->efi_memmap_hi = (unsigned long)*map->map >> 32; + p->efi->efi_systab_hi = (unsigned long)sys_table_arg >> 32; + p->efi->efi_memmap_hi = (unsigned long)*map->map >> 32; #endif return EFI_SUCCESS; } -static efi_status_t exit_boot(struct boot_params *boot_params, - void *handle, bool is64) +static efi_status_t exit_boot(struct boot_params *boot_params, void *handle) { unsigned long map_sz, key, desc_size, buff_size; efi_memory_desc_t *mem_map; @@ -874,17 +694,16 @@ static efi_status_t exit_boot(struct boot_params *boot_params, struct efi_boot_memmap map; struct exit_boot_struct priv; - map.map = &mem_map; - map.map_size = &map_sz; - map.desc_size = &desc_size; - map.desc_ver = &desc_version; - map.key_ptr = &key; - map.buff_size = &buff_size; - priv.boot_params = boot_params; - priv.efi = &boot_params->efi_info; - priv.e820ext = NULL; - priv.e820ext_size = 0; - priv.is64 = is64; + map.map = &mem_map; + map.map_size = &map_sz; + map.desc_size = &desc_size; + map.desc_ver = &desc_version; + map.key_ptr = &key; + map.buff_size = &buff_size; + priv.boot_params = boot_params; + priv.efi = &boot_params->efi_info; + priv.e820ext = NULL; + priv.e820ext_size = 0; /* Might as well exit boot services now */ status = efi_exit_boot_services(sys_table, handle, &map, &priv, @@ -892,10 +711,11 @@ static efi_status_t exit_boot(struct boot_params *boot_params, if (status != EFI_SUCCESS) return status; - e820ext = priv.e820ext; - e820ext_size = priv.e820ext_size; + e820ext = priv.e820ext; + e820ext_size = priv.e820ext_size; + /* Historic? */ - boot_params->alt_mem_k = 32 * 1024; + boot_params->alt_mem_k = 32 * 1024; status = setup_e820(boot_params, e820ext, e820ext_size); if (status != EFI_SUCCESS) @@ -908,8 +728,8 @@ static efi_status_t exit_boot(struct boot_params *boot_params, * On success we return a pointer to a boot_params structure, and NULL * on failure. */ -struct boot_params *efi_main(struct efi_config *c, - struct boot_params *boot_params) +struct boot_params * +efi_main(struct efi_config *c, struct boot_params *boot_params) { struct desc_ptr *gdt = NULL; efi_loaded_image_t *image; @@ -918,13 +738,11 @@ struct boot_params *efi_main(struct efi_config *c, struct desc_struct *desc; void *handle; efi_system_table_t *_table; - bool is64; efi_early = c; _table = (efi_system_table_t *)(unsigned long)efi_early->table; handle = (void *)(unsigned long)efi_early->image_handle; - is64 = efi_early->is64; sys_table = _table; @@ -932,7 +750,7 @@ struct boot_params *efi_main(struct efi_config *c, if (sys_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) goto fail; - if (is64) + if (efi_is_64bit()) setup_boot_services64(efi_early); else setup_boot_services32(efi_early); @@ -957,7 +775,7 @@ struct boot_params *efi_main(struct efi_config *c, status = efi_call_early(allocate_pool, EFI_LOADER_DATA, sizeof(*gdt), (void **)&gdt); if (status != EFI_SUCCESS) { - efi_printk(sys_table, "Failed to alloc mem for gdt structure\n"); + efi_printk(sys_table, "Failed to allocate memory for 'gdt' structure\n"); goto fail; } @@ -965,7 +783,7 @@ struct boot_params *efi_main(struct efi_config *c, status = efi_low_alloc(sys_table, gdt->size, 8, (unsigned long *)&gdt->address); if (status != EFI_SUCCESS) { - efi_printk(sys_table, "Failed to alloc mem for gdt\n"); + efi_printk(sys_table, "Failed to allocate memory for 'gdt'\n"); goto fail; } @@ -988,7 +806,7 @@ struct boot_params *efi_main(struct efi_config *c, hdr->code32_start = bzimage_addr; } - status = exit_boot(boot_params, handle, is64); + status = exit_boot(boot_params, handle); if (status != EFI_SUCCESS) { efi_printk(sys_table, "exit_boot() failed!\n"); goto fail; @@ -1002,19 +820,20 @@ struct boot_params *efi_main(struct efi_config *c, if (IS_ENABLED(CONFIG_X86_64)) { /* __KERNEL32_CS */ - desc->limit0 = 0xffff; - desc->base0 = 0x0000; - desc->base1 = 0x0000; - desc->type = SEG_TYPE_CODE | SEG_TYPE_EXEC_READ; - desc->s = DESC_TYPE_CODE_DATA; - desc->dpl = 0; - desc->p = 1; - desc->limit1 = 0xf; - desc->avl = 0; - desc->l = 0; - desc->d = SEG_OP_SIZE_32BIT; - desc->g = SEG_GRANULARITY_4KB; - desc->base2 = 0x00; + desc->limit0 = 0xffff; + desc->base0 = 0x0000; + desc->base1 = 0x0000; + desc->type = SEG_TYPE_CODE | SEG_TYPE_EXEC_READ; + desc->s = DESC_TYPE_CODE_DATA; + desc->dpl = 0; + desc->p = 1; + desc->limit1 = 0xf; + desc->avl = 0; + desc->l = 0; + desc->d = SEG_OP_SIZE_32BIT; + desc->g = SEG_GRANULARITY_4KB; + desc->base2 = 0x00; + desc++; } else { /* Second entry is unused on 32-bit */ @@ -1022,15 +841,16 @@ struct boot_params *efi_main(struct efi_config *c, } /* __KERNEL_CS */ - desc->limit0 = 0xffff; - desc->base0 = 0x0000; - desc->base1 = 0x0000; - desc->type = SEG_TYPE_CODE | SEG_TYPE_EXEC_READ; - desc->s = DESC_TYPE_CODE_DATA; - desc->dpl = 0; - desc->p = 1; - desc->limit1 = 0xf; - desc->avl = 0; + desc->limit0 = 0xffff; + desc->base0 = 0x0000; + desc->base1 = 0x0000; + desc->type = SEG_TYPE_CODE | SEG_TYPE_EXEC_READ; + desc->s = DESC_TYPE_CODE_DATA; + desc->dpl = 0; + desc->p = 1; + desc->limit1 = 0xf; + desc->avl = 0; + if (IS_ENABLED(CONFIG_X86_64)) { desc->l = 1; desc->d = 0; @@ -1038,41 +858,41 @@ struct boot_params *efi_main(struct efi_config *c, desc->l = 0; desc->d = SEG_OP_SIZE_32BIT; } - desc->g = SEG_GRANULARITY_4KB; - desc->base2 = 0x00; + desc->g = SEG_GRANULARITY_4KB; + desc->base2 = 0x00; desc++; /* __KERNEL_DS */ - desc->limit0 = 0xffff; - desc->base0 = 0x0000; - desc->base1 = 0x0000; - desc->type = SEG_TYPE_DATA | SEG_TYPE_READ_WRITE; - desc->s = DESC_TYPE_CODE_DATA; - desc->dpl = 0; - desc->p = 1; - desc->limit1 = 0xf; - desc->avl = 0; - desc->l = 0; - desc->d = SEG_OP_SIZE_32BIT; - desc->g = SEG_GRANULARITY_4KB; - desc->base2 = 0x00; + desc->limit0 = 0xffff; + desc->base0 = 0x0000; + desc->base1 = 0x0000; + desc->type = SEG_TYPE_DATA | SEG_TYPE_READ_WRITE; + desc->s = DESC_TYPE_CODE_DATA; + desc->dpl = 0; + desc->p = 1; + desc->limit1 = 0xf; + desc->avl = 0; + desc->l = 0; + desc->d = SEG_OP_SIZE_32BIT; + desc->g = SEG_GRANULARITY_4KB; + desc->base2 = 0x00; desc++; if (IS_ENABLED(CONFIG_X86_64)) { /* Task segment value */ - desc->limit0 = 0x0000; - desc->base0 = 0x0000; - desc->base1 = 0x0000; - desc->type = SEG_TYPE_TSS; - desc->s = 0; - desc->dpl = 0; - desc->p = 1; - desc->limit1 = 0x0; - desc->avl = 0; - desc->l = 0; - desc->d = 0; - desc->g = SEG_GRANULARITY_4KB; - desc->base2 = 0x00; + desc->limit0 = 0x0000; + desc->base0 = 0x0000; + desc->base1 = 0x0000; + desc->type = SEG_TYPE_TSS; + desc->s = 0; + desc->dpl = 0; + desc->p = 1; + desc->limit1 = 0x0; + desc->avl = 0; + desc->l = 0; + desc->d = 0; + desc->g = SEG_GRANULARITY_4KB; + desc->base2 = 0x00; desc++; } @@ -1082,5 +902,6 @@ struct boot_params *efi_main(struct efi_config *c, return boot_params; fail: efi_printk(sys_table, "efi_main() failed!\n"); + return NULL; } diff --git a/arch/x86/boot/compressed/eboot.h b/arch/x86/boot/compressed/eboot.h index e799dc5c6448..8297387c4676 100644 --- a/arch/x86/boot/compressed/eboot.h +++ b/arch/x86/boot/compressed/eboot.h @@ -12,22 +12,22 @@ #define DESC_TYPE_CODE_DATA (1 << 0) -struct efi_uga_draw_protocol_32 { +typedef struct { u32 get_mode; u32 set_mode; u32 blt; -}; +} efi_uga_draw_protocol_32_t; -struct efi_uga_draw_protocol_64 { +typedef struct { u64 get_mode; u64 set_mode; u64 blt; -}; +} efi_uga_draw_protocol_64_t; -struct efi_uga_draw_protocol { +typedef struct { void *get_mode; void *set_mode; void *blt; -}; +} efi_uga_draw_protocol_t; #endif /* BOOT_COMPRESSED_EBOOT_H */ diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index b87a7582853d..302517929932 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -102,7 +102,7 @@ static bool memmap_too_large; /* Store memory limit specified by "mem=nn[KMG]" or "memmap=nn[KMG]" */ -unsigned long long mem_limit = ULLONG_MAX; +static unsigned long long mem_limit = ULLONG_MAX; enum mem_avoid_index { @@ -215,7 +215,36 @@ static void mem_avoid_memmap(char *str) memmap_too_large = true; } -static int handle_mem_memmap(void) +/* Store the number of 1GB huge pages which users specified: */ +static unsigned long max_gb_huge_pages; + +static void parse_gb_huge_pages(char *param, char *val) +{ + static bool gbpage_sz; + char *p; + + if (!strcmp(param, "hugepagesz")) { + p = val; + if (memparse(p, &p) != PUD_SIZE) { + gbpage_sz = false; + return; + } + + if (gbpage_sz) + warn("Repeatedly set hugeTLB page size of 1G!\n"); + gbpage_sz = true; + return; + } + + if (!strcmp(param, "hugepages") && gbpage_sz) { + p = val; + max_gb_huge_pages = simple_strtoull(p, &p, 0); + return; + } +} + + +static int handle_mem_options(void) { char *args = (char *)get_cmd_line_ptr(); size_t len = strlen((char *)args); @@ -223,7 +252,8 @@ static int handle_mem_memmap(void) char *param, *val; u64 mem_size; - if (!strstr(args, "memmap=") && !strstr(args, "mem=")) + if (!strstr(args, "memmap=") && !strstr(args, "mem=") && + !strstr(args, "hugepages")) return 0; tmp_cmdline = malloc(len + 1); @@ -248,6 +278,8 @@ static int handle_mem_memmap(void) if (!strcmp(param, "memmap")) { mem_avoid_memmap(val); + } else if (strstr(param, "hugepages")) { + parse_gb_huge_pages(param, val); } else if (!strcmp(param, "mem")) { char *p = val; @@ -387,7 +419,7 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size, /* We don't need to set a mapping for setup_data. */ /* Mark the memmap regions we need to avoid */ - handle_mem_memmap(); + handle_mem_options(); #ifdef CONFIG_X86_VERBOSE_BOOTUP /* Make sure video RAM can be used. */ @@ -466,6 +498,60 @@ static void store_slot_info(struct mem_vector *region, unsigned long image_size) } } +/* + * Skip as many 1GB huge pages as possible in the passed region + * according to the number which users specified: + */ +static void +process_gb_huge_pages(struct mem_vector *region, unsigned long image_size) +{ + unsigned long addr, size = 0; + struct mem_vector tmp; + int i = 0; + + if (!max_gb_huge_pages) { + store_slot_info(region, image_size); + return; + } + + addr = ALIGN(region->start, PUD_SIZE); + /* Did we raise the address above the passed in memory entry? */ + if (addr < region->start + region->size) + size = region->size - (addr - region->start); + + /* Check how many 1GB huge pages can be filtered out: */ + while (size > PUD_SIZE && max_gb_huge_pages) { + size -= PUD_SIZE; + max_gb_huge_pages--; + i++; + } + + /* No good 1GB huge pages found: */ + if (!i) { + store_slot_info(region, image_size); + return; + } + + /* + * Skip those 'i'*1GB good huge pages, and continue checking and + * processing the remaining head or tail part of the passed region + * if available. + */ + + if (addr >= region->start + image_size) { + tmp.start = region->start; + tmp.size = addr - region->start; + store_slot_info(&tmp, image_size); + } + + size = region->size - (addr - region->start) - i * PUD_SIZE; + if (size >= image_size) { + tmp.start = addr + i * PUD_SIZE; + tmp.size = size; + store_slot_info(&tmp, image_size); + } +} + static unsigned long slots_fetch_random(void) { unsigned long slot; @@ -546,7 +632,7 @@ static void process_mem_region(struct mem_vector *entry, /* If nothing overlaps, store the region and return. */ if (!mem_avoid_overlap(®ion, &overlap)) { - store_slot_info(®ion, image_size); + process_gb_huge_pages(®ion, image_size); return; } @@ -556,7 +642,7 @@ static void process_mem_region(struct mem_vector *entry, beginning.start = region.start; beginning.size = overlap.start - region.start; - store_slot_info(&beginning, image_size); + process_gb_huge_pages(&beginning, image_size); } /* Return if overlap extends to or past end of region. */ diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c index 8c5107545251..9e2157371491 100644 --- a/arch/x86/boot/compressed/pgtable_64.c +++ b/arch/x86/boot/compressed/pgtable_64.c @@ -1,3 +1,4 @@ +#include <asm/e820/types.h> #include <asm/processor.h> #include "pgtable.h" #include "../string.h" @@ -34,10 +35,62 @@ unsigned long *trampoline_32bit __section(.data); extern struct boot_params *boot_params; int cmdline_find_option_bool(const char *option); +static unsigned long find_trampoline_placement(void) +{ + unsigned long bios_start, ebda_start; + unsigned long trampoline_start; + struct boot_e820_entry *entry; + int i; + + /* + * Find a suitable spot for the trampoline. + * This code is based on reserve_bios_regions(). + */ + + ebda_start = *(unsigned short *)0x40e << 4; + bios_start = *(unsigned short *)0x413 << 10; + + if (bios_start < BIOS_START_MIN || bios_start > BIOS_START_MAX) + bios_start = BIOS_START_MAX; + + if (ebda_start > BIOS_START_MIN && ebda_start < bios_start) + bios_start = ebda_start; + + bios_start = round_down(bios_start, PAGE_SIZE); + + /* Find the first usable memory region under bios_start. */ + for (i = boot_params->e820_entries - 1; i >= 0; i--) { + entry = &boot_params->e820_table[i]; + + /* Skip all entries above bios_start. */ + if (bios_start <= entry->addr) + continue; + + /* Skip non-RAM entries. */ + if (entry->type != E820_TYPE_RAM) + continue; + + /* Adjust bios_start to the end of the entry if needed. */ + if (bios_start > entry->addr + entry->size) + bios_start = entry->addr + entry->size; + + /* Keep bios_start page-aligned. */ + bios_start = round_down(bios_start, PAGE_SIZE); + + /* Skip the entry if it's too small. */ + if (bios_start - TRAMPOLINE_32BIT_SIZE < entry->addr) + continue; + + break; + } + + /* Place the trampoline just below the end of low memory */ + return bios_start - TRAMPOLINE_32BIT_SIZE; +} + struct paging_config paging_prepare(void *rmode) { struct paging_config paging_config = {}; - unsigned long bios_start, ebda_start; /* Initialize boot_params. Required for cmdline_find_option_bool(). */ boot_params = rmode; @@ -61,23 +114,7 @@ struct paging_config paging_prepare(void *rmode) paging_config.l5_required = 1; } - /* - * Find a suitable spot for the trampoline. - * This code is based on reserve_bios_regions(). - */ - - ebda_start = *(unsigned short *)0x40e << 4; - bios_start = *(unsigned short *)0x413 << 10; - - if (bios_start < BIOS_START_MIN || bios_start > BIOS_START_MAX) - bios_start = BIOS_START_MAX; - - if (ebda_start > BIOS_START_MIN && ebda_start < bios_start) - bios_start = ebda_start; - - /* Place the trampoline just below the end of low memory, aligned to 4k */ - paging_config.trampoline_start = bios_start - TRAMPOLINE_32BIT_SIZE; - paging_config.trampoline_start = round_down(paging_config.trampoline_start, PAGE_SIZE); + paging_config.trampoline_start = find_trampoline_placement(); trampoline_32bit = (unsigned long *)paging_config.trampoline_start; diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c index 16f49123d747..c4428a176973 100644 --- a/arch/x86/boot/string.c +++ b/arch/x86/boot/string.c @@ -13,6 +13,7 @@ */ #include <linux/types.h> +#include <asm/asm.h> #include "ctype.h" #include "string.h" @@ -28,8 +29,8 @@ int memcmp(const void *s1, const void *s2, size_t len) { bool diff; - asm("repe; cmpsb; setnz %0" - : "=qm" (diff), "+D" (s1), "+S" (s2), "+c" (len)); + asm("repe; cmpsb" CC_SET(nz) + : CC_OUT(nz) (diff), "+D" (s1), "+S" (s2), "+c" (len)); return diff; } diff --git a/arch/x86/crypto/aegis128-aesni-asm.S b/arch/x86/crypto/aegis128-aesni-asm.S index 717bf0776421..5f7e43d4f64a 100644 --- a/arch/x86/crypto/aegis128-aesni-asm.S +++ b/arch/x86/crypto/aegis128-aesni-asm.S @@ -75,7 +75,7 @@ * %r9 */ __load_partial: - xor %r9, %r9 + xor %r9d, %r9d pxor MSG, MSG mov LEN, %r8 diff --git a/arch/x86/crypto/aegis128-aesni-glue.c b/arch/x86/crypto/aegis128-aesni-glue.c index 5de7c0d46edf..acd11b3bf639 100644 --- a/arch/x86/crypto/aegis128-aesni-glue.c +++ b/arch/x86/crypto/aegis128-aesni-glue.c @@ -375,16 +375,12 @@ static struct aead_alg crypto_aegis128_aesni_alg[] = { } }; -static const struct x86_cpu_id aesni_cpu_id[] = { - X86_FEATURE_MATCH(X86_FEATURE_AES), - X86_FEATURE_MATCH(X86_FEATURE_XMM2), - {} -}; -MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id); - static int __init crypto_aegis128_aesni_module_init(void) { - if (!x86_match_cpu(aesni_cpu_id)) + if (!boot_cpu_has(X86_FEATURE_XMM2) || + !boot_cpu_has(X86_FEATURE_AES) || + !boot_cpu_has(X86_FEATURE_OSXSAVE) || + !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) return -ENODEV; return crypto_register_aeads(crypto_aegis128_aesni_alg, diff --git a/arch/x86/crypto/aegis128l-aesni-asm.S b/arch/x86/crypto/aegis128l-aesni-asm.S index 4eda2b8db9e1..491dd61c845c 100644 --- a/arch/x86/crypto/aegis128l-aesni-asm.S +++ b/arch/x86/crypto/aegis128l-aesni-asm.S @@ -66,7 +66,7 @@ * %r9 */ __load_partial: - xor %r9, %r9 + xor %r9d, %r9d pxor MSG0, MSG0 pxor MSG1, MSG1 diff --git a/arch/x86/crypto/aegis128l-aesni-glue.c b/arch/x86/crypto/aegis128l-aesni-glue.c index 876e4866e633..2071c3d1ae07 100644 --- a/arch/x86/crypto/aegis128l-aesni-glue.c +++ b/arch/x86/crypto/aegis128l-aesni-glue.c @@ -375,16 +375,12 @@ static struct aead_alg crypto_aegis128l_aesni_alg[] = { } }; -static const struct x86_cpu_id aesni_cpu_id[] = { - X86_FEATURE_MATCH(X86_FEATURE_AES), - X86_FEATURE_MATCH(X86_FEATURE_XMM2), - {} -}; -MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id); - static int __init crypto_aegis128l_aesni_module_init(void) { - if (!x86_match_cpu(aesni_cpu_id)) + if (!boot_cpu_has(X86_FEATURE_XMM2) || + !boot_cpu_has(X86_FEATURE_AES) || + !boot_cpu_has(X86_FEATURE_OSXSAVE) || + !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) return -ENODEV; return crypto_register_aeads(crypto_aegis128l_aesni_alg, diff --git a/arch/x86/crypto/aegis256-aesni-asm.S b/arch/x86/crypto/aegis256-aesni-asm.S index 32aae8397268..8870c7c5d9a4 100644 --- a/arch/x86/crypto/aegis256-aesni-asm.S +++ b/arch/x86/crypto/aegis256-aesni-asm.S @@ -59,7 +59,7 @@ * %r9 */ __load_partial: - xor %r9, %r9 + xor %r9d, %r9d pxor MSG, MSG mov LEN, %r8 diff --git a/arch/x86/crypto/aegis256-aesni-glue.c b/arch/x86/crypto/aegis256-aesni-glue.c index 2b5dd3af8f4d..b5f2a8fd5a71 100644 --- a/arch/x86/crypto/aegis256-aesni-glue.c +++ b/arch/x86/crypto/aegis256-aesni-glue.c @@ -375,16 +375,12 @@ static struct aead_alg crypto_aegis256_aesni_alg[] = { } }; -static const struct x86_cpu_id aesni_cpu_id[] = { - X86_FEATURE_MATCH(X86_FEATURE_AES), - X86_FEATURE_MATCH(X86_FEATURE_XMM2), - {} -}; -MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id); - static int __init crypto_aegis256_aesni_module_init(void) { - if (!x86_match_cpu(aesni_cpu_id)) + if (!boot_cpu_has(X86_FEATURE_XMM2) || + !boot_cpu_has(X86_FEATURE_AES) || + !boot_cpu_has(X86_FEATURE_OSXSAVE) || + !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) return -ENODEV; return crypto_register_aeads(crypto_aegis256_aesni_alg, diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index e762ef417562..9bd139569b41 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -258,7 +258,7 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff .macro GCM_INIT Iv SUBKEY AAD AADLEN mov \AADLEN, %r11 mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length - xor %r11, %r11 + xor %r11d, %r11d mov %r11, InLen(%arg2) # ctx_data.in_length = 0 mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0 mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0 @@ -286,7 +286,7 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff movdqu HashKey(%arg2), %xmm13 add %arg5, InLen(%arg2) - xor %r11, %r11 # initialise the data pointer offset as zero + xor %r11d, %r11d # initialise the data pointer offset as zero PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation sub %r11, %arg5 # sub partial block data used @@ -702,7 +702,7 @@ _no_extra_mask_1_\@: # GHASH computation for the last <16 Byte block GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 - xor %rax,%rax + xor %eax, %eax mov %rax, PBlockLen(%arg2) jmp _dec_done_\@ @@ -737,7 +737,7 @@ _no_extra_mask_2_\@: # GHASH computation for the last <16 Byte block GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 - xor %rax,%rax + xor %eax, %eax mov %rax, PBlockLen(%arg2) jmp _encode_done_\@ diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S index faecb1518bf8..1985ea0b551b 100644 --- a/arch/x86/crypto/aesni-intel_avx-x86_64.S +++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S @@ -463,7 +463,7 @@ _get_AAD_rest_final\@: _get_AAD_done\@: # initialize the data pointer offset as zero - xor %r11, %r11 + xor %r11d, %r11d # start AES for num_initial_blocks blocks mov arg5, %rax # rax = *Y0 @@ -1770,7 +1770,7 @@ _get_AAD_rest_final\@: _get_AAD_done\@: # initialize the data pointer offset as zero - xor %r11, %r11 + xor %r11d, %r11d # start AES for num_initial_blocks blocks mov arg5, %rax # rax = *Y0 diff --git a/arch/x86/crypto/morus1280-avx2-asm.S b/arch/x86/crypto/morus1280-avx2-asm.S index 07653d4582a6..de182c460f82 100644 --- a/arch/x86/crypto/morus1280-avx2-asm.S +++ b/arch/x86/crypto/morus1280-avx2-asm.S @@ -113,7 +113,7 @@ ENDPROC(__morus1280_update_zero) * %r9 */ __load_partial: - xor %r9, %r9 + xor %r9d, %r9d vpxor MSG, MSG, MSG mov %rcx, %r8 diff --git a/arch/x86/crypto/morus1280-avx2-glue.c b/arch/x86/crypto/morus1280-avx2-glue.c index f111f36d26dc..6634907d6ccd 100644 --- a/arch/x86/crypto/morus1280-avx2-glue.c +++ b/arch/x86/crypto/morus1280-avx2-glue.c @@ -37,15 +37,11 @@ asmlinkage void crypto_morus1280_avx2_final(void *state, void *tag_xor, MORUS1280_DECLARE_ALGS(avx2, "morus1280-avx2", 400); -static const struct x86_cpu_id avx2_cpu_id[] = { - X86_FEATURE_MATCH(X86_FEATURE_AVX2), - {} -}; -MODULE_DEVICE_TABLE(x86cpu, avx2_cpu_id); - static int __init crypto_morus1280_avx2_module_init(void) { - if (!x86_match_cpu(avx2_cpu_id)) + if (!boot_cpu_has(X86_FEATURE_AVX2) || + !boot_cpu_has(X86_FEATURE_OSXSAVE) || + !cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) return -ENODEV; return crypto_register_aeads(crypto_morus1280_avx2_algs, diff --git a/arch/x86/crypto/morus1280-sse2-asm.S b/arch/x86/crypto/morus1280-sse2-asm.S index bd1aa1b60869..da5d2905db60 100644 --- a/arch/x86/crypto/morus1280-sse2-asm.S +++ b/arch/x86/crypto/morus1280-sse2-asm.S @@ -235,7 +235,7 @@ ENDPROC(__morus1280_update_zero) * %r9 */ __load_partial: - xor %r9, %r9 + xor %r9d, %r9d pxor MSG_LO, MSG_LO pxor MSG_HI, MSG_HI diff --git a/arch/x86/crypto/morus1280-sse2-glue.c b/arch/x86/crypto/morus1280-sse2-glue.c index 839270aa713c..95cf857d2cbb 100644 --- a/arch/x86/crypto/morus1280-sse2-glue.c +++ b/arch/x86/crypto/morus1280-sse2-glue.c @@ -37,15 +37,11 @@ asmlinkage void crypto_morus1280_sse2_final(void *state, void *tag_xor, MORUS1280_DECLARE_ALGS(sse2, "morus1280-sse2", 350); -static const struct x86_cpu_id sse2_cpu_id[] = { - X86_FEATURE_MATCH(X86_FEATURE_XMM2), - {} -}; -MODULE_DEVICE_TABLE(x86cpu, sse2_cpu_id); - static int __init crypto_morus1280_sse2_module_init(void) { - if (!x86_match_cpu(sse2_cpu_id)) + if (!boot_cpu_has(X86_FEATURE_XMM2) || + !boot_cpu_has(X86_FEATURE_OSXSAVE) || + !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) return -ENODEV; return crypto_register_aeads(crypto_morus1280_sse2_algs, diff --git a/arch/x86/crypto/morus640-sse2-asm.S b/arch/x86/crypto/morus640-sse2-asm.S index efa02816d921..414db480250e 100644 --- a/arch/x86/crypto/morus640-sse2-asm.S +++ b/arch/x86/crypto/morus640-sse2-asm.S @@ -113,7 +113,7 @@ ENDPROC(__morus640_update_zero) * %r9 */ __load_partial: - xor %r9, %r9 + xor %r9d, %r9d pxor MSG, MSG mov %rcx, %r8 diff --git a/arch/x86/crypto/morus640-sse2-glue.c b/arch/x86/crypto/morus640-sse2-glue.c index 26b47e2db8d2..615fb7bc9a32 100644 --- a/arch/x86/crypto/morus640-sse2-glue.c +++ b/arch/x86/crypto/morus640-sse2-glue.c @@ -37,15 +37,11 @@ asmlinkage void crypto_morus640_sse2_final(void *state, void *tag_xor, MORUS640_DECLARE_ALGS(sse2, "morus640-sse2", 400); -static const struct x86_cpu_id sse2_cpu_id[] = { - X86_FEATURE_MATCH(X86_FEATURE_XMM2), - {} -}; -MODULE_DEVICE_TABLE(x86cpu, sse2_cpu_id); - static int __init crypto_morus640_sse2_module_init(void) { - if (!x86_match_cpu(sse2_cpu_id)) + if (!boot_cpu_has(X86_FEATURE_XMM2) || + !boot_cpu_has(X86_FEATURE_OSXSAVE) || + !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) return -ENODEV; return crypto_register_aeads(crypto_morus640_sse2_algs, diff --git a/arch/x86/crypto/sha1_ssse3_asm.S b/arch/x86/crypto/sha1_ssse3_asm.S index 6204bd53528c..613d0bfc3d84 100644 --- a/arch/x86/crypto/sha1_ssse3_asm.S +++ b/arch/x86/crypto/sha1_ssse3_asm.S @@ -96,7 +96,7 @@ # cleanup workspace mov $8, %ecx mov %rsp, %rdi - xor %rax, %rax + xor %eax, %eax rep stosq mov %rbp, %rsp # deallocate workspace diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 73a522d53b53..957dfb693ecc 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -92,7 +92,7 @@ END(native_usergs_sysret64) .endm .macro TRACE_IRQS_IRETQ_DEBUG - bt $9, EFLAGS(%rsp) /* interrupts off? */ + btl $9, EFLAGS(%rsp) /* interrupts off? */ jnc 1f TRACE_IRQS_ON_DEBUG 1: @@ -408,6 +408,7 @@ ENTRY(ret_from_fork) 1: /* kernel thread */ + UNWIND_HINT_EMPTY movq %r12, %rdi CALL_NOSPEC %rbx /* @@ -701,7 +702,7 @@ retint_kernel: #ifdef CONFIG_PREEMPT /* Interrupts are off */ /* Check if we need preemption */ - bt $9, EFLAGS(%rsp) /* were interrupts off? */ + btl $9, EFLAGS(%rsp) /* were interrupts off? */ jnc 1f 0: cmpl $0, PER_CPU_VAR(__preempt_count) jnz 1f @@ -981,7 +982,7 @@ ENTRY(\sym) call \do_sym - jmp error_exit /* %ebx: no swapgs flag */ + jmp error_exit .endif END(\sym) .endm @@ -1222,7 +1223,6 @@ END(paranoid_exit) /* * Save all registers in pt_regs, and switch GS if needed. - * Return: EBX=0: came from user mode; EBX=1: otherwise */ ENTRY(error_entry) UNWIND_HINT_FUNC @@ -1269,7 +1269,6 @@ ENTRY(error_entry) * for these here too. */ .Lerror_kernelspace: - incl %ebx leaq native_irq_return_iret(%rip), %rcx cmpq %rcx, RIP+8(%rsp) je .Lerror_bad_iret @@ -1303,28 +1302,20 @@ ENTRY(error_entry) /* * Pretend that the exception came from user mode: set up pt_regs - * as if we faulted immediately after IRET and clear EBX so that - * error_exit knows that we will be returning to user mode. + * as if we faulted immediately after IRET. */ mov %rsp, %rdi call fixup_bad_iret mov %rax, %rsp - decl %ebx jmp .Lerror_entry_from_usermode_after_swapgs END(error_entry) - -/* - * On entry, EBX is a "return to kernel mode" flag: - * 1: already in kernel mode, don't need SWAPGS - * 0: user gsbase is loaded, we need SWAPGS and standard preparation for return to usermode - */ ENTRY(error_exit) UNWIND_HINT_REGS DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF - testl %ebx, %ebx - jnz retint_kernel + testb $3, CS(%rsp) + jz retint_kernel jmp retint_user END(error_exit) diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index 261802b1cc50..b9ed1aa53a26 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -58,9 +58,7 @@ HOST_EXTRACFLAGS += -I$(srctree)/tools/include -I$(srctree)/include/uapi -I$(src hostprogs-y += vdso2c quiet_cmd_vdso2c = VDSO2C $@ -define cmd_vdso2c - $(obj)/vdso2c $< $(<:%.dbg=%) $@ -endef + cmd_vdso2c = $(obj)/vdso2c $< $(<:%.dbg=%) $@ $(obj)/vdso-image-%.c: $(obj)/vdso%.so.dbg $(obj)/vdso%.so $(obj)/vdso2c FORCE $(call if_changed,vdso2c) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 4b98101209a1..d50bb4dc0650 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -579,7 +579,7 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) { struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); struct perf_event *event = pcpu->event; - struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event *hwc; struct perf_sample_data data; struct perf_raw_record raw; struct pt_regs regs; @@ -602,6 +602,10 @@ fail: return 0; } + if (WARN_ON_ONCE(!event)) + goto fail; + + hwc = &event->hw; msr = hwc->config_base; buf = ibs_data.regs; rdmsrl(msr, *buf); diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 707b2a96e516..035c37481f57 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2041,15 +2041,15 @@ static void intel_pmu_disable_event(struct perf_event *event) cpuc->intel_ctrl_host_mask &= ~(1ull << hwc->idx); cpuc->intel_cp_status &= ~(1ull << hwc->idx); + if (unlikely(event->attr.precise_ip)) + intel_pmu_pebs_disable(event); + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { intel_pmu_disable_fixed(hwc); return; } x86_pmu_disable_event(event); - - if (unlikely(event->attr.precise_ip)) - intel_pmu_pebs_disable(event); } static void intel_pmu_del_event(struct perf_event *event) @@ -2068,17 +2068,19 @@ static void intel_pmu_read_event(struct perf_event *event) x86_perf_event_update(event); } -static void intel_pmu_enable_fixed(struct hw_perf_event *hwc) +static void intel_pmu_enable_fixed(struct perf_event *event) { + struct hw_perf_event *hwc = &event->hw; int idx = hwc->idx - INTEL_PMC_IDX_FIXED; - u64 ctrl_val, bits, mask; + u64 ctrl_val, mask, bits = 0; /* - * Enable IRQ generation (0x8), + * Enable IRQ generation (0x8), if not PEBS, * and enable ring-3 counting (0x2) and ring-0 counting (0x1) * if requested: */ - bits = 0x8ULL; + if (!event->attr.precise_ip) + bits |= 0x8; if (hwc->config & ARCH_PERFMON_EVENTSEL_USR) bits |= 0x2; if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) @@ -2120,14 +2122,14 @@ static void intel_pmu_enable_event(struct perf_event *event) if (unlikely(event_is_checkpointed(event))) cpuc->intel_cp_status |= (1ull << hwc->idx); + if (unlikely(event->attr.precise_ip)) + intel_pmu_pebs_enable(event); + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { - intel_pmu_enable_fixed(hwc); + intel_pmu_enable_fixed(event); return; } - if (unlikely(event->attr.precise_ip)) - intel_pmu_pebs_enable(event); - __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); } @@ -2280,7 +2282,10 @@ again: * counters from the GLOBAL_STATUS mask and we always process PEBS * events via drain_pebs(). */ - status &= ~(cpuc->pebs_enabled & PEBS_COUNTER_MASK); + if (x86_pmu.flags & PMU_FL_PEBS_ALL) + status &= ~cpuc->pebs_enabled; + else + status &= ~(cpuc->pebs_enabled & PEBS_COUNTER_MASK); /* * PEBS overflow sets bit 62 in the global status register @@ -2997,6 +3002,9 @@ static int intel_pmu_hw_config(struct perf_event *event) } if (x86_pmu.pebs_aliases) x86_pmu.pebs_aliases(event); + + if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) + event->attr.sample_type |= __PERF_SAMPLE_CALLCHAIN_EARLY; } if (needs_branch_stack(event)) { @@ -4069,7 +4077,6 @@ __init int intel_pmu_init(void) intel_pmu_lbr_init_skl(); x86_pmu.event_constraints = intel_slm_event_constraints; - x86_pmu.pebs_constraints = intel_glp_pebs_event_constraints; x86_pmu.extra_regs = intel_glm_extra_regs; /* * It's recommended to use CPU_CLK_UNHALTED.CORE_P + NPEBS @@ -4079,6 +4086,7 @@ __init int intel_pmu_init(void) x86_pmu.pebs_prec_dist = true; x86_pmu.lbr_pt_coexist = true; x86_pmu.flags |= PMU_FL_HAS_RSP_1; + x86_pmu.flags |= PMU_FL_PEBS_ALL; x86_pmu.get_event_constraints = glp_get_event_constraints; x86_pmu.cpu_events = glm_events_attrs; /* Goldmont Plus has 4-wide pipeline */ diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 8a10a045b57b..b7b01d762d32 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -408,9 +408,11 @@ static int alloc_bts_buffer(int cpu) ds->bts_buffer_base = (unsigned long) cea; ds_update_cea(cea, buffer, BTS_BUFFER_SIZE, PAGE_KERNEL); ds->bts_index = ds->bts_buffer_base; - max = BTS_RECORD_SIZE * (BTS_BUFFER_SIZE / BTS_RECORD_SIZE); - ds->bts_absolute_maximum = ds->bts_buffer_base + max; - ds->bts_interrupt_threshold = ds->bts_absolute_maximum - (max / 16); + max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE; + ds->bts_absolute_maximum = ds->bts_buffer_base + + max * BTS_RECORD_SIZE; + ds->bts_interrupt_threshold = ds->bts_absolute_maximum - + (max / 16) * BTS_RECORD_SIZE; return 0; } @@ -711,12 +713,6 @@ struct event_constraint intel_glm_pebs_event_constraints[] = { EVENT_CONSTRAINT_END }; -struct event_constraint intel_glp_pebs_event_constraints[] = { - /* Allow all events as PEBS with no flags */ - INTEL_ALL_EVENT_CONSTRAINT(0, 0xf), - EVENT_CONSTRAINT_END -}; - struct event_constraint intel_nehalem_pebs_event_constraints[] = { INTEL_PLD_CONSTRAINT(0x100b, 0xf), /* MEM_INST_RETIRED.* */ INTEL_FLAGS_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */ @@ -869,6 +865,13 @@ struct event_constraint *intel_pebs_constraints(struct perf_event *event) } } + /* + * Extended PEBS support + * Makes the PEBS code search the normal constraints. + */ + if (x86_pmu.flags & PMU_FL_PEBS_ALL) + return NULL; + return &emptyconstraint; } @@ -894,10 +897,16 @@ static inline void pebs_update_threshold(struct cpu_hw_events *cpuc) { struct debug_store *ds = cpuc->ds; u64 threshold; + int reserved; + + if (x86_pmu.flags & PMU_FL_PEBS_ALL) + reserved = x86_pmu.max_pebs_events + x86_pmu.num_counters_fixed; + else + reserved = x86_pmu.max_pebs_events; if (cpuc->n_pebs == cpuc->n_large_pebs) { threshold = ds->pebs_absolute_maximum - - x86_pmu.max_pebs_events * x86_pmu.pebs_record_size; + reserved * x86_pmu.pebs_record_size; } else { threshold = ds->pebs_buffer_base + x86_pmu.pebs_record_size; } @@ -961,7 +970,11 @@ void intel_pmu_pebs_enable(struct perf_event *event) * This must be done in pmu::start(), because PERF_EVENT_IOC_PERIOD. */ if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) { - ds->pebs_event_reset[hwc->idx] = + unsigned int idx = hwc->idx; + + if (idx >= INTEL_PMC_IDX_FIXED) + idx = MAX_PEBS_EVENTS + (idx - INTEL_PMC_IDX_FIXED); + ds->pebs_event_reset[idx] = (u64)(-hwc->sample_period) & x86_pmu.cntval_mask; } else { ds->pebs_event_reset[hwc->idx] = 0; @@ -1184,16 +1197,20 @@ static void setup_pebs_sample_data(struct perf_event *event, } /* + * We must however always use iregs for the unwinder to stay sane; the + * record BP,SP,IP can point into thin air when the record is from a + * previous PMI context or an (I)RET happend between the record and + * PMI. + */ + if (sample_type & PERF_SAMPLE_CALLCHAIN) + data->callchain = perf_callchain(event, iregs); + + /* * We use the interrupt regs as a base because the PEBS record does not * contain a full regs set, specifically it seems to lack segment * descriptors, which get used by things like user_mode(). * * In the simple case fix up only the IP for PERF_SAMPLE_IP. - * - * We must however always use BP,SP from iregs for the unwinder to stay - * sane; the record BP,SP can point into thin air when the record is - * from a previous PMI context or an (I)RET happend between the record - * and PMI. */ *regs = *iregs; @@ -1212,15 +1229,8 @@ static void setup_pebs_sample_data(struct perf_event *event, regs->si = pebs->si; regs->di = pebs->di; - /* - * Per the above; only set BP,SP if we don't need callchains. - * - * XXX: does this make sense? - */ - if (!(sample_type & PERF_SAMPLE_CALLCHAIN)) { - regs->bp = pebs->bp; - regs->sp = pebs->sp; - } + regs->bp = pebs->bp; + regs->sp = pebs->sp; #ifndef CONFIG_X86_32 regs->r8 = pebs->r8; @@ -1482,9 +1492,10 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) struct debug_store *ds = cpuc->ds; struct perf_event *event; void *base, *at, *top; - short counts[MAX_PEBS_EVENTS] = {}; - short error[MAX_PEBS_EVENTS] = {}; - int bit, i; + short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {}; + short error[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {}; + int bit, i, size; + u64 mask; if (!x86_pmu.pebs_active) return; @@ -1494,6 +1505,13 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) ds->pebs_index = ds->pebs_buffer_base; + mask = (1ULL << x86_pmu.max_pebs_events) - 1; + size = x86_pmu.max_pebs_events; + if (x86_pmu.flags & PMU_FL_PEBS_ALL) { + mask |= ((1ULL << x86_pmu.num_counters_fixed) - 1) << INTEL_PMC_IDX_FIXED; + size = INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed; + } + if (unlikely(base >= top)) { /* * The drain_pebs() could be called twice in a short period @@ -1503,7 +1521,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) * update the event->count for this case. */ for_each_set_bit(bit, (unsigned long *)&cpuc->pebs_enabled, - x86_pmu.max_pebs_events) { + size) { event = cpuc->events[bit]; if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD) intel_pmu_save_and_restart_reload(event, 0); @@ -1516,12 +1534,12 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) u64 pebs_status; pebs_status = p->status & cpuc->pebs_enabled; - pebs_status &= (1ULL << x86_pmu.max_pebs_events) - 1; + pebs_status &= mask; /* PEBS v3 has more accurate status bits */ if (x86_pmu.intel_cap.pebs_format >= 3) { for_each_set_bit(bit, (unsigned long *)&pebs_status, - x86_pmu.max_pebs_events) + size) counts[bit]++; continue; @@ -1569,7 +1587,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) counts[bit]++; } - for (bit = 0; bit < x86_pmu.max_pebs_events; bit++) { + for (bit = 0; bit < size; bit++) { if ((counts[bit] == 0) && (error[bit] == 0)) continue; diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index cf372b90557e..f3e006bed9a7 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -216,6 +216,8 @@ static void intel_pmu_lbr_reset_64(void) void intel_pmu_lbr_reset(void) { + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + if (!x86_pmu.lbr_nr) return; @@ -223,6 +225,9 @@ void intel_pmu_lbr_reset(void) intel_pmu_lbr_reset_32(); else intel_pmu_lbr_reset_64(); + + cpuc->last_task_ctx = NULL; + cpuc->last_log_id = 0; } /* @@ -334,6 +339,7 @@ static inline u64 rdlbr_to(unsigned int idx) static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) { + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int i; unsigned lbr_idx, mask; u64 tos; @@ -344,9 +350,21 @@ static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) return; } - mask = x86_pmu.lbr_nr - 1; tos = task_ctx->tos; - for (i = 0; i < tos; i++) { + /* + * Does not restore the LBR registers, if + * - No one else touched them, and + * - Did not enter C6 + */ + if ((task_ctx == cpuc->last_task_ctx) && + (task_ctx->log_id == cpuc->last_log_id) && + rdlbr_from(tos)) { + task_ctx->lbr_stack_state = LBR_NONE; + return; + } + + mask = x86_pmu.lbr_nr - 1; + for (i = 0; i < task_ctx->valid_lbrs; i++) { lbr_idx = (tos - i) & mask; wrlbr_from(lbr_idx, task_ctx->lbr_from[i]); wrlbr_to (lbr_idx, task_ctx->lbr_to[i]); @@ -354,14 +372,24 @@ static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) wrmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]); } + + for (; i < x86_pmu.lbr_nr; i++) { + lbr_idx = (tos - i) & mask; + wrlbr_from(lbr_idx, 0); + wrlbr_to(lbr_idx, 0); + if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) + wrmsrl(MSR_LBR_INFO_0 + lbr_idx, 0); + } + wrmsrl(x86_pmu.lbr_tos, tos); task_ctx->lbr_stack_state = LBR_NONE; } static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx) { + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); unsigned lbr_idx, mask; - u64 tos; + u64 tos, from; int i; if (task_ctx->lbr_callstack_users == 0) { @@ -371,15 +399,22 @@ static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx) mask = x86_pmu.lbr_nr - 1; tos = intel_pmu_lbr_tos(); - for (i = 0; i < tos; i++) { + for (i = 0; i < x86_pmu.lbr_nr; i++) { lbr_idx = (tos - i) & mask; - task_ctx->lbr_from[i] = rdlbr_from(lbr_idx); + from = rdlbr_from(lbr_idx); + if (!from) + break; + task_ctx->lbr_from[i] = from; task_ctx->lbr_to[i] = rdlbr_to(lbr_idx); if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) rdmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]); } + task_ctx->valid_lbrs = i; task_ctx->tos = tos; task_ctx->lbr_stack_state = LBR_VALID; + + cpuc->last_task_ctx = task_ctx; + cpuc->last_log_id = ++task_ctx->log_id; } void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in) @@ -531,7 +566,7 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) */ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) { - bool need_info = false; + bool need_info = false, call_stack = false; unsigned long mask = x86_pmu.lbr_nr - 1; int lbr_format = x86_pmu.intel_cap.lbr_format; u64 tos = intel_pmu_lbr_tos(); @@ -542,7 +577,7 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) if (cpuc->lbr_sel) { need_info = !(cpuc->lbr_sel->config & LBR_NO_INFO); if (cpuc->lbr_sel->config & LBR_CALL_STACK) - num = tos; + call_stack = true; } for (i = 0; i < num; i++) { @@ -555,6 +590,13 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) from = rdlbr_from(lbr_idx); to = rdlbr_to(lbr_idx); + /* + * Read LBR call stack entries + * until invalid entry (0s) is detected. + */ + if (call_stack && !from) + break; + if (lbr_format == LBR_FORMAT_INFO && need_info) { u64 info; diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index c9e1e0bef3c3..e17ab885b1e9 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -28,7 +28,7 @@ #define UNCORE_PCI_DEV_TYPE(data) ((data >> 8) & 0xff) #define UNCORE_PCI_DEV_IDX(data) (data & 0xff) #define UNCORE_EXTRA_PCI_DEV 0xff -#define UNCORE_EXTRA_PCI_DEV_MAX 3 +#define UNCORE_EXTRA_PCI_DEV_MAX 4 #define UNCORE_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, 0xff) diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 87dc0263a2e1..51d7c117e3c7 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -1029,6 +1029,7 @@ void snbep_uncore_cpu_init(void) enum { SNBEP_PCI_QPI_PORT0_FILTER, SNBEP_PCI_QPI_PORT1_FILTER, + BDX_PCI_QPI_PORT2_FILTER, HSWEP_PCI_PCU_3, }; @@ -3286,15 +3287,18 @@ static const struct pci_device_id bdx_uncore_pci_ids[] = { }, { /* QPI Port 0 filter */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f86), - .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 0), + .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, + SNBEP_PCI_QPI_PORT0_FILTER), }, { /* QPI Port 1 filter */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f96), - .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 1), + .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, + SNBEP_PCI_QPI_PORT1_FILTER), }, { /* QPI Port 2 filter */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f46), - .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 2), + .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, + BDX_PCI_QPI_PORT2_FILTER), }, { /* PCU.3 (for Capability registers) */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fc0), diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 9f3711470ec1..156286335351 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -163,6 +163,7 @@ struct intel_excl_cntrs { unsigned core_id; /* per-core: core id */ }; +struct x86_perf_task_context; #define MAX_LBR_ENTRIES 32 enum { @@ -214,6 +215,8 @@ struct cpu_hw_events { struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES]; struct er_account *lbr_sel; u64 br_sel; + struct x86_perf_task_context *last_task_ctx; + int last_log_id; /* * Intel host/guest exclude bits @@ -648,8 +651,10 @@ struct x86_perf_task_context { u64 lbr_to[MAX_LBR_ENTRIES]; u64 lbr_info[MAX_LBR_ENTRIES]; int tos; + int valid_lbrs; int lbr_callstack_users; int lbr_stack_state; + int log_id; }; #define x86_add_quirk(func_) \ @@ -668,6 +673,7 @@ do { \ #define PMU_FL_HAS_RSP_1 0x2 /* has 2 equivalent offcore_rsp regs */ #define PMU_FL_EXCL_CNTRS 0x4 /* has exclusive counter requirements */ #define PMU_FL_EXCL_ENABLED 0x8 /* exclusive counter active */ +#define PMU_FL_PEBS_ALL 0x10 /* all events are valid PEBS events */ #define EVENT_VAR(_id) event_attr_##_id #define EVENT_PTR(_id) &event_attr_##_id.attr.attr diff --git a/arch/x86/include/asm/apm.h b/arch/x86/include/asm/apm.h index c356098b6fb9..4d4015ddcf26 100644 --- a/arch/x86/include/asm/apm.h +++ b/arch/x86/include/asm/apm.h @@ -7,8 +7,6 @@ #ifndef _ASM_X86_MACH_DEFAULT_APM_H #define _ASM_X86_MACH_DEFAULT_APM_H -#include <asm/nospec-branch.h> - #ifdef APM_ZERO_SEGS # define APM_DO_ZERO_SEGS \ "pushl %%ds\n\t" \ @@ -34,7 +32,6 @@ static inline void apm_bios_call_asm(u32 func, u32 ebx_in, u32 ecx_in, * N.B. We do NOT need a cld after the BIOS call * because we always save and restore the flags. */ - firmware_restrict_branch_speculation_start(); __asm__ __volatile__(APM_DO_ZERO_SEGS "pushl %%edi\n\t" "pushl %%ebp\n\t" @@ -47,7 +44,6 @@ static inline void apm_bios_call_asm(u32 func, u32 ebx_in, u32 ecx_in, "=S" (*esi) : "a" (func), "b" (ebx_in), "c" (ecx_in) : "memory", "cc"); - firmware_restrict_branch_speculation_end(); } static inline bool apm_bios_call_simple_asm(u32 func, u32 ebx_in, @@ -60,7 +56,6 @@ static inline bool apm_bios_call_simple_asm(u32 func, u32 ebx_in, * N.B. We do NOT need a cld after the BIOS call * because we always save and restore the flags. */ - firmware_restrict_branch_speculation_start(); __asm__ __volatile__(APM_DO_ZERO_SEGS "pushl %%edi\n\t" "pushl %%ebp\n\t" @@ -73,7 +68,6 @@ static inline bool apm_bios_call_simple_asm(u32 func, u32 ebx_in, "=S" (si) : "a" (func), "b" (ebx_in), "c" (ecx_in) : "memory", "cc"); - firmware_restrict_branch_speculation_end(); return error; } diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index 0db6bec95489..b143717b92b3 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -80,6 +80,7 @@ static __always_inline void arch_atomic_sub(int i, atomic_t *v) * true if the result is zero, or false for all * other cases. */ +#define arch_atomic_sub_and_test arch_atomic_sub_and_test static __always_inline bool arch_atomic_sub_and_test(int i, atomic_t *v) { GEN_BINARY_RMWcc(LOCK_PREFIX "subl", v->counter, "er", i, "%0", e); @@ -91,6 +92,7 @@ static __always_inline bool arch_atomic_sub_and_test(int i, atomic_t *v) * * Atomically increments @v by 1. */ +#define arch_atomic_inc arch_atomic_inc static __always_inline void arch_atomic_inc(atomic_t *v) { asm volatile(LOCK_PREFIX "incl %0" @@ -103,6 +105,7 @@ static __always_inline void arch_atomic_inc(atomic_t *v) * * Atomically decrements @v by 1. */ +#define arch_atomic_dec arch_atomic_dec static __always_inline void arch_atomic_dec(atomic_t *v) { asm volatile(LOCK_PREFIX "decl %0" @@ -117,6 +120,7 @@ static __always_inline void arch_atomic_dec(atomic_t *v) * returns true if the result is 0, or false for all other * cases. */ +#define arch_atomic_dec_and_test arch_atomic_dec_and_test static __always_inline bool arch_atomic_dec_and_test(atomic_t *v) { GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, "%0", e); @@ -130,6 +134,7 @@ static __always_inline bool arch_atomic_dec_and_test(atomic_t *v) * and returns true if the result is zero, or false for all * other cases. */ +#define arch_atomic_inc_and_test arch_atomic_inc_and_test static __always_inline bool arch_atomic_inc_and_test(atomic_t *v) { GEN_UNARY_RMWcc(LOCK_PREFIX "incl", v->counter, "%0", e); @@ -144,6 +149,7 @@ static __always_inline bool arch_atomic_inc_and_test(atomic_t *v) * if the result is negative, or false when * result is greater than or equal to zero. */ +#define arch_atomic_add_negative arch_atomic_add_negative static __always_inline bool arch_atomic_add_negative(int i, atomic_t *v) { GEN_BINARY_RMWcc(LOCK_PREFIX "addl", v->counter, "er", i, "%0", s); @@ -173,9 +179,6 @@ static __always_inline int arch_atomic_sub_return(int i, atomic_t *v) return arch_atomic_add_return(-i, v); } -#define arch_atomic_inc_return(v) (arch_atomic_add_return(1, v)) -#define arch_atomic_dec_return(v) (arch_atomic_sub_return(1, v)) - static __always_inline int arch_atomic_fetch_add(int i, atomic_t *v) { return xadd(&v->counter, i); @@ -199,7 +202,7 @@ static __always_inline bool arch_atomic_try_cmpxchg(atomic_t *v, int *old, int n static inline int arch_atomic_xchg(atomic_t *v, int new) { - return xchg(&v->counter, new); + return arch_xchg(&v->counter, new); } static inline void arch_atomic_and(int i, atomic_t *v) @@ -253,27 +256,6 @@ static inline int arch_atomic_fetch_xor(int i, atomic_t *v) return val; } -/** - * __arch_atomic_add_unless - add unless the number is already a given value - * @v: pointer of type atomic_t - * @a: the amount to add to v... - * @u: ...unless v is equal to u. - * - * Atomically adds @a to @v, so long as @v was not already @u. - * Returns the old value of @v. - */ -static __always_inline int __arch_atomic_add_unless(atomic_t *v, int a, int u) -{ - int c = arch_atomic_read(v); - - do { - if (unlikely(c == u)) - break; - } while (!arch_atomic_try_cmpxchg(v, &c, c + a)); - - return c; -} - #ifdef CONFIG_X86_32 # include <asm/atomic64_32.h> #else diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h index 92212bf0484f..ef959f02d070 100644 --- a/arch/x86/include/asm/atomic64_32.h +++ b/arch/x86/include/asm/atomic64_32.h @@ -158,6 +158,7 @@ static inline long long arch_atomic64_inc_return(atomic64_t *v) "S" (v) : "memory", "ecx"); return a; } +#define arch_atomic64_inc_return arch_atomic64_inc_return static inline long long arch_atomic64_dec_return(atomic64_t *v) { @@ -166,6 +167,7 @@ static inline long long arch_atomic64_dec_return(atomic64_t *v) "S" (v) : "memory", "ecx"); return a; } +#define arch_atomic64_dec_return arch_atomic64_dec_return /** * arch_atomic64_add - add integer to atomic64 variable @@ -198,25 +200,12 @@ static inline long long arch_atomic64_sub(long long i, atomic64_t *v) } /** - * arch_atomic64_sub_and_test - subtract value from variable and test result - * @i: integer value to subtract - * @v: pointer to type atomic64_t - * - * Atomically subtracts @i from @v and returns - * true if the result is zero, or false for all - * other cases. - */ -static inline int arch_atomic64_sub_and_test(long long i, atomic64_t *v) -{ - return arch_atomic64_sub_return(i, v) == 0; -} - -/** * arch_atomic64_inc - increment atomic64 variable * @v: pointer to type atomic64_t * * Atomically increments @v by 1. */ +#define arch_atomic64_inc arch_atomic64_inc static inline void arch_atomic64_inc(atomic64_t *v) { __alternative_atomic64(inc, inc_return, /* no output */, @@ -229,6 +218,7 @@ static inline void arch_atomic64_inc(atomic64_t *v) * * Atomically decrements @v by 1. */ +#define arch_atomic64_dec arch_atomic64_dec static inline void arch_atomic64_dec(atomic64_t *v) { __alternative_atomic64(dec, dec_return, /* no output */, @@ -236,46 +226,6 @@ static inline void arch_atomic64_dec(atomic64_t *v) } /** - * arch_atomic64_dec_and_test - decrement and test - * @v: pointer to type atomic64_t - * - * Atomically decrements @v by 1 and - * returns true if the result is 0, or false for all other - * cases. - */ -static inline int arch_atomic64_dec_and_test(atomic64_t *v) -{ - return arch_atomic64_dec_return(v) == 0; -} - -/** - * atomic64_inc_and_test - increment and test - * @v: pointer to type atomic64_t - * - * Atomically increments @v by 1 - * and returns true if the result is zero, or false for all - * other cases. - */ -static inline int arch_atomic64_inc_and_test(atomic64_t *v) -{ - return arch_atomic64_inc_return(v) == 0; -} - -/** - * arch_atomic64_add_negative - add and test if negative - * @i: integer value to add - * @v: pointer to type atomic64_t - * - * Atomically adds @i to @v and returns true - * if the result is negative, or false when - * result is greater than or equal to zero. - */ -static inline int arch_atomic64_add_negative(long long i, atomic64_t *v) -{ - return arch_atomic64_add_return(i, v) < 0; -} - -/** * arch_atomic64_add_unless - add unless the number is a given value * @v: pointer of type atomic64_t * @a: the amount to add to v... @@ -295,7 +245,7 @@ static inline int arch_atomic64_add_unless(atomic64_t *v, long long a, return (int)a; } - +#define arch_atomic64_inc_not_zero arch_atomic64_inc_not_zero static inline int arch_atomic64_inc_not_zero(atomic64_t *v) { int r; @@ -304,6 +254,7 @@ static inline int arch_atomic64_inc_not_zero(atomic64_t *v) return r; } +#define arch_atomic64_dec_if_positive arch_atomic64_dec_if_positive static inline long long arch_atomic64_dec_if_positive(atomic64_t *v) { long long r; diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h index 6106b59d3260..4343d9b4f30e 100644 --- a/arch/x86/include/asm/atomic64_64.h +++ b/arch/x86/include/asm/atomic64_64.h @@ -71,6 +71,7 @@ static inline void arch_atomic64_sub(long i, atomic64_t *v) * true if the result is zero, or false for all * other cases. */ +#define arch_atomic64_sub_and_test arch_atomic64_sub_and_test static inline bool arch_atomic64_sub_and_test(long i, atomic64_t *v) { GEN_BINARY_RMWcc(LOCK_PREFIX "subq", v->counter, "er", i, "%0", e); @@ -82,6 +83,7 @@ static inline bool arch_atomic64_sub_and_test(long i, atomic64_t *v) * * Atomically increments @v by 1. */ +#define arch_atomic64_inc arch_atomic64_inc static __always_inline void arch_atomic64_inc(atomic64_t *v) { asm volatile(LOCK_PREFIX "incq %0" @@ -95,6 +97,7 @@ static __always_inline void arch_atomic64_inc(atomic64_t *v) * * Atomically decrements @v by 1. */ +#define arch_atomic64_dec arch_atomic64_dec static __always_inline void arch_atomic64_dec(atomic64_t *v) { asm volatile(LOCK_PREFIX "decq %0" @@ -110,6 +113,7 @@ static __always_inline void arch_atomic64_dec(atomic64_t *v) * returns true if the result is 0, or false for all other * cases. */ +#define arch_atomic64_dec_and_test arch_atomic64_dec_and_test static inline bool arch_atomic64_dec_and_test(atomic64_t *v) { GEN_UNARY_RMWcc(LOCK_PREFIX "decq", v->counter, "%0", e); @@ -123,6 +127,7 @@ static inline bool arch_atomic64_dec_and_test(atomic64_t *v) * and returns true if the result is zero, or false for all * other cases. */ +#define arch_atomic64_inc_and_test arch_atomic64_inc_and_test static inline bool arch_atomic64_inc_and_test(atomic64_t *v) { GEN_UNARY_RMWcc(LOCK_PREFIX "incq", v->counter, "%0", e); @@ -137,6 +142,7 @@ static inline bool arch_atomic64_inc_and_test(atomic64_t *v) * if the result is negative, or false when * result is greater than or equal to zero. */ +#define arch_atomic64_add_negative arch_atomic64_add_negative static inline bool arch_atomic64_add_negative(long i, atomic64_t *v) { GEN_BINARY_RMWcc(LOCK_PREFIX "addq", v->counter, "er", i, "%0", s); @@ -169,9 +175,6 @@ static inline long arch_atomic64_fetch_sub(long i, atomic64_t *v) return xadd(&v->counter, -i); } -#define arch_atomic64_inc_return(v) (arch_atomic64_add_return(1, (v))) -#define arch_atomic64_dec_return(v) (arch_atomic64_sub_return(1, (v))) - static inline long arch_atomic64_cmpxchg(atomic64_t *v, long old, long new) { return arch_cmpxchg(&v->counter, old, new); @@ -185,46 +188,7 @@ static __always_inline bool arch_atomic64_try_cmpxchg(atomic64_t *v, s64 *old, l static inline long arch_atomic64_xchg(atomic64_t *v, long new) { - return xchg(&v->counter, new); -} - -/** - * arch_atomic64_add_unless - add unless the number is a given value - * @v: pointer of type atomic64_t - * @a: the amount to add to v... - * @u: ...unless v is equal to u. - * - * Atomically adds @a to @v, so long as it was not @u. - * Returns the old value of @v. - */ -static inline bool arch_atomic64_add_unless(atomic64_t *v, long a, long u) -{ - s64 c = arch_atomic64_read(v); - do { - if (unlikely(c == u)) - return false; - } while (!arch_atomic64_try_cmpxchg(v, &c, c + a)); - return true; -} - -#define arch_atomic64_inc_not_zero(v) arch_atomic64_add_unless((v), 1, 0) - -/* - * arch_atomic64_dec_if_positive - decrement by 1 if old value positive - * @v: pointer of type atomic_t - * - * The function returns the old value of *v minus 1, even if - * the atomic variable, v, was not decremented. - */ -static inline long arch_atomic64_dec_if_positive(atomic64_t *v) -{ - s64 dec, c = arch_atomic64_read(v); - do { - dec = c - 1; - if (unlikely(dec < 0)) - break; - } while (!arch_atomic64_try_cmpxchg(v, &c, dec)); - return dec; + return arch_xchg(&v->counter, new); } static inline void arch_atomic64_and(long i, atomic64_t *v) diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h index e3efd8a06066..a55d79b233d3 100644 --- a/arch/x86/include/asm/cmpxchg.h +++ b/arch/x86/include/asm/cmpxchg.h @@ -75,7 +75,7 @@ extern void __add_wrong_size(void) * use "asm volatile" and "memory" clobbers to prevent gcc from moving * information around. */ -#define xchg(ptr, v) __xchg_op((ptr), (v), xchg, "") +#define arch_xchg(ptr, v) __xchg_op((ptr), (v), xchg, "") /* * Atomic compare and exchange. Compare OLD with MEM, if identical, diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h index bfca3b346c74..072e5459fe2f 100644 --- a/arch/x86/include/asm/cmpxchg_64.h +++ b/arch/x86/include/asm/cmpxchg_64.h @@ -10,13 +10,13 @@ static inline void set_64bit(volatile u64 *ptr, u64 val) #define arch_cmpxchg64(ptr, o, n) \ ({ \ BUILD_BUG_ON(sizeof(*(ptr)) != 8); \ - cmpxchg((ptr), (o), (n)); \ + arch_cmpxchg((ptr), (o), (n)); \ }) #define arch_cmpxchg64_local(ptr, o, n) \ ({ \ BUILD_BUG_ON(sizeof(*(ptr)) != 8); \ - cmpxchg_local((ptr), (o), (n)); \ + arch_cmpxchg_local((ptr), (o), (n)); \ }) #define system_has_cmpxchg_double() boot_cpu_has(X86_FEATURE_CX16) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 5701f5cecd31..7fff98fa5855 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -229,7 +229,7 @@ #define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer VMMCALL to VMCALL */ #define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */ - +#define X86_FEATURE_EPT_AD ( 8*32+17) /* Intel Extended Page Table access-dirty bit */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */ #define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/ diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h index f59c39835a5a..a1f0e90d0818 100644 --- a/arch/x86/include/asm/hw_breakpoint.h +++ b/arch/x86/include/asm/hw_breakpoint.h @@ -49,11 +49,14 @@ static inline int hw_breakpoint_slots(int type) return HBP_NUM; } +struct perf_event_attr; struct perf_event; struct pmu; -extern int arch_check_bp_in_kernelspace(struct perf_event *bp); -extern int arch_validate_hwbkpt_settings(struct perf_event *bp); +extern int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw); +extern int hw_breakpoint_arch_parse(struct perf_event *bp, + const struct perf_event_attr *attr, + struct arch_hw_breakpoint *hw); extern int hw_breakpoint_exceptions_notify(struct notifier_block *unused, unsigned long val, void *data); diff --git a/arch/x86/include/asm/intel_ds.h b/arch/x86/include/asm/intel_ds.h index 62a9f4966b42..ae26df1c2789 100644 --- a/arch/x86/include/asm/intel_ds.h +++ b/arch/x86/include/asm/intel_ds.h @@ -8,6 +8,7 @@ /* The maximal number of PEBS events: */ #define MAX_PEBS_EVENTS 8 +#define MAX_FIXED_PEBS_EVENTS 3 /* * A debug store configuration. @@ -23,7 +24,7 @@ struct debug_store { u64 pebs_index; u64 pebs_absolute_maximum; u64 pebs_interrupt_threshold; - u64 pebs_event_reset[MAX_PEBS_EVENTS]; + u64 pebs_event_reset[MAX_PEBS_EVENTS + MAX_FIXED_PEBS_EVENTS]; } __aligned(PAGE_SIZE); DECLARE_PER_CPU_PAGE_ALIGNED(struct debug_store, cpu_debug_store); diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h index 367d99cff426..c8cec1b39b88 100644 --- a/arch/x86/include/asm/kprobes.h +++ b/arch/x86/include/asm/kprobes.h @@ -78,7 +78,7 @@ struct arch_specific_insn { * boostable = true: This instruction has been boosted: we have * added a relative jump after the instruction copy in insn, * so no single-step and fixup are needed (unless there's - * a post_handler or break_handler). + * a post_handler). */ bool boostable; bool if_modifier; @@ -111,9 +111,6 @@ struct kprobe_ctlblk { unsigned long kprobe_status; unsigned long kprobe_old_flags; unsigned long kprobe_saved_flags; - unsigned long *jprobe_saved_sp; - struct pt_regs jprobe_saved_regs; - kprobe_opcode_t jprobes_stack[MAX_STACK_SIZE]; struct prev_kprobe prev_kprobe; }; diff --git a/arch/x86/include/asm/orc_types.h b/arch/x86/include/asm/orc_types.h index 9c9dc579bd7d..46f516dd80ce 100644 --- a/arch/x86/include/asm/orc_types.h +++ b/arch/x86/include/asm/orc_types.h @@ -88,6 +88,7 @@ struct orc_entry { unsigned sp_reg:4; unsigned bp_reg:4; unsigned type:2; + unsigned end:1; } __packed; /* @@ -101,6 +102,7 @@ struct unwind_hint { s16 sp_offset; u8 sp_reg; u8 type; + u8 end; }; #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index a06b07399d17..e9202a0de8f0 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -450,9 +450,10 @@ do { \ bool __ret; \ typeof(pcp1) __o1 = (o1), __n1 = (n1); \ typeof(pcp2) __o2 = (o2), __n2 = (n2); \ - asm volatile("cmpxchg8b "__percpu_arg(1)"\n\tsetz %0\n\t" \ - : "=a" (__ret), "+m" (pcp1), "+m" (pcp2), "+d" (__o2) \ - : "b" (__n1), "c" (__n2), "a" (__o1)); \ + asm volatile("cmpxchg8b "__percpu_arg(1) \ + CC_SET(z) \ + : CC_OUT(z) (__ret), "+m" (pcp1), "+m" (pcp2), "+a" (__o1), "+d" (__o2) \ + : "b" (__n1), "c" (__n2)); \ __ret; \ }) diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h index 9ef5ee03d2d7..159622ee0674 100644 --- a/arch/x86/include/asm/qspinlock_paravirt.h +++ b/arch/x86/include/asm/qspinlock_paravirt.h @@ -43,7 +43,7 @@ asm (".pushsection .text;" "push %rdx;" "mov $0x1,%eax;" "xor %edx,%edx;" - "lock cmpxchg %dl,(%rdi);" + LOCK_PREFIX "cmpxchg %dl,(%rdi);" "cmp $0x1,%al;" "jne .slowpath;" "pop %rdx;" diff --git a/arch/x86/include/asm/refcount.h b/arch/x86/include/asm/refcount.h index 4cf11d88d3b3..19b90521954c 100644 --- a/arch/x86/include/asm/refcount.h +++ b/arch/x86/include/asm/refcount.h @@ -5,6 +5,7 @@ * PaX/grsecurity. */ #include <linux/refcount.h> +#include <asm/bug.h> /* * This is the first portion of the refcount error handling, which lives in diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index 62acb613114b..a9d637bc301d 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -52,7 +52,12 @@ copy_to_user_mcsafe(void *to, const void *from, unsigned len) unsigned long ret; __uaccess_begin(); - ret = memcpy_mcsafe(to, from, len); + /* + * Note, __memcpy_mcsafe() is explicitly used since it can + * handle exceptions / faults. memcpy_mcsafe() may fall back to + * memcpy() which lacks this handling. + */ + ret = __memcpy_mcsafe(to, from, len); __uaccess_end(); return ret; } diff --git a/arch/x86/include/asm/unwind_hints.h b/arch/x86/include/asm/unwind_hints.h index bae46fc6b9de..0bcdb1279361 100644 --- a/arch/x86/include/asm/unwind_hints.h +++ b/arch/x86/include/asm/unwind_hints.h @@ -26,7 +26,7 @@ * the debuginfo as necessary. It will also warn if it sees any * inconsistencies. */ -.macro UNWIND_HINT sp_reg=ORC_REG_SP sp_offset=0 type=ORC_TYPE_CALL +.macro UNWIND_HINT sp_reg=ORC_REG_SP sp_offset=0 type=ORC_TYPE_CALL end=0 #ifdef CONFIG_STACK_VALIDATION .Lunwind_hint_ip_\@: .pushsection .discard.unwind_hints @@ -35,12 +35,14 @@ .short \sp_offset .byte \sp_reg .byte \type + .byte \end + .balign 4 .popsection #endif .endm .macro UNWIND_HINT_EMPTY - UNWIND_HINT sp_reg=ORC_REG_UNDEFINED + UNWIND_HINT sp_reg=ORC_REG_UNDEFINED end=1 .endm .macro UNWIND_HINT_REGS base=%rsp offset=0 indirect=0 extra=1 iret=0 @@ -86,19 +88,21 @@ #else /* !__ASSEMBLY__ */ -#define UNWIND_HINT(sp_reg, sp_offset, type) \ +#define UNWIND_HINT(sp_reg, sp_offset, type, end) \ "987: \n\t" \ ".pushsection .discard.unwind_hints\n\t" \ /* struct unwind_hint */ \ ".long 987b - .\n\t" \ - ".short " __stringify(sp_offset) "\n\t" \ + ".short " __stringify(sp_offset) "\n\t" \ ".byte " __stringify(sp_reg) "\n\t" \ ".byte " __stringify(type) "\n\t" \ + ".byte " __stringify(end) "\n\t" \ + ".balign 4 \n\t" \ ".popsection\n\t" -#define UNWIND_HINT_SAVE UNWIND_HINT(0, 0, UNWIND_HINT_TYPE_SAVE) +#define UNWIND_HINT_SAVE UNWIND_HINT(0, 0, UNWIND_HINT_TYPE_SAVE, 0) -#define UNWIND_HINT_RESTORE UNWIND_HINT(0, 0, UNWIND_HINT_TYPE_RESTORE) +#define UNWIND_HINT_RESTORE UNWIND_HINT(0, 0, UNWIND_HINT_TYPE_RESTORE, 0) #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 2aabd4cb0e3f..07fa222f0c52 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -573,6 +573,9 @@ static u32 skx_deadline_rev(void) case 0x04: return 0x02000014; } + if (boot_cpu_data.x86_stepping > 4) + return 0; + return ~0U; } @@ -937,7 +940,7 @@ static int __init calibrate_APIC_clock(void) if (levt->features & CLOCK_EVT_FEAT_DUMMY) { pr_warning("APIC timer disabled due to verification failure\n"); - return -1; + return -1; } return 0; diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 35aaee4fc028..0954315842c0 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -218,7 +218,8 @@ static int reserve_irq_vector(struct irq_data *irqd) return 0; } -static int allocate_vector(struct irq_data *irqd, const struct cpumask *dest) +static int +assign_vector_locked(struct irq_data *irqd, const struct cpumask *dest) { struct apic_chip_data *apicd = apic_chip_data(irqd); bool resvd = apicd->has_reserved; @@ -245,22 +246,12 @@ static int allocate_vector(struct irq_data *irqd, const struct cpumask *dest) return -EBUSY; vector = irq_matrix_alloc(vector_matrix, dest, resvd, &cpu); - if (vector > 0) - apic_update_vector(irqd, vector, cpu); trace_vector_alloc(irqd->irq, vector, resvd, vector); - return vector; -} - -static int assign_vector_locked(struct irq_data *irqd, - const struct cpumask *dest) -{ - struct apic_chip_data *apicd = apic_chip_data(irqd); - int vector = allocate_vector(irqd, dest); - if (vector < 0) return vector; + apic_update_vector(irqd, vector, cpu); + apic_update_irq_cfg(irqd, vector, cpu); - apic_update_irq_cfg(irqd, apicd->vector, apicd->cpu); return 0; } @@ -433,7 +424,7 @@ static int activate_managed(struct irq_data *irqd) pr_err("Managed startup irq %u, no vector available\n", irqd->irq); } - return ret; + return ret; } static int x86_vector_activate(struct irq_domain *dom, struct irq_data *irqd, diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 5d0de79fdab0..ec00d1ff5098 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -240,6 +240,7 @@ #include <asm/olpc.h> #include <asm/paravirt.h> #include <asm/reboot.h> +#include <asm/nospec-branch.h> #if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT) extern int (*console_blank_hook)(int); @@ -614,11 +615,13 @@ static long __apm_bios_call(void *_call) gdt[0x40 / 8] = bad_bios_desc; apm_irq_save(flags); + firmware_restrict_branch_speculation_start(); APM_DO_SAVE_SEGS; apm_bios_call_asm(call->func, call->ebx, call->ecx, &call->eax, &call->ebx, &call->ecx, &call->edx, &call->esi); APM_DO_RESTORE_SEGS; + firmware_restrict_branch_speculation_end(); apm_irq_restore(flags); gdt[0x40 / 8] = save_desc_40; put_cpu(); @@ -690,10 +693,12 @@ static long __apm_bios_call_simple(void *_call) gdt[0x40 / 8] = bad_bios_desc; apm_irq_save(flags); + firmware_restrict_branch_speculation_start(); APM_DO_SAVE_SEGS; error = apm_bios_call_simple_asm(call->func, call->ebx, call->ecx, &call->eax); APM_DO_RESTORE_SEGS; + firmware_restrict_branch_speculation_end(); apm_irq_restore(flags); gdt[0x40 / 8] = save_desc_40; put_cpu(); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index eb75564f2d25..c050cd6066af 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -465,14 +465,17 @@ static void detect_vmx_virtcap(struct cpuinfo_x86 *c) #define X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC 0x00000001 #define X86_VMX_FEATURE_PROC_CTLS2_EPT 0x00000002 #define X86_VMX_FEATURE_PROC_CTLS2_VPID 0x00000020 +#define x86_VMX_FEATURE_EPT_CAP_AD 0x00200000 u32 vmx_msr_low, vmx_msr_high, msr_ctl, msr_ctl2; + u32 msr_vpid_cap, msr_ept_cap; clear_cpu_cap(c, X86_FEATURE_TPR_SHADOW); clear_cpu_cap(c, X86_FEATURE_VNMI); clear_cpu_cap(c, X86_FEATURE_FLEXPRIORITY); clear_cpu_cap(c, X86_FEATURE_EPT); clear_cpu_cap(c, X86_FEATURE_VPID); + clear_cpu_cap(c, X86_FEATURE_EPT_AD); rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high); msr_ctl = vmx_msr_high | vmx_msr_low; @@ -487,8 +490,13 @@ static void detect_vmx_virtcap(struct cpuinfo_x86 *c) if ((msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC) && (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW)) set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY); - if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_EPT) + if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_EPT) { set_cpu_cap(c, X86_FEATURE_EPT); + rdmsr(MSR_IA32_VMX_EPT_VPID_CAP, + msr_ept_cap, msr_vpid_cap); + if (msr_ept_cap & x86_VMX_FEATURE_EPT_CAP_AD) + set_cpu_cap(c, X86_FEATURE_EPT_AD); + } if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VPID) set_cpu_cap(c, X86_FEATURE_VPID); } diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index c102ad51025e..4b767284b7f5 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -123,8 +123,8 @@ void mce_setup(struct mce *m) { memset(m, 0, sizeof(struct mce)); m->cpu = m->extcpu = smp_processor_id(); - /* We hope get_seconds stays lockless */ - m->time = get_seconds(); + /* need the internal __ version to avoid deadlocks */ + m->time = __ktime_get_real_seconds(); m->cpuvendor = boot_cpu_data.x86_vendor; m->cpuid = cpuid_eax(1); m->socketid = cpu_data(m->extcpu).phys_proc_id; @@ -1104,6 +1104,101 @@ static void mce_unmap_kpfn(unsigned long pfn) } #endif + +/* + * Cases where we avoid rendezvous handler timeout: + * 1) If this CPU is offline. + * + * 2) If crashing_cpu was set, e.g. we're entering kdump and we need to + * skip those CPUs which remain looping in the 1st kernel - see + * crash_nmi_callback(). + * + * Note: there still is a small window between kexec-ing and the new, + * kdump kernel establishing a new #MC handler where a broadcasted MCE + * might not get handled properly. + */ +static bool __mc_check_crashing_cpu(int cpu) +{ + if (cpu_is_offline(cpu) || + (crashing_cpu != -1 && crashing_cpu != cpu)) { + u64 mcgstatus; + + mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); + if (mcgstatus & MCG_STATUS_RIPV) { + mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); + return true; + } + } + return false; +} + +static void __mc_scan_banks(struct mce *m, struct mce *final, + unsigned long *toclear, unsigned long *valid_banks, + int no_way_out, int *worst) +{ + struct mca_config *cfg = &mca_cfg; + int severity, i; + + for (i = 0; i < cfg->banks; i++) { + __clear_bit(i, toclear); + if (!test_bit(i, valid_banks)) + continue; + + if (!mce_banks[i].ctl) + continue; + + m->misc = 0; + m->addr = 0; + m->bank = i; + + m->status = mce_rdmsrl(msr_ops.status(i)); + if (!(m->status & MCI_STATUS_VAL)) + continue; + + /* + * Corrected or non-signaled errors are handled by + * machine_check_poll(). Leave them alone, unless this panics. + */ + if (!(m->status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) && + !no_way_out) + continue; + + /* Set taint even when machine check was not enabled. */ + add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); + + severity = mce_severity(m, cfg->tolerant, NULL, true); + + /* + * When machine check was for corrected/deferred handler don't + * touch, unless we're panicking. + */ + if ((severity == MCE_KEEP_SEVERITY || + severity == MCE_UCNA_SEVERITY) && !no_way_out) + continue; + + __set_bit(i, toclear); + + /* Machine check event was not enabled. Clear, but ignore. */ + if (severity == MCE_NO_SEVERITY) + continue; + + mce_read_aux(m, i); + + /* assuming valid severity level != 0 */ + m->severity = severity; + + mce_log(m); + + if (severity > *worst) { + *final = *m; + *worst = severity; + } + } + + /* mce_clear_state will clear *final, save locally for use later */ + *m = *final; +} + /* * The actual machine check handler. This only handles real * exceptions when something got corrupted coming in through int 18. @@ -1118,68 +1213,45 @@ static void mce_unmap_kpfn(unsigned long pfn) */ void do_machine_check(struct pt_regs *regs, long error_code) { + DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); + DECLARE_BITMAP(toclear, MAX_NR_BANKS); struct mca_config *cfg = &mca_cfg; + int cpu = smp_processor_id(); + char *msg = "Unknown"; struct mce m, *final; - int i; int worst = 0; - int severity; /* * Establish sequential order between the CPUs entering the machine * check handler. */ int order = -1; + /* * If no_way_out gets set, there is no safe way to recover from this * MCE. If mca_cfg.tolerant is cranked up, we'll try anyway. */ int no_way_out = 0; + /* * If kill_it gets set, there might be a way to recover from this * error. */ int kill_it = 0; - DECLARE_BITMAP(toclear, MAX_NR_BANKS); - DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); - char *msg = "Unknown"; /* * MCEs are always local on AMD. Same is determined by MCG_STATUS_LMCES * on Intel. */ int lmce = 1; - int cpu = smp_processor_id(); - - /* - * Cases where we avoid rendezvous handler timeout: - * 1) If this CPU is offline. - * - * 2) If crashing_cpu was set, e.g. we're entering kdump and we need to - * skip those CPUs which remain looping in the 1st kernel - see - * crash_nmi_callback(). - * - * Note: there still is a small window between kexec-ing and the new, - * kdump kernel establishing a new #MC handler where a broadcasted MCE - * might not get handled properly. - */ - if (cpu_is_offline(cpu) || - (crashing_cpu != -1 && crashing_cpu != cpu)) { - u64 mcgstatus; - mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); - if (mcgstatus & MCG_STATUS_RIPV) { - mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); - return; - } - } + if (__mc_check_crashing_cpu(cpu)) + return; ist_enter(regs); this_cpu_inc(mce_exception_count); - if (!cfg->banks) - goto out; - mce_gather_info(&m, regs); m.tsc = rdtsc(); @@ -1220,67 +1292,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) order = mce_start(&no_way_out); } - for (i = 0; i < cfg->banks; i++) { - __clear_bit(i, toclear); - if (!test_bit(i, valid_banks)) - continue; - if (!mce_banks[i].ctl) - continue; - - m.misc = 0; - m.addr = 0; - m.bank = i; - - m.status = mce_rdmsrl(msr_ops.status(i)); - if ((m.status & MCI_STATUS_VAL) == 0) - continue; - - /* - * Non uncorrected or non signaled errors are handled by - * machine_check_poll. Leave them alone, unless this panics. - */ - if (!(m.status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) && - !no_way_out) - continue; - - /* - * Set taint even when machine check was not enabled. - */ - add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); - - severity = mce_severity(&m, cfg->tolerant, NULL, true); - - /* - * When machine check was for corrected/deferred handler don't - * touch, unless we're panicing. - */ - if ((severity == MCE_KEEP_SEVERITY || - severity == MCE_UCNA_SEVERITY) && !no_way_out) - continue; - __set_bit(i, toclear); - if (severity == MCE_NO_SEVERITY) { - /* - * Machine check event was not enabled. Clear, but - * ignore. - */ - continue; - } - - mce_read_aux(&m, i); - - /* assuming valid severity level != 0 */ - m.severity = severity; - - mce_log(&m); - - if (severity > worst) { - *final = m; - worst = severity; - } - } - - /* mce_clear_state will clear *final, save locally for use later */ - m = *final; + __mc_scan_banks(&m, final, toclear, valid_banks, no_way_out, &worst); if (!no_way_out) mce_clear_state(toclear); @@ -1319,7 +1331,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) if (worst > 0) mce_report_event(regs); mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); -out: + sync_core(); if (worst != MCE_AR_SEVERITY && !kill_it) @@ -2165,9 +2177,6 @@ static ssize_t store_int_with_restart(struct device *s, if (check_interval == old_check_interval) return ret; - if (check_interval < 1) - check_interval = 1; - mutex_lock(&mce_sysfs_mutex); mce_restart(); mutex_unlock(&mce_sysfs_mutex); diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 8344dd2f310a..15ebc2fc166e 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -235,7 +235,7 @@ ENTRY(secondary_startup_64) * address given in m16:64. */ pushq $.Lafter_lret # put return address on stack for unwinder - xorq %rbp, %rbp # clear frame pointer + xorl %ebp, %ebp # clear frame pointer movq initial_code(%rip), %rax pushq $__KERNEL_CS # set correct cs pushq %rax # target address in negative space diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 8771766d46b6..34a5c1715148 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -169,28 +169,29 @@ void arch_uninstall_hw_breakpoint(struct perf_event *bp) set_dr_addr_mask(0, i); } -/* - * Check for virtual address in kernel space. - */ -int arch_check_bp_in_kernelspace(struct perf_event *bp) +static int arch_bp_generic_len(int x86_len) { - unsigned int len; - unsigned long va; - struct arch_hw_breakpoint *info = counter_arch_bp(bp); - - va = info->address; - len = bp->attr.bp_len; - - /* - * We don't need to worry about va + len - 1 overflowing: - * we already require that va is aligned to a multiple of len. - */ - return (va >= TASK_SIZE_MAX) || ((va + len - 1) >= TASK_SIZE_MAX); + switch (x86_len) { + case X86_BREAKPOINT_LEN_1: + return HW_BREAKPOINT_LEN_1; + case X86_BREAKPOINT_LEN_2: + return HW_BREAKPOINT_LEN_2; + case X86_BREAKPOINT_LEN_4: + return HW_BREAKPOINT_LEN_4; +#ifdef CONFIG_X86_64 + case X86_BREAKPOINT_LEN_8: + return HW_BREAKPOINT_LEN_8; +#endif + default: + return -EINVAL; + } } int arch_bp_generic_fields(int x86_len, int x86_type, int *gen_len, int *gen_type) { + int len; + /* Type */ switch (x86_type) { case X86_BREAKPOINT_EXECUTE: @@ -211,42 +212,47 @@ int arch_bp_generic_fields(int x86_len, int x86_type, } /* Len */ - switch (x86_len) { - case X86_BREAKPOINT_LEN_1: - *gen_len = HW_BREAKPOINT_LEN_1; - break; - case X86_BREAKPOINT_LEN_2: - *gen_len = HW_BREAKPOINT_LEN_2; - break; - case X86_BREAKPOINT_LEN_4: - *gen_len = HW_BREAKPOINT_LEN_4; - break; -#ifdef CONFIG_X86_64 - case X86_BREAKPOINT_LEN_8: - *gen_len = HW_BREAKPOINT_LEN_8; - break; -#endif - default: + len = arch_bp_generic_len(x86_len); + if (len < 0) return -EINVAL; - } + *gen_len = len; return 0; } - -static int arch_build_bp_info(struct perf_event *bp) +/* + * Check for virtual address in kernel space. + */ +int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw) { - struct arch_hw_breakpoint *info = counter_arch_bp(bp); + unsigned long va; + int len; - info->address = bp->attr.bp_addr; + va = hw->address; + len = arch_bp_generic_len(hw->len); + WARN_ON_ONCE(len < 0); + + /* + * We don't need to worry about va + len - 1 overflowing: + * we already require that va is aligned to a multiple of len. + */ + return (va >= TASK_SIZE_MAX) || ((va + len - 1) >= TASK_SIZE_MAX); +} + +static int arch_build_bp_info(struct perf_event *bp, + const struct perf_event_attr *attr, + struct arch_hw_breakpoint *hw) +{ + hw->address = attr->bp_addr; + hw->mask = 0; /* Type */ - switch (bp->attr.bp_type) { + switch (attr->bp_type) { case HW_BREAKPOINT_W: - info->type = X86_BREAKPOINT_WRITE; + hw->type = X86_BREAKPOINT_WRITE; break; case HW_BREAKPOINT_W | HW_BREAKPOINT_R: - info->type = X86_BREAKPOINT_RW; + hw->type = X86_BREAKPOINT_RW; break; case HW_BREAKPOINT_X: /* @@ -254,23 +260,23 @@ static int arch_build_bp_info(struct perf_event *bp) * acceptable for kprobes. On non-kprobes kernels, we don't * allow kernel breakpoints at all. */ - if (bp->attr.bp_addr >= TASK_SIZE_MAX) { + if (attr->bp_addr >= TASK_SIZE_MAX) { #ifdef CONFIG_KPROBES - if (within_kprobe_blacklist(bp->attr.bp_addr)) + if (within_kprobe_blacklist(attr->bp_addr)) return -EINVAL; #else return -EINVAL; #endif } - info->type = X86_BREAKPOINT_EXECUTE; + hw->type = X86_BREAKPOINT_EXECUTE; /* * x86 inst breakpoints need to have a specific undefined len. * But we still need to check userspace is not trying to setup * an unsupported length, to get a range breakpoint for example. */ - if (bp->attr.bp_len == sizeof(long)) { - info->len = X86_BREAKPOINT_LEN_X; + if (attr->bp_len == sizeof(long)) { + hw->len = X86_BREAKPOINT_LEN_X; return 0; } default: @@ -278,28 +284,26 @@ static int arch_build_bp_info(struct perf_event *bp) } /* Len */ - info->mask = 0; - - switch (bp->attr.bp_len) { + switch (attr->bp_len) { case HW_BREAKPOINT_LEN_1: - info->len = X86_BREAKPOINT_LEN_1; + hw->len = X86_BREAKPOINT_LEN_1; break; case HW_BREAKPOINT_LEN_2: - info->len = X86_BREAKPOINT_LEN_2; + hw->len = X86_BREAKPOINT_LEN_2; break; case HW_BREAKPOINT_LEN_4: - info->len = X86_BREAKPOINT_LEN_4; + hw->len = X86_BREAKPOINT_LEN_4; break; #ifdef CONFIG_X86_64 case HW_BREAKPOINT_LEN_8: - info->len = X86_BREAKPOINT_LEN_8; + hw->len = X86_BREAKPOINT_LEN_8; break; #endif default: /* AMD range breakpoint */ - if (!is_power_of_2(bp->attr.bp_len)) + if (!is_power_of_2(attr->bp_len)) return -EINVAL; - if (bp->attr.bp_addr & (bp->attr.bp_len - 1)) + if (attr->bp_addr & (attr->bp_len - 1)) return -EINVAL; if (!boot_cpu_has(X86_FEATURE_BPEXT)) @@ -312,8 +316,8 @@ static int arch_build_bp_info(struct perf_event *bp) * breakpoints, then we'll have to check for kprobe-blacklisted * addresses anywhere in the range. */ - info->mask = bp->attr.bp_len - 1; - info->len = X86_BREAKPOINT_LEN_1; + hw->mask = attr->bp_len - 1; + hw->len = X86_BREAKPOINT_LEN_1; } return 0; @@ -322,22 +326,23 @@ static int arch_build_bp_info(struct perf_event *bp) /* * Validate the arch-specific HW Breakpoint register settings */ -int arch_validate_hwbkpt_settings(struct perf_event *bp) +int hw_breakpoint_arch_parse(struct perf_event *bp, + const struct perf_event_attr *attr, + struct arch_hw_breakpoint *hw) { - struct arch_hw_breakpoint *info = counter_arch_bp(bp); unsigned int align; int ret; - ret = arch_build_bp_info(bp); + ret = arch_build_bp_info(bp, attr, hw); if (ret) return ret; - switch (info->len) { + switch (hw->len) { case X86_BREAKPOINT_LEN_1: align = 0; - if (info->mask) - align = info->mask; + if (hw->mask) + align = hw->mask; break; case X86_BREAKPOINT_LEN_2: align = 1; @@ -358,7 +363,7 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp) * Check that the low-order bits of the address are appropriate * for the alignment implied by len. */ - if (info->address & align) + if (hw->address & align) return -EINVAL; return 0; diff --git a/arch/x86/kernel/kprobes/common.h b/arch/x86/kernel/kprobes/common.h index ae38dccf0c8f..2b949f4fd4d8 100644 --- a/arch/x86/kernel/kprobes/common.h +++ b/arch/x86/kernel/kprobes/common.h @@ -105,14 +105,4 @@ static inline unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsig } #endif -#ifdef CONFIG_KPROBES_ON_FTRACE -extern int skip_singlestep(struct kprobe *p, struct pt_regs *regs, - struct kprobe_ctlblk *kcb); -#else -static inline int skip_singlestep(struct kprobe *p, struct pt_regs *regs, - struct kprobe_ctlblk *kcb) -{ - return 0; -} -#endif #endif diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 6f4d42377fe5..b0d1e81c96bb 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -66,8 +66,6 @@ #include "common.h" -void jprobe_return_end(void); - DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); @@ -395,8 +393,6 @@ int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn) - (u8 *) real; if ((s64) (s32) newdisp != newdisp) { pr_err("Kprobes error: new displacement does not fit into s32 (%llx)\n", newdisp); - pr_err("\tSrc: %p, Dest: %p, old disp: %x\n", - src, real, insn->displacement.value); return 0; } disp = (u8 *) dest + insn_offset_displacement(insn); @@ -596,7 +592,6 @@ static void setup_singlestep(struct kprobe *p, struct pt_regs *regs, * stepping. */ regs->ip = (unsigned long)p->ainsn.insn; - preempt_enable_no_resched(); return; } #endif @@ -640,8 +635,7 @@ static int reenter_kprobe(struct kprobe *p, struct pt_regs *regs, * Raise a BUG or we'll continue in an endless reentering loop * and eventually a stack overflow. */ - printk(KERN_WARNING "Unrecoverable kprobe detected at %p.\n", - p->addr); + pr_err("Unrecoverable kprobe detected.\n"); dump_kprobe(p); BUG(); default: @@ -669,12 +663,10 @@ int kprobe_int3_handler(struct pt_regs *regs) addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t)); /* - * We don't want to be preempted for the entire - * duration of kprobe processing. We conditionally - * re-enable preemption at the end of this function, - * and also in reenter_kprobe() and setup_singlestep(). + * We don't want to be preempted for the entire duration of kprobe + * processing. Since int3 and debug trap disables irqs and we clear + * IF while singlestepping, it must be no preemptible. */ - preempt_disable(); kcb = get_kprobe_ctlblk(); p = get_kprobe(addr); @@ -690,13 +682,14 @@ int kprobe_int3_handler(struct pt_regs *regs) /* * If we have no pre-handler or it returned 0, we * continue with normal processing. If we have a - * pre-handler and it returned non-zero, it prepped - * for calling the break_handler below on re-entry - * for jprobe processing, so get out doing nothing - * more here. + * pre-handler and it returned non-zero, that means + * user handler setup registers to exit to another + * instruction, we must skip the single stepping. */ if (!p->pre_handler || !p->pre_handler(p, regs)) setup_singlestep(p, regs, kcb, 0); + else + reset_current_kprobe(); return 1; } } else if (*addr != BREAKPOINT_INSTRUCTION) { @@ -710,18 +703,9 @@ int kprobe_int3_handler(struct pt_regs *regs) * the original instruction. */ regs->ip = (unsigned long)addr; - preempt_enable_no_resched(); return 1; - } else if (kprobe_running()) { - p = __this_cpu_read(current_kprobe); - if (p->break_handler && p->break_handler(p, regs)) { - if (!skip_singlestep(p, regs, kcb)) - setup_singlestep(p, regs, kcb, 0); - return 1; - } } /* else: not a kprobe fault; let the kernel handle it */ - preempt_enable_no_resched(); return 0; } NOKPROBE_SYMBOL(kprobe_int3_handler); @@ -972,8 +956,6 @@ int kprobe_debug_handler(struct pt_regs *regs) } reset_current_kprobe(); out: - preempt_enable_no_resched(); - /* * if somebody else is singlestepping across a probe point, flags * will have TF set, in which case, continue the remaining processing @@ -1020,7 +1002,6 @@ int kprobe_fault_handler(struct pt_regs *regs, int trapnr) restore_previous_kprobe(kcb); else reset_current_kprobe(); - preempt_enable_no_resched(); } else if (kcb->kprobe_status == KPROBE_HIT_ACTIVE || kcb->kprobe_status == KPROBE_HIT_SSDONE) { /* @@ -1083,93 +1064,6 @@ int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, } NOKPROBE_SYMBOL(kprobe_exceptions_notify); -int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) -{ - struct jprobe *jp = container_of(p, struct jprobe, kp); - unsigned long addr; - struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - - kcb->jprobe_saved_regs = *regs; - kcb->jprobe_saved_sp = stack_addr(regs); - addr = (unsigned long)(kcb->jprobe_saved_sp); - - /* - * As Linus pointed out, gcc assumes that the callee - * owns the argument space and could overwrite it, e.g. - * tailcall optimization. So, to be absolutely safe - * we also save and restore enough stack bytes to cover - * the argument area. - * Use __memcpy() to avoid KASAN stack out-of-bounds reports as we copy - * raw stack chunk with redzones: - */ - __memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr, MIN_STACK_SIZE(addr)); - regs->ip = (unsigned long)(jp->entry); - - /* - * jprobes use jprobe_return() which skips the normal return - * path of the function, and this messes up the accounting of the - * function graph tracer to get messed up. - * - * Pause function graph tracing while performing the jprobe function. - */ - pause_graph_tracing(); - return 1; -} -NOKPROBE_SYMBOL(setjmp_pre_handler); - -void jprobe_return(void) -{ - struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - - /* Unpoison stack redzones in the frames we are going to jump over. */ - kasan_unpoison_stack_above_sp_to(kcb->jprobe_saved_sp); - - asm volatile ( -#ifdef CONFIG_X86_64 - " xchg %%rbx,%%rsp \n" -#else - " xchgl %%ebx,%%esp \n" -#endif - " int3 \n" - " .globl jprobe_return_end\n" - " jprobe_return_end: \n" - " nop \n"::"b" - (kcb->jprobe_saved_sp):"memory"); -} -NOKPROBE_SYMBOL(jprobe_return); -NOKPROBE_SYMBOL(jprobe_return_end); - -int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) -{ - struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - u8 *addr = (u8 *) (regs->ip - 1); - struct jprobe *jp = container_of(p, struct jprobe, kp); - void *saved_sp = kcb->jprobe_saved_sp; - - if ((addr > (u8 *) jprobe_return) && - (addr < (u8 *) jprobe_return_end)) { - if (stack_addr(regs) != saved_sp) { - struct pt_regs *saved_regs = &kcb->jprobe_saved_regs; - printk(KERN_ERR - "current sp %p does not match saved sp %p\n", - stack_addr(regs), saved_sp); - printk(KERN_ERR "Saved registers for jprobe %p\n", jp); - show_regs(saved_regs); - printk(KERN_ERR "Current registers\n"); - show_regs(regs); - BUG(); - } - /* It's OK to start function graph tracing again */ - unpause_graph_tracing(); - *regs = kcb->jprobe_saved_regs; - __memcpy(saved_sp, kcb->jprobes_stack, MIN_STACK_SIZE(saved_sp)); - preempt_enable_no_resched(); - return 1; - } - return 0; -} -NOKPROBE_SYMBOL(longjmp_break_handler); - bool arch_within_kprobe_blacklist(unsigned long addr) { bool is_in_entry_trampoline_section = false; diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c index 8dc0161cec8f..ef819e19650b 100644 --- a/arch/x86/kernel/kprobes/ftrace.c +++ b/arch/x86/kernel/kprobes/ftrace.c @@ -25,36 +25,6 @@ #include "common.h" -static nokprobe_inline -void __skip_singlestep(struct kprobe *p, struct pt_regs *regs, - struct kprobe_ctlblk *kcb, unsigned long orig_ip) -{ - /* - * Emulate singlestep (and also recover regs->ip) - * as if there is a 5byte nop - */ - regs->ip = (unsigned long)p->addr + MCOUNT_INSN_SIZE; - if (unlikely(p->post_handler)) { - kcb->kprobe_status = KPROBE_HIT_SSDONE; - p->post_handler(p, regs, 0); - } - __this_cpu_write(current_kprobe, NULL); - if (orig_ip) - regs->ip = orig_ip; -} - -int skip_singlestep(struct kprobe *p, struct pt_regs *regs, - struct kprobe_ctlblk *kcb) -{ - if (kprobe_ftrace(p)) { - __skip_singlestep(p, regs, kcb, 0); - preempt_enable_no_resched(); - return 1; - } - return 0; -} -NOKPROBE_SYMBOL(skip_singlestep); - /* Ftrace callback handler for kprobes -- called under preepmt disabed */ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ops, struct pt_regs *regs) @@ -75,18 +45,25 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, /* Kprobe handler expects regs->ip = ip + 1 as breakpoint hit */ regs->ip = ip + sizeof(kprobe_opcode_t); - /* To emulate trap based kprobes, preempt_disable here */ - preempt_disable(); __this_cpu_write(current_kprobe, p); kcb->kprobe_status = KPROBE_HIT_ACTIVE; if (!p->pre_handler || !p->pre_handler(p, regs)) { - __skip_singlestep(p, regs, kcb, orig_ip); - preempt_enable_no_resched(); + /* + * Emulate singlestep (and also recover regs->ip) + * as if there is a 5byte nop + */ + regs->ip = (unsigned long)p->addr + MCOUNT_INSN_SIZE; + if (unlikely(p->post_handler)) { + kcb->kprobe_status = KPROBE_HIT_SSDONE; + p->post_handler(p, regs, 0); + } + regs->ip = orig_ip; } /* - * If pre_handler returns !0, it sets regs->ip and - * resets current kprobe, and keep preempt count +1. + * If pre_handler returns !0, it changes regs->ip. We have to + * skip emulating post_handler. */ + __this_cpu_write(current_kprobe, NULL); } } NOKPROBE_SYMBOL(kprobe_ftrace_handler); diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 203d398802a3..eaf02f2e7300 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -491,7 +491,6 @@ int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter) regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX; if (!reenter) reset_current_kprobe(); - preempt_enable_no_resched(); return 1; } return 0; diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 5b2300b818af..a37bda38d205 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -154,7 +154,7 @@ void kvm_async_pf_task_wait(u32 token, int interrupt_kernel) for (;;) { if (!n.halted) - prepare_to_swait(&n.wq, &wait, TASK_UNINTERRUPTIBLE); + prepare_to_swait_exclusive(&n.wq, &wait, TASK_UNINTERRUPTIBLE); if (hlist_unhashed(&n.link)) break; @@ -188,7 +188,7 @@ static void apf_task_wake_one(struct kvm_task_sleep_node *n) if (n->halted) smp_send_reschedule(n->cpu); else if (swq_has_sleeper(&n->wq)) - swake_up(&n->wq); + swake_up_one(&n->wq); } static void apf_task_wake_all(void) diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index bf8d1eb7fca3..3b8e7c13c614 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -138,6 +138,7 @@ static unsigned long kvm_get_tsc_khz(void) src = &hv_clock[cpu].pvti; tsc_khz = pvclock_tsc_khz(src); put_cpu(); + setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); return tsc_khz; } @@ -319,6 +320,8 @@ void __init kvmclock_init(void) printk(KERN_INFO "kvm-clock: Using msrs %x and %x", msr_kvm_system_time, msr_kvm_wall_clock); + pvclock_set_pvti_cpu0_va(hv_clock); + if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT); @@ -366,14 +369,11 @@ int __init kvm_setup_vsyscall_timeinfo(void) vcpu_time = &hv_clock[cpu].pvti; flags = pvclock_read_flags(vcpu_time); - if (!(flags & PVCLOCK_TSC_STABLE_BIT)) { - put_cpu(); - return 1; - } - - pvclock_set_pvti_cpu0_va(hv_clock); put_cpu(); + if (!(flags & PVCLOCK_TSC_STABLE_BIT)) + return 1; + kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK; #endif return 0; diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c index 9edadabf04f6..9cb98f7b07c9 100644 --- a/arch/x86/kernel/paravirt_patch_64.c +++ b/arch/x86/kernel/paravirt_patch_64.c @@ -20,7 +20,7 @@ DEF_NATIVE(, mov64, "mov %rdi, %rax"); #if defined(CONFIG_PARAVIRT_SPINLOCKS) DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%rdi)"); -DEF_NATIVE(pv_lock_ops, vcpu_is_preempted, "xor %rax, %rax"); +DEF_NATIVE(pv_lock_ops, vcpu_is_preempted, "xor %eax, %eax"); #endif unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len) diff --git a/arch/x86/kernel/pci-iommu_table.c b/arch/x86/kernel/pci-iommu_table.c index 4dfd90a75e63..2e9006c1e240 100644 --- a/arch/x86/kernel/pci-iommu_table.c +++ b/arch/x86/kernel/pci-iommu_table.c @@ -60,7 +60,7 @@ void __init check_iommu_entries(struct iommu_table_entry *start, printk(KERN_ERR "CYCLIC DEPENDENCY FOUND! %pS depends on %pS and vice-versa. BREAKING IT.\n", p->detect, q->detect); /* Heavy handed way..*/ - x->depend = 0; + x->depend = NULL; } } diff --git a/arch/x86/kernel/pcspeaker.c b/arch/x86/kernel/pcspeaker.c index da5190a1ea16..4a710ffffd9a 100644 --- a/arch/x86/kernel/pcspeaker.c +++ b/arch/x86/kernel/pcspeaker.c @@ -9,6 +9,6 @@ static __init int add_pcspkr(void) pd = platform_device_register_simple("pcspkr", -1, NULL, 0); - return IS_ERR(pd) ? PTR_ERR(pd) : 0; + return PTR_ERR_OR_ZERO(pd); } device_initcall(add_pcspkr); diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 093f2ea5dd56..7627455047c2 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -81,16 +81,6 @@ EXPORT_SYMBOL_GPL(save_stack_trace_tsk); #ifdef CONFIG_HAVE_RELIABLE_STACKTRACE -#define STACKTRACE_DUMP_ONCE(task) ({ \ - static bool __section(.data.unlikely) __dumped; \ - \ - if (!__dumped) { \ - __dumped = true; \ - WARN_ON(1); \ - show_stack(task, NULL); \ - } \ -}) - static int __always_inline __save_stack_trace_reliable(struct stack_trace *trace, struct task_struct *task) @@ -99,30 +89,25 @@ __save_stack_trace_reliable(struct stack_trace *trace, struct pt_regs *regs; unsigned long addr; - for (unwind_start(&state, task, NULL, NULL); !unwind_done(&state); + for (unwind_start(&state, task, NULL, NULL); + !unwind_done(&state) && !unwind_error(&state); unwind_next_frame(&state)) { regs = unwind_get_entry_regs(&state, NULL); if (regs) { + /* Success path for user tasks */ + if (user_mode(regs)) + goto success; + /* * Kernel mode registers on the stack indicate an * in-kernel interrupt or exception (e.g., preemption * or a page fault), which can make frame pointers * unreliable. */ - if (!user_mode(regs)) - return -EINVAL; - /* - * The last frame contains the user mode syscall - * pt_regs. Skip it and finish the unwind. - */ - unwind_next_frame(&state); - if (!unwind_done(&state)) { - STACKTRACE_DUMP_ONCE(task); + if (IS_ENABLED(CONFIG_FRAME_POINTER)) return -EINVAL; - } - break; } addr = unwind_get_return_address(&state); @@ -132,21 +117,22 @@ __save_stack_trace_reliable(struct stack_trace *trace, * generated code which __kernel_text_address() doesn't know * about. */ - if (!addr) { - STACKTRACE_DUMP_ONCE(task); + if (!addr) return -EINVAL; - } if (save_stack_address(trace, addr, false)) return -EINVAL; } /* Check for stack corruption */ - if (unwind_error(&state)) { - STACKTRACE_DUMP_ONCE(task); + if (unwind_error(&state)) + return -EINVAL; + + /* Success path for non-user tasks, i.e. kthreads and idle tasks */ + if (!(task->flags & (PF_KTHREAD | PF_IDLE))) return -EINVAL; - } +success: if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = ULONG_MAX; diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index feb28fee6cea..26038eacf74a 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -198,7 +198,7 @@ static int orc_sort_cmp(const void *_a, const void *_b) * whitelisted .o files which didn't get objtool generation. */ orc_a = cur_orc_table + (a - cur_orc_ip_table); - return orc_a->sp_reg == ORC_REG_UNDEFINED ? -1 : 1; + return orc_a->sp_reg == ORC_REG_UNDEFINED && !orc_a->end ? -1 : 1; } #ifdef CONFIG_MODULES @@ -352,7 +352,7 @@ static bool deref_stack_iret_regs(struct unwind_state *state, unsigned long addr bool unwind_next_frame(struct unwind_state *state) { - unsigned long ip_p, sp, orig_ip, prev_sp = state->sp; + unsigned long ip_p, sp, orig_ip = state->ip, prev_sp = state->sp; enum stack_type prev_type = state->stack_info.type; struct orc_entry *orc; bool indirect = false; @@ -363,9 +363,9 @@ bool unwind_next_frame(struct unwind_state *state) /* Don't let modules unload while we're reading their ORC data. */ preempt_disable(); - /* Have we reached the end? */ + /* End-of-stack check for user tasks: */ if (state->regs && user_mode(state->regs)) - goto done; + goto the_end; /* * Find the orc_entry associated with the text address. @@ -374,9 +374,16 @@ bool unwind_next_frame(struct unwind_state *state) * calls and calls to noreturn functions. */ orc = orc_find(state->signal ? state->ip : state->ip - 1); - if (!orc || orc->sp_reg == ORC_REG_UNDEFINED) - goto done; - orig_ip = state->ip; + if (!orc) + goto err; + + /* End-of-stack check for kernel threads: */ + if (orc->sp_reg == ORC_REG_UNDEFINED) { + if (!orc->end) + goto err; + + goto the_end; + } /* Find the previous frame's stack: */ switch (orc->sp_reg) { @@ -402,7 +409,7 @@ bool unwind_next_frame(struct unwind_state *state) if (!state->regs || !state->full_regs) { orc_warn("missing regs for base reg R10 at ip %pB\n", (void *)state->ip); - goto done; + goto err; } sp = state->regs->r10; break; @@ -411,7 +418,7 @@ bool unwind_next_frame(struct unwind_state *state) if (!state->regs || !state->full_regs) { orc_warn("missing regs for base reg R13 at ip %pB\n", (void *)state->ip); - goto done; + goto err; } sp = state->regs->r13; break; @@ -420,7 +427,7 @@ bool unwind_next_frame(struct unwind_state *state) if (!state->regs || !state->full_regs) { orc_warn("missing regs for base reg DI at ip %pB\n", (void *)state->ip); - goto done; + goto err; } sp = state->regs->di; break; @@ -429,7 +436,7 @@ bool unwind_next_frame(struct unwind_state *state) if (!state->regs || !state->full_regs) { orc_warn("missing regs for base reg DX at ip %pB\n", (void *)state->ip); - goto done; + goto err; } sp = state->regs->dx; break; @@ -437,12 +444,12 @@ bool unwind_next_frame(struct unwind_state *state) default: orc_warn("unknown SP base reg %d for ip %pB\n", orc->sp_reg, (void *)state->ip); - goto done; + goto err; } if (indirect) { if (!deref_stack_reg(state, sp, &sp)) - goto done; + goto err; } /* Find IP, SP and possibly regs: */ @@ -451,7 +458,7 @@ bool unwind_next_frame(struct unwind_state *state) ip_p = sp - sizeof(long); if (!deref_stack_reg(state, ip_p, &state->ip)) - goto done; + goto err; state->ip = ftrace_graph_ret_addr(state->task, &state->graph_idx, state->ip, (void *)ip_p); @@ -465,7 +472,7 @@ bool unwind_next_frame(struct unwind_state *state) if (!deref_stack_regs(state, sp, &state->ip, &state->sp)) { orc_warn("can't dereference registers at %p for ip %pB\n", (void *)sp, (void *)orig_ip); - goto done; + goto err; } state->regs = (struct pt_regs *)sp; @@ -477,7 +484,7 @@ bool unwind_next_frame(struct unwind_state *state) if (!deref_stack_iret_regs(state, sp, &state->ip, &state->sp)) { orc_warn("can't dereference iret registers at %p for ip %pB\n", (void *)sp, (void *)orig_ip); - goto done; + goto err; } state->regs = (void *)sp - IRET_FRAME_OFFSET; @@ -500,18 +507,18 @@ bool unwind_next_frame(struct unwind_state *state) case ORC_REG_PREV_SP: if (!deref_stack_reg(state, sp + orc->bp_offset, &state->bp)) - goto done; + goto err; break; case ORC_REG_BP: if (!deref_stack_reg(state, state->bp + orc->bp_offset, &state->bp)) - goto done; + goto err; break; default: orc_warn("unknown BP base reg %d for ip %pB\n", orc->bp_reg, (void *)orig_ip); - goto done; + goto err; } /* Prevent a recursive loop due to bad ORC data: */ @@ -520,13 +527,16 @@ bool unwind_next_frame(struct unwind_state *state) state->sp <= prev_sp) { orc_warn("stack going in the wrong direction? ip=%pB\n", (void *)orig_ip); - goto done; + goto err; } preempt_enable(); return true; -done: +err: + state->error = true; + +the_end: preempt_enable(); state->stack_info.type = STACK_TYPE_UNKNOWN; return false; diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 92fd433c50b9..1bbec387d289 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -85,7 +85,7 @@ config KVM_AMD_SEV def_bool y bool "AMD Secure Encrypted Virtualization (SEV) support" depends on KVM_AMD && X86_64 - depends on CRYPTO_DEV_CCP && CRYPTO_DEV_CCP_DD && CRYPTO_DEV_SP_PSP + depends on CRYPTO_DEV_SP_PSP && !(KVM_AMD=y && CRYPTO_DEV_CCP_DD=m) ---help--- Provides support for launching Encrypted VMs on AMD processors. diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index b5cd8465d44f..d536d457517b 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1379,7 +1379,7 @@ static void apic_timer_expired(struct kvm_lapic *apic) * using swait_active() is safe. */ if (swait_active(q)) - swake_up(q); + swake_up_one(q); if (apic_lvtt_tscdeadline(apic)) ktimer->expired_tscdeadline = ktimer->tscdeadline; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d594690d8b95..6b8f11521c41 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -890,7 +890,7 @@ static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, if (cache->nobjs >= min) return 0; while (cache->nobjs < ARRAY_SIZE(cache->objects)) { - page = (void *)__get_free_page(GFP_KERNEL); + page = (void *)__get_free_page(GFP_KERNEL_ACCOUNT); if (!page) return -ENOMEM; cache->objects[cache->nobjs++] = page; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 1689f433f3a0..5d8e317c2b04 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2571,6 +2571,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); #ifdef CONFIG_X86_64 int cpu = raw_smp_processor_id(); + unsigned long fs_base, kernel_gs_base; #endif int i; @@ -2586,12 +2587,20 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel; #ifdef CONFIG_X86_64 - save_fsgs_for_kvm(); - vmx->host_state.fs_sel = current->thread.fsindex; - vmx->host_state.gs_sel = current->thread.gsindex; -#else - savesegment(fs, vmx->host_state.fs_sel); - savesegment(gs, vmx->host_state.gs_sel); + if (likely(is_64bit_mm(current->mm))) { + save_fsgs_for_kvm(); + vmx->host_state.fs_sel = current->thread.fsindex; + vmx->host_state.gs_sel = current->thread.gsindex; + fs_base = current->thread.fsbase; + kernel_gs_base = current->thread.gsbase; + } else { +#endif + savesegment(fs, vmx->host_state.fs_sel); + savesegment(gs, vmx->host_state.gs_sel); +#ifdef CONFIG_X86_64 + fs_base = read_msr(MSR_FS_BASE); + kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); + } #endif if (!(vmx->host_state.fs_sel & 7)) { vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel); @@ -2611,10 +2620,10 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) savesegment(ds, vmx->host_state.ds_sel); savesegment(es, vmx->host_state.es_sel); - vmcs_writel(HOST_FS_BASE, current->thread.fsbase); + vmcs_writel(HOST_FS_BASE, fs_base); vmcs_writel(HOST_GS_BASE, cpu_kernelmode_gs_base(cpu)); - vmx->msr_host_kernel_gs_base = current->thread.gsbase; + vmx->msr_host_kernel_gs_base = kernel_gs_base; if (is_long_mode(&vmx->vcpu)) wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); #else @@ -4322,11 +4331,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) vmcs_conf->order = get_order(vmcs_conf->size); vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff; - /* KVM supports Enlightened VMCS v1 only */ - if (static_branch_unlikely(&enable_evmcs)) - vmcs_conf->revision_id = KVM_EVMCS_VERSION; - else - vmcs_conf->revision_id = vmx_msr_low; + vmcs_conf->revision_id = vmx_msr_low; vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; @@ -4396,7 +4401,13 @@ static struct vmcs *alloc_vmcs_cpu(int cpu) return NULL; vmcs = page_address(pages); memset(vmcs, 0, vmcs_config.size); - vmcs->revision_id = vmcs_config.revision_id; /* vmcs revision id */ + + /* KVM supports Enlightened VMCS v1 only */ + if (static_branch_unlikely(&enable_evmcs)) + vmcs->revision_id = KVM_EVMCS_VERSION; + else + vmcs->revision_id = vmcs_config.revision_id; + return vmcs; } @@ -4564,6 +4575,19 @@ static __init int alloc_kvm_area(void) return -ENOMEM; } + /* + * When eVMCS is enabled, alloc_vmcs_cpu() sets + * vmcs->revision_id to KVM_EVMCS_VERSION instead of + * revision_id reported by MSR_IA32_VMX_BASIC. + * + * However, even though not explictly documented by + * TLFS, VMXArea passed as VMXON argument should + * still be marked with revision_id reported by + * physical CPU. + */ + if (static_branch_unlikely(&enable_evmcs)) + vmcs->revision_id = vmcs_config.revision_id; + per_cpu(vmxarea, cpu) = vmcs; } return 0; @@ -7869,6 +7893,8 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) HRTIMER_MODE_REL_PINNED); vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; + vmx->nested.vpid02 = allocate_vpid(); + vmx->nested.vmxon = true; return 0; @@ -8456,21 +8482,20 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) /* Emulate the VMPTRST instruction */ static int handle_vmptrst(struct kvm_vcpu *vcpu) { - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); - gva_t vmcs_gva; + unsigned long exit_qual = vmcs_readl(EXIT_QUALIFICATION); + u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); + gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr; struct x86_exception e; + gva_t gva; if (!nested_vmx_check_permission(vcpu)) return 1; - if (get_vmx_mem_address(vcpu, exit_qualification, - vmx_instruction_info, true, &vmcs_gva)) + if (get_vmx_mem_address(vcpu, exit_qual, instr_info, true, &gva)) return 1; /* *_system ok, nested_vmx_check_permission has verified cpl=0 */ - if (kvm_write_guest_virt_system(vcpu, vmcs_gva, - (void *)&to_vmx(vcpu)->nested.current_vmptr, - sizeof(u64), &e)) { + if (kvm_write_guest_virt_system(vcpu, gva, (void *)¤t_vmptr, + sizeof(gpa_t), &e)) { kvm_inject_page_fault(vcpu, &e); return 1; } @@ -10346,11 +10371,9 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) goto free_vmcs; } - if (nested) { + if (nested) nested_vmx_setup_ctls_msrs(&vmx->nested.msrs, kvm_vcpu_apicv_active(&vmx->vcpu)); - vmx->nested.vpid02 = allocate_vpid(); - } vmx->nested.posted_intr_nv = -1; vmx->nested.current_vmptr = -1ull; @@ -10367,7 +10390,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) return &vmx->vcpu; free_vmcs: - free_vpid(vmx->nested.vpid02); free_loaded_vmcs(vmx->loaded_vmcs); free_msrs: kfree(vmx->guest_msrs); @@ -11753,7 +11775,6 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - u32 msr_entry_idx; u32 exit_qual; int r; @@ -11775,10 +11796,10 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu) nested_get_vmcs12_pages(vcpu, vmcs12); r = EXIT_REASON_MSR_LOAD_FAIL; - msr_entry_idx = nested_vmx_load_msr(vcpu, - vmcs12->vm_entry_msr_load_addr, - vmcs12->vm_entry_msr_load_count); - if (msr_entry_idx) + exit_qual = nested_vmx_load_msr(vcpu, + vmcs12->vm_entry_msr_load_addr, + vmcs12->vm_entry_msr_load_count); + if (exit_qual) goto fail; /* diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0046aa70205a..2b812b3c5088 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1097,6 +1097,7 @@ static u32 msr_based_features[] = { MSR_F10H_DECFG, MSR_IA32_UCODE_REV, + MSR_IA32_ARCH_CAPABILITIES, }; static unsigned int num_msr_based_features; @@ -1105,7 +1106,8 @@ static int kvm_get_msr_feature(struct kvm_msr_entry *msr) { switch (msr->index) { case MSR_IA32_UCODE_REV: - rdmsrl(msr->index, msr->data); + case MSR_IA32_ARCH_CAPABILITIES: + rdmsrl_safe(msr->index, &msr->data); break; default: if (kvm_x86_ops->get_msr_feature(msr)) diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index 298ef1479240..3b24dc05251c 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -256,7 +256,7 @@ ENTRY(__memcpy_mcsafe) /* Copy successful. Return zero */ .L_done_memcpy_trap: - xorq %rax, %rax + xorl %eax, %eax ret ENDPROC(__memcpy_mcsafe) EXPORT_SYMBOL_GPL(__memcpy_mcsafe) diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c index 34a2a3bfde9c..b54d52a2d00a 100644 --- a/arch/x86/mm/numa_emulation.c +++ b/arch/x86/mm/numa_emulation.c @@ -61,7 +61,7 @@ static int __init emu_setup_memblk(struct numa_meminfo *ei, eb->nid = nid; if (emu_nid_to_phys[nid] == NUMA_NO_NODE) - emu_nid_to_phys[nid] = nid; + emu_nid_to_phys[nid] = pb->nid; pb->start += size; if (pb->start >= pb->end) { @@ -198,40 +198,73 @@ static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size) return end; } +static u64 uniform_size(u64 max_addr, u64 base, u64 hole, int nr_nodes) +{ + unsigned long max_pfn = PHYS_PFN(max_addr); + unsigned long base_pfn = PHYS_PFN(base); + unsigned long hole_pfns = PHYS_PFN(hole); + + return PFN_PHYS((max_pfn - base_pfn - hole_pfns) / nr_nodes); +} + /* * Sets up fake nodes of `size' interleaved over physical nodes ranging from * `addr' to `max_addr'. * * Returns zero on success or negative on error. */ -static int __init split_nodes_size_interleave(struct numa_meminfo *ei, +static int __init split_nodes_size_interleave_uniform(struct numa_meminfo *ei, struct numa_meminfo *pi, - u64 addr, u64 max_addr, u64 size) + u64 addr, u64 max_addr, u64 size, + int nr_nodes, struct numa_memblk *pblk, + int nid) { nodemask_t physnode_mask = numa_nodes_parsed; + int i, ret, uniform = 0; u64 min_size; - int nid = 0; - int i, ret; - if (!size) + if ((!size && !nr_nodes) || (nr_nodes && !pblk)) return -1; + /* - * The limit on emulated nodes is MAX_NUMNODES, so the size per node is - * increased accordingly if the requested size is too small. This - * creates a uniform distribution of node sizes across the entire - * machine (but not necessarily over physical nodes). + * In the 'uniform' case split the passed in physical node by + * nr_nodes, in the non-uniform case, ignore the passed in + * physical block and try to create nodes of at least size + * @size. + * + * In the uniform case, split the nodes strictly by physical + * capacity, i.e. ignore holes. In the non-uniform case account + * for holes and treat @size as a minimum floor. */ - min_size = (max_addr - addr - mem_hole_size(addr, max_addr)) / MAX_NUMNODES; - min_size = max(min_size, FAKE_NODE_MIN_SIZE); - if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size) - min_size = (min_size + FAKE_NODE_MIN_SIZE) & - FAKE_NODE_MIN_HASH_MASK; + if (!nr_nodes) + nr_nodes = MAX_NUMNODES; + else { + nodes_clear(physnode_mask); + node_set(pblk->nid, physnode_mask); + uniform = 1; + } + + if (uniform) { + min_size = uniform_size(max_addr, addr, 0, nr_nodes); + size = min_size; + } else { + /* + * The limit on emulated nodes is MAX_NUMNODES, so the + * size per node is increased accordingly if the + * requested size is too small. This creates a uniform + * distribution of node sizes across the entire machine + * (but not necessarily over physical nodes). + */ + min_size = uniform_size(max_addr, addr, + mem_hole_size(addr, max_addr), nr_nodes); + } + min_size = ALIGN(max(min_size, FAKE_NODE_MIN_SIZE), FAKE_NODE_MIN_SIZE); if (size < min_size) { pr_err("Fake node size %LuMB too small, increasing to %LuMB\n", size >> 20, min_size >> 20); size = min_size; } - size &= FAKE_NODE_MIN_HASH_MASK; + size = ALIGN_DOWN(size, FAKE_NODE_MIN_SIZE); /* * Fill physical nodes with fake nodes of size until there is no memory @@ -248,10 +281,14 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei, node_clear(i, physnode_mask); continue; } + start = pi->blk[phys_blk].start; limit = pi->blk[phys_blk].end; - end = find_end_of_node(start, limit, size); + if (uniform) + end = start + size; + else + end = find_end_of_node(start, limit, size); /* * If there won't be at least FAKE_NODE_MIN_SIZE of * non-reserved memory in ZONE_DMA32 for the next node, @@ -266,7 +303,8 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei, * next node, this one must extend to the end of the * physical node. */ - if (limit - end - mem_hole_size(end, limit) < size) + if ((limit - end - mem_hole_size(end, limit) < size) + && !uniform) end = limit; ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES, @@ -276,7 +314,15 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei, return ret; } } - return 0; + return nid; +} + +static int __init split_nodes_size_interleave(struct numa_meminfo *ei, + struct numa_meminfo *pi, + u64 addr, u64 max_addr, u64 size) +{ + return split_nodes_size_interleave_uniform(ei, pi, addr, max_addr, size, + 0, NULL, NUMA_NO_NODE); } int __init setup_emu2phys_nid(int *dfl_phys_nid) @@ -346,7 +392,28 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) * the fixed node size. Otherwise, if it is just a single number N, * split the system RAM into N fake nodes. */ - if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) { + if (strchr(emu_cmdline, 'U')) { + nodemask_t physnode_mask = numa_nodes_parsed; + unsigned long n; + int nid = 0; + + n = simple_strtoul(emu_cmdline, &emu_cmdline, 0); + ret = -1; + for_each_node_mask(i, physnode_mask) { + ret = split_nodes_size_interleave_uniform(&ei, &pi, + pi.blk[i].start, pi.blk[i].end, 0, + n, &pi.blk[i], nid); + if (ret < 0) + break; + if (ret < n) { + pr_info("%s: phys: %d only got %d of %ld nodes, failing\n", + __func__, i, ret, n); + ret = -1; + break; + } + nid = ret; + } + } else if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) { u64 size; size = memparse(emu_cmdline, &emu_cmdline); diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c index 55799873ebe5..8f6cc71e0848 100644 --- a/arch/x86/net/bpf_jit_comp32.c +++ b/arch/x86/net/bpf_jit_comp32.c @@ -1441,8 +1441,8 @@ static void emit_prologue(u8 **pprog, u32 stack_depth) /* sub esp,STACK_SIZE */ EMIT2_off32(0x81, 0xEC, STACK_SIZE); - /* sub ebp,SCRATCH_SIZE+4+12*/ - EMIT3(0x83, add_1reg(0xE8, IA32_EBP), SCRATCH_SIZE + 16); + /* sub ebp,SCRATCH_SIZE+12*/ + EMIT3(0x83, add_1reg(0xE8, IA32_EBP), SCRATCH_SIZE + 12); /* xor ebx,ebx */ EMIT2(0x31, add_2reg(0xC0, IA32_EBX, IA32_EBX)); @@ -1475,8 +1475,8 @@ static void emit_epilogue(u8 **pprog, u32 stack_depth) /* mov edx,dword ptr [ebp+off]*/ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX), STACK_VAR(r0[1])); - /* add ebp,SCRATCH_SIZE+4+12*/ - EMIT3(0x83, add_1reg(0xC0, IA32_EBP), SCRATCH_SIZE + 16); + /* add ebp,SCRATCH_SIZE+12*/ + EMIT3(0x83, add_1reg(0xC0, IA32_EBP), SCRATCH_SIZE + 12); /* mov ebx,dword ptr [ebp-12]*/ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EBX), -12); diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 77873ce700ae..ee5d08f25ce4 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -417,7 +417,7 @@ static void __init __map_region(efi_memory_desc_t *md, u64 va) if (!(md->attribute & EFI_MEMORY_WB)) flags |= _PAGE_PCD; - if (sev_active()) + if (sev_active() && md->type != EFI_MEMORY_MAPPED_IO) flags |= _PAGE_ENC; pfn = md->phys_addr >> PAGE_SHIFT; @@ -636,6 +636,8 @@ void efi_switch_mm(struct mm_struct *mm) #ifdef CONFIG_EFI_MIXED extern efi_status_t efi64_thunk(u32, ...); +static DEFINE_SPINLOCK(efi_runtime_lock); + #define runtime_service32(func) \ ({ \ u32 table = (u32)(unsigned long)efi.systab; \ @@ -657,17 +659,14 @@ extern efi_status_t efi64_thunk(u32, ...); #define efi_thunk(f, ...) \ ({ \ efi_status_t __s; \ - unsigned long __flags; \ u32 __func; \ \ - local_irq_save(__flags); \ arch_efi_call_virt_setup(); \ \ __func = runtime_service32(f); \ __s = efi64_thunk(__func, __VA_ARGS__); \ \ arch_efi_call_virt_teardown(); \ - local_irq_restore(__flags); \ \ __s; \ }) @@ -702,14 +701,17 @@ static efi_status_t efi_thunk_get_time(efi_time_t *tm, efi_time_cap_t *tc) { efi_status_t status; u32 phys_tm, phys_tc; + unsigned long flags; spin_lock(&rtc_lock); + spin_lock_irqsave(&efi_runtime_lock, flags); phys_tm = virt_to_phys_or_null(tm); phys_tc = virt_to_phys_or_null(tc); status = efi_thunk(get_time, phys_tm, phys_tc); + spin_unlock_irqrestore(&efi_runtime_lock, flags); spin_unlock(&rtc_lock); return status; @@ -719,13 +721,16 @@ static efi_status_t efi_thunk_set_time(efi_time_t *tm) { efi_status_t status; u32 phys_tm; + unsigned long flags; spin_lock(&rtc_lock); + spin_lock_irqsave(&efi_runtime_lock, flags); phys_tm = virt_to_phys_or_null(tm); status = efi_thunk(set_time, phys_tm); + spin_unlock_irqrestore(&efi_runtime_lock, flags); spin_unlock(&rtc_lock); return status; @@ -737,8 +742,10 @@ efi_thunk_get_wakeup_time(efi_bool_t *enabled, efi_bool_t *pending, { efi_status_t status; u32 phys_enabled, phys_pending, phys_tm; + unsigned long flags; spin_lock(&rtc_lock); + spin_lock_irqsave(&efi_runtime_lock, flags); phys_enabled = virt_to_phys_or_null(enabled); phys_pending = virt_to_phys_or_null(pending); @@ -747,6 +754,7 @@ efi_thunk_get_wakeup_time(efi_bool_t *enabled, efi_bool_t *pending, status = efi_thunk(get_wakeup_time, phys_enabled, phys_pending, phys_tm); + spin_unlock_irqrestore(&efi_runtime_lock, flags); spin_unlock(&rtc_lock); return status; @@ -757,13 +765,16 @@ efi_thunk_set_wakeup_time(efi_bool_t enabled, efi_time_t *tm) { efi_status_t status; u32 phys_tm; + unsigned long flags; spin_lock(&rtc_lock); + spin_lock_irqsave(&efi_runtime_lock, flags); phys_tm = virt_to_phys_or_null(tm); status = efi_thunk(set_wakeup_time, enabled, phys_tm); + spin_unlock_irqrestore(&efi_runtime_lock, flags); spin_unlock(&rtc_lock); return status; @@ -781,6 +792,9 @@ efi_thunk_get_variable(efi_char16_t *name, efi_guid_t *vendor, efi_status_t status; u32 phys_name, phys_vendor, phys_attr; u32 phys_data_size, phys_data; + unsigned long flags; + + spin_lock_irqsave(&efi_runtime_lock, flags); phys_data_size = virt_to_phys_or_null(data_size); phys_vendor = virt_to_phys_or_null(vendor); @@ -791,6 +805,8 @@ efi_thunk_get_variable(efi_char16_t *name, efi_guid_t *vendor, status = efi_thunk(get_variable, phys_name, phys_vendor, phys_attr, phys_data_size, phys_data); + spin_unlock_irqrestore(&efi_runtime_lock, flags); + return status; } @@ -800,6 +816,34 @@ efi_thunk_set_variable(efi_char16_t *name, efi_guid_t *vendor, { u32 phys_name, phys_vendor, phys_data; efi_status_t status; + unsigned long flags; + + spin_lock_irqsave(&efi_runtime_lock, flags); + + phys_name = virt_to_phys_or_null_size(name, efi_name_size(name)); + phys_vendor = virt_to_phys_or_null(vendor); + phys_data = virt_to_phys_or_null_size(data, data_size); + + /* If data_size is > sizeof(u32) we've got problems */ + status = efi_thunk(set_variable, phys_name, phys_vendor, + attr, data_size, phys_data); + + spin_unlock_irqrestore(&efi_runtime_lock, flags); + + return status; +} + +static efi_status_t +efi_thunk_set_variable_nonblocking(efi_char16_t *name, efi_guid_t *vendor, + u32 attr, unsigned long data_size, + void *data) +{ + u32 phys_name, phys_vendor, phys_data; + efi_status_t status; + unsigned long flags; + + if (!spin_trylock_irqsave(&efi_runtime_lock, flags)) + return EFI_NOT_READY; phys_name = virt_to_phys_or_null_size(name, efi_name_size(name)); phys_vendor = virt_to_phys_or_null(vendor); @@ -809,6 +853,8 @@ efi_thunk_set_variable(efi_char16_t *name, efi_guid_t *vendor, status = efi_thunk(set_variable, phys_name, phys_vendor, attr, data_size, phys_data); + spin_unlock_irqrestore(&efi_runtime_lock, flags); + return status; } @@ -819,6 +865,9 @@ efi_thunk_get_next_variable(unsigned long *name_size, { efi_status_t status; u32 phys_name_size, phys_name, phys_vendor; + unsigned long flags; + + spin_lock_irqsave(&efi_runtime_lock, flags); phys_name_size = virt_to_phys_or_null(name_size); phys_vendor = virt_to_phys_or_null(vendor); @@ -827,6 +876,8 @@ efi_thunk_get_next_variable(unsigned long *name_size, status = efi_thunk(get_next_variable, phys_name_size, phys_name, phys_vendor); + spin_unlock_irqrestore(&efi_runtime_lock, flags); + return status; } @@ -835,10 +886,15 @@ efi_thunk_get_next_high_mono_count(u32 *count) { efi_status_t status; u32 phys_count; + unsigned long flags; + + spin_lock_irqsave(&efi_runtime_lock, flags); phys_count = virt_to_phys_or_null(count); status = efi_thunk(get_next_high_mono_count, phys_count); + spin_unlock_irqrestore(&efi_runtime_lock, flags); + return status; } @@ -847,10 +903,15 @@ efi_thunk_reset_system(int reset_type, efi_status_t status, unsigned long data_size, efi_char16_t *data) { u32 phys_data; + unsigned long flags; + + spin_lock_irqsave(&efi_runtime_lock, flags); phys_data = virt_to_phys_or_null_size(data, data_size); efi_thunk(reset_system, reset_type, status, data_size, phys_data); + + spin_unlock_irqrestore(&efi_runtime_lock, flags); } static efi_status_t @@ -872,10 +933,40 @@ efi_thunk_query_variable_info(u32 attr, u64 *storage_space, { efi_status_t status; u32 phys_storage, phys_remaining, phys_max; + unsigned long flags; + + if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION) + return EFI_UNSUPPORTED; + + spin_lock_irqsave(&efi_runtime_lock, flags); + + phys_storage = virt_to_phys_or_null(storage_space); + phys_remaining = virt_to_phys_or_null(remaining_space); + phys_max = virt_to_phys_or_null(max_variable_size); + + status = efi_thunk(query_variable_info, attr, phys_storage, + phys_remaining, phys_max); + + spin_unlock_irqrestore(&efi_runtime_lock, flags); + + return status; +} + +static efi_status_t +efi_thunk_query_variable_info_nonblocking(u32 attr, u64 *storage_space, + u64 *remaining_space, + u64 *max_variable_size) +{ + efi_status_t status; + u32 phys_storage, phys_remaining, phys_max; + unsigned long flags; if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION) return EFI_UNSUPPORTED; + if (!spin_trylock_irqsave(&efi_runtime_lock, flags)) + return EFI_NOT_READY; + phys_storage = virt_to_phys_or_null(storage_space); phys_remaining = virt_to_phys_or_null(remaining_space); phys_max = virt_to_phys_or_null(max_variable_size); @@ -883,6 +974,8 @@ efi_thunk_query_variable_info(u32 attr, u64 *storage_space, status = efi_thunk(query_variable_info, attr, phys_storage, phys_remaining, phys_max); + spin_unlock_irqrestore(&efi_runtime_lock, flags); + return status; } @@ -908,9 +1001,11 @@ void efi_thunk_runtime_setup(void) efi.get_variable = efi_thunk_get_variable; efi.get_next_variable = efi_thunk_get_next_variable; efi.set_variable = efi_thunk_set_variable; + efi.set_variable_nonblocking = efi_thunk_set_variable_nonblocking; efi.get_next_high_mono_count = efi_thunk_get_next_high_mono_count; efi.reset_system = efi_thunk_reset_system; efi.query_variable_info = efi_thunk_query_variable_info; + efi.query_variable_info_nonblocking = efi_thunk_query_variable_info_nonblocking; efi.update_capsule = efi_thunk_update_capsule; efi.query_capsule_caps = efi_thunk_query_capsule_caps; } diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index 36c1f8b9f7e0..844d31cb8a0c 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -105,12 +105,11 @@ early_param("efi_no_storage_paranoia", setup_storage_paranoia); */ void efi_delete_dummy_variable(void) { - efi.set_variable((efi_char16_t *)efi_dummy_name, - &EFI_DUMMY_GUID, - EFI_VARIABLE_NON_VOLATILE | - EFI_VARIABLE_BOOTSERVICE_ACCESS | - EFI_VARIABLE_RUNTIME_ACCESS, - 0, NULL); + efi.set_variable_nonblocking((efi_char16_t *)efi_dummy_name, + &EFI_DUMMY_GUID, + EFI_VARIABLE_NON_VOLATILE | + EFI_VARIABLE_BOOTSERVICE_ACCESS | + EFI_VARIABLE_RUNTIME_ACCESS, 0, NULL); } /* @@ -249,7 +248,8 @@ void __init efi_arch_mem_reserve(phys_addr_t addr, u64 size) int num_entries; void *new; - if (efi_mem_desc_lookup(addr, &md)) { + if (efi_mem_desc_lookup(addr, &md) || + md.type != EFI_BOOT_SERVICES_DATA) { pr_err("Failed to lookup EFI memory descriptor for %pa\n", &addr); return; } diff --git a/arch/x86/power/hibernate_asm_64.S b/arch/x86/power/hibernate_asm_64.S index ce8da3a0412c..fd369a6e9ff8 100644 --- a/arch/x86/power/hibernate_asm_64.S +++ b/arch/x86/power/hibernate_asm_64.S @@ -137,7 +137,7 @@ ENTRY(restore_registers) /* Saved in save_processor_state. */ lgdt saved_context_gdt_desc(%rax) - xorq %rax, %rax + xorl %eax, %eax /* tell the hibernation core that we've just restored the memory */ movq %rax, in_suspend(%rip) diff --git a/arch/x86/um/mem_32.c b/arch/x86/um/mem_32.c index 744afdc18cf3..56c44d865f7b 100644 --- a/arch/x86/um/mem_32.c +++ b/arch/x86/um/mem_32.c @@ -16,7 +16,7 @@ static int __init gate_vma_init(void) if (!FIXADDR_USER_START) return 0; - gate_vma.vm_mm = NULL; + vma_init(&gate_vma, NULL); gate_vma.vm_start = FIXADDR_USER_START; gate_vma.vm_end = FIXADDR_USER_END; gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC; diff --git a/arch/x86/um/vdso/.gitignore b/arch/x86/um/vdso/.gitignore index 9cac6d072199..f8b69d84238e 100644 --- a/arch/x86/um/vdso/.gitignore +++ b/arch/x86/um/vdso/.gitignore @@ -1,2 +1 @@ -vdso-syms.lds vdso.lds diff --git a/arch/x86/um/vdso/Makefile b/arch/x86/um/vdso/Makefile index b2d6967262b2..822ccdba93ad 100644 --- a/arch/x86/um/vdso/Makefile +++ b/arch/x86/um/vdso/Makefile @@ -53,22 +53,6 @@ $(vobjs): KBUILD_CFLAGS += $(CFL) CFLAGS_REMOVE_vdso-note.o = -pg -fprofile-arcs -ftest-coverage CFLAGS_REMOVE_um_vdso.o = -pg -fprofile-arcs -ftest-coverage -targets += vdso-syms.lds -extra-$(VDSO64-y) += vdso-syms.lds - -# -# Match symbols in the DSO that look like VDSO*; produce a file of constants. -# -sed-vdsosym := -e 's/^00*/0/' \ - -e 's/^\([0-9a-fA-F]*\) . \(VDSO[a-zA-Z0-9_]*\)$$/\2 = 0x\1;/p' -quiet_cmd_vdsosym = VDSOSYM $@ -define cmd_vdsosym - $(NM) $< | LC_ALL=C sed -n $(sed-vdsosym) | LC_ALL=C sort > $@ -endef - -$(obj)/%-syms.lds: $(obj)/%.so.dbg FORCE - $(call if_changed,vdsosym) - # # The DSO images are built using a special linker script. # |