diff options
Diffstat (limited to 'arch/x86/boot/compressed')
| -rw-r--r-- | arch/x86/boot/compressed/Makefile | 19 | ||||
| -rw-r--r-- | arch/x86/boot/compressed/cmdline.c | 2 | ||||
| -rw-r--r-- | arch/x86/boot/compressed/eboot.c | 660 | ||||
| -rw-r--r-- | arch/x86/boot/compressed/eboot.h | 12 | ||||
| -rw-r--r-- | arch/x86/boot/compressed/head_64.S | 242 | ||||
| -rw-r--r-- | arch/x86/boot/compressed/kaslr.c | 136 | ||||
| -rw-r--r-- | arch/x86/boot/compressed/kaslr_64.c (renamed from arch/x86/boot/compressed/pagetable.c) | 19 | ||||
| -rw-r--r-- | arch/x86/boot/compressed/mem_encrypt.S | 36 | ||||
| -rw-r--r-- | arch/x86/boot/compressed/misc.c | 22 | ||||
| -rw-r--r-- | arch/x86/boot/compressed/misc.h | 6 | ||||
| -rw-r--r-- | arch/x86/boot/compressed/pgtable.h | 20 | ||||
| -rw-r--r-- | arch/x86/boot/compressed/pgtable_64.c | 185 |
12 files changed, 757 insertions, 602 deletions
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index f25e1530e064..466f66c8a7f8 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -26,7 +26,7 @@ KCOV_INSTRUMENT := n targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \ vmlinux.bin.xz vmlinux.bin.lzo vmlinux.bin.lz4 -KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ -O2 +KBUILD_CFLAGS := -m$(BITS) -O2 KBUILD_CFLAGS += -fno-strict-aliasing $(call cc-option, -fPIE, -fPIC) KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING cflags-$(CONFIG_X86_32) := -march=i386 @@ -37,21 +37,22 @@ KBUILD_CFLAGS += $(call cc-option,-ffreestanding) KBUILD_CFLAGS += $(call cc-option,-fno-stack-protector) KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member) KBUILD_CFLAGS += $(call cc-disable-warning, gnu) +KBUILD_CFLAGS += -Wno-pointer-sign KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ GCOV_PROFILE := n UBSAN_SANITIZE :=n -LDFLAGS := -m elf_$(UTS_MACHINE) +KBUILD_LDFLAGS := -m elf_$(UTS_MACHINE) # Compressed kernel should be built as PIE since it may be loaded at any # address by the bootloader. ifeq ($(CONFIG_X86_32),y) -LDFLAGS += $(call ld-option, -pie) $(call ld-option, --no-dynamic-linker) +KBUILD_LDFLAGS += $(call ld-option, -pie) $(call ld-option, --no-dynamic-linker) else # To build 64-bit compressed kernel as PIE, we disable relocation # overflow check to avoid relocation overflow error with a new linker # command-line option, -z noreloc-overflow. -LDFLAGS += $(shell $(LD) --help 2>&1 | grep -q "\-z noreloc-overflow" \ +KBUILD_LDFLAGS += $(shell $(LD) --help 2>&1 | grep -q "\-z noreloc-overflow" \ && echo "-z noreloc-overflow -pie --no-dynamic-linker") endif LDFLAGS_vmlinux := -T @@ -78,7 +79,7 @@ vmlinux-objs-y := $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o \ vmlinux-objs-$(CONFIG_EARLY_PRINTK) += $(obj)/early_serial_console.o vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/kaslr.o ifdef CONFIG_X86_64 - vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/pagetable.o + vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/kaslr_64.o vmlinux-objs-y += $(obj)/mem_encrypt.o vmlinux-objs-y += $(obj)/pgtable_64.o endif @@ -106,9 +107,13 @@ define cmd_check_data_rel done endef +# We need to run two commands under "if_changed", so merge them into a +# single invocation. +quiet_cmd_check-and-link-vmlinux = LD $@ + cmd_check-and-link-vmlinux = $(cmd_check_data_rel); $(cmd_ld) + $(obj)/vmlinux: $(vmlinux-objs-y) FORCE - $(call if_changed,check_data_rel) - $(call if_changed,ld) + $(call if_changed,check-and-link-vmlinux) OBJCOPYFLAGS_vmlinux.bin := -R .comment -S $(obj)/vmlinux.bin: vmlinux FORCE diff --git a/arch/x86/boot/compressed/cmdline.c b/arch/x86/boot/compressed/cmdline.c index 0cb325734cfb..af6cda0b7900 100644 --- a/arch/x86/boot/compressed/cmdline.c +++ b/arch/x86/boot/compressed/cmdline.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "misc.h" -#if CONFIG_EARLY_PRINTK || CONFIG_RANDOMIZE_BASE +#if CONFIG_EARLY_PRINTK || CONFIG_RANDOMIZE_BASE || CONFIG_X86_5LEVEL static unsigned long fs; static inline void set_fs(unsigned long seg) diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index 886a9115af62..8b4c5e001157 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -34,74 +34,13 @@ static void setup_boot_services##bits(struct efi_config *c) \ \ table = (typeof(table))sys_table; \ \ - c->runtime_services = table->runtime; \ - c->boot_services = table->boottime; \ - c->text_output = table->con_out; \ + c->runtime_services = table->runtime; \ + c->boot_services = table->boottime; \ + c->text_output = table->con_out; \ } BOOT_SERVICES(32); BOOT_SERVICES(64); -static inline efi_status_t __open_volume32(void *__image, void **__fh) -{ - efi_file_io_interface_t *io; - efi_loaded_image_32_t *image = __image; - efi_file_handle_32_t *fh; - efi_guid_t fs_proto = EFI_FILE_SYSTEM_GUID; - efi_status_t status; - void *handle = (void *)(unsigned long)image->device_handle; - unsigned long func; - - status = efi_call_early(handle_protocol, handle, - &fs_proto, (void **)&io); - if (status != EFI_SUCCESS) { - efi_printk(sys_table, "Failed to handle fs_proto\n"); - return status; - } - - func = (unsigned long)io->open_volume; - status = efi_early->call(func, io, &fh); - if (status != EFI_SUCCESS) - efi_printk(sys_table, "Failed to open volume\n"); - - *__fh = fh; - return status; -} - -static inline efi_status_t __open_volume64(void *__image, void **__fh) -{ - efi_file_io_interface_t *io; - efi_loaded_image_64_t *image = __image; - efi_file_handle_64_t *fh; - efi_guid_t fs_proto = EFI_FILE_SYSTEM_GUID; - efi_status_t status; - void *handle = (void *)(unsigned long)image->device_handle; - unsigned long func; - - status = efi_call_early(handle_protocol, handle, - &fs_proto, (void **)&io); - if (status != EFI_SUCCESS) { - efi_printk(sys_table, "Failed to handle fs_proto\n"); - return status; - } - - func = (unsigned long)io->open_volume; - status = efi_early->call(func, io, &fh); - if (status != EFI_SUCCESS) - efi_printk(sys_table, "Failed to open volume\n"); - - *__fh = fh; - return status; -} - -efi_status_t -efi_open_volume(efi_system_table_t *sys_table, void *__image, void **__fh) -{ - if (efi_early->is64) - return __open_volume64(__image, __fh); - - return __open_volume32(__image, __fh); -} - void efi_char16_printk(efi_system_table_t *table, efi_char16_t *str) { efi_call_proto(efi_simple_text_output_protocol, output_string, @@ -109,61 +48,69 @@ void efi_char16_printk(efi_system_table_t *table, efi_char16_t *str) } static efi_status_t -__setup_efi_pci32(efi_pci_io_protocol_32 *pci, struct pci_setup_rom **__rom) +preserve_pci_rom_image(efi_pci_io_protocol_t *pci, struct pci_setup_rom **__rom) { struct pci_setup_rom *rom = NULL; efi_status_t status; unsigned long size; - uint64_t attributes; + uint64_t romsize; + void *romimage; - status = efi_early->call(pci->attributes, pci, - EfiPciIoAttributeOperationGet, 0, 0, - &attributes); - if (status != EFI_SUCCESS) - return status; - - if (!pci->romimage || !pci->romsize) + /* + * Some firmware images contain EFI function pointers at the place where + * the romimage and romsize fields are supposed to be. Typically the EFI + * code is mapped at high addresses, translating to an unrealistically + * large romsize. The UEFI spec limits the size of option ROMs to 16 + * MiB so we reject any ROMs over 16 MiB in size to catch this. + */ + romimage = (void *)(unsigned long)efi_table_attr(efi_pci_io_protocol, + romimage, pci); + romsize = efi_table_attr(efi_pci_io_protocol, romsize, pci); + if (!romimage || !romsize || romsize > SZ_16M) return EFI_INVALID_PARAMETER; - size = pci->romsize + sizeof(*rom); + size = romsize + sizeof(*rom); status = efi_call_early(allocate_pool, EFI_LOADER_DATA, size, &rom); if (status != EFI_SUCCESS) { - efi_printk(sys_table, "Failed to alloc mem for rom\n"); + efi_printk(sys_table, "Failed to allocate memory for 'rom'\n"); return status; } memset(rom, 0, sizeof(*rom)); - rom->data.type = SETUP_PCI; - rom->data.len = size - sizeof(struct setup_data); - rom->data.next = 0; - rom->pcilen = pci->romsize; + rom->data.type = SETUP_PCI; + rom->data.len = size - sizeof(struct setup_data); + rom->data.next = 0; + rom->pcilen = pci->romsize; *__rom = rom; - status = efi_early->call(pci->pci.read, pci, EfiPciIoWidthUint16, - PCI_VENDOR_ID, 1, &(rom->vendor)); + status = efi_call_proto(efi_pci_io_protocol, pci.read, pci, + EfiPciIoWidthUint16, PCI_VENDOR_ID, 1, + &rom->vendor); if (status != EFI_SUCCESS) { efi_printk(sys_table, "Failed to read rom->vendor\n"); goto free_struct; } - status = efi_early->call(pci->pci.read, pci, EfiPciIoWidthUint16, - PCI_DEVICE_ID, 1, &(rom->devid)); + status = efi_call_proto(efi_pci_io_protocol, pci.read, pci, + EfiPciIoWidthUint16, PCI_DEVICE_ID, 1, + &rom->devid); if (status != EFI_SUCCESS) { efi_printk(sys_table, "Failed to read rom->devid\n"); goto free_struct; } - status = efi_early->call(pci->get_location, pci, &(rom->segment), - &(rom->bus), &(rom->device), &(rom->function)); + status = efi_call_proto(efi_pci_io_protocol, get_location, pci, + &rom->segment, &rom->bus, &rom->device, + &rom->function); if (status != EFI_SUCCESS) goto free_struct; - memcpy(rom->romdata, pci->romimage, pci->romsize); + memcpy(rom->romdata, romimage, romsize); return status; free_struct: @@ -171,158 +118,6 @@ free_struct: return status; } -static void -setup_efi_pci32(struct boot_params *params, void **pci_handle, - unsigned long size) -{ - efi_pci_io_protocol_32 *pci = NULL; - efi_guid_t pci_proto = EFI_PCI_IO_PROTOCOL_GUID; - u32 *handles = (u32 *)(unsigned long)pci_handle; - efi_status_t status; - unsigned long nr_pci; - struct setup_data *data; - int i; - - data = (struct setup_data *)(unsigned long)params->hdr.setup_data; - - while (data && data->next) - data = (struct setup_data *)(unsigned long)data->next; - - nr_pci = size / sizeof(u32); - for (i = 0; i < nr_pci; i++) { - struct pci_setup_rom *rom = NULL; - u32 h = handles[i]; - - status = efi_call_early(handle_protocol, h, - &pci_proto, (void **)&pci); - - if (status != EFI_SUCCESS) - continue; - - if (!pci) - continue; - - status = __setup_efi_pci32(pci, &rom); - if (status != EFI_SUCCESS) - continue; - - if (data) - data->next = (unsigned long)rom; - else - params->hdr.setup_data = (unsigned long)rom; - - data = (struct setup_data *)rom; - - } -} - -static efi_status_t -__setup_efi_pci64(efi_pci_io_protocol_64 *pci, struct pci_setup_rom **__rom) -{ - struct pci_setup_rom *rom; - efi_status_t status; - unsigned long size; - uint64_t attributes; - - status = efi_early->call(pci->attributes, pci, - EfiPciIoAttributeOperationGet, 0, - &attributes); - if (status != EFI_SUCCESS) - return status; - - if (!pci->romimage || !pci->romsize) - return EFI_INVALID_PARAMETER; - - size = pci->romsize + sizeof(*rom); - - status = efi_call_early(allocate_pool, EFI_LOADER_DATA, size, &rom); - if (status != EFI_SUCCESS) { - efi_printk(sys_table, "Failed to alloc mem for rom\n"); - return status; - } - - rom->data.type = SETUP_PCI; - rom->data.len = size - sizeof(struct setup_data); - rom->data.next = 0; - rom->pcilen = pci->romsize; - *__rom = rom; - - status = efi_early->call(pci->pci.read, pci, EfiPciIoWidthUint16, - PCI_VENDOR_ID, 1, &(rom->vendor)); - - if (status != EFI_SUCCESS) { - efi_printk(sys_table, "Failed to read rom->vendor\n"); - goto free_struct; - } - - status = efi_early->call(pci->pci.read, pci, EfiPciIoWidthUint16, - PCI_DEVICE_ID, 1, &(rom->devid)); - - if (status != EFI_SUCCESS) { - efi_printk(sys_table, "Failed to read rom->devid\n"); - goto free_struct; - } - - status = efi_early->call(pci->get_location, pci, &(rom->segment), - &(rom->bus), &(rom->device), &(rom->function)); - - if (status != EFI_SUCCESS) - goto free_struct; - - memcpy(rom->romdata, pci->romimage, pci->romsize); - return status; - -free_struct: - efi_call_early(free_pool, rom); - return status; - -} - -static void -setup_efi_pci64(struct boot_params *params, void **pci_handle, - unsigned long size) -{ - efi_pci_io_protocol_64 *pci = NULL; - efi_guid_t pci_proto = EFI_PCI_IO_PROTOCOL_GUID; - u64 *handles = (u64 *)(unsigned long)pci_handle; - efi_status_t status; - unsigned long nr_pci; - struct setup_data *data; - int i; - - data = (struct setup_data *)(unsigned long)params->hdr.setup_data; - - while (data && data->next) - data = (struct setup_data *)(unsigned long)data->next; - - nr_pci = size / sizeof(u64); - for (i = 0; i < nr_pci; i++) { - struct pci_setup_rom *rom = NULL; - u64 h = handles[i]; - - status = efi_call_early(handle_protocol, h, - &pci_proto, (void **)&pci); - - if (status != EFI_SUCCESS) - continue; - - if (!pci) - continue; - - status = __setup_efi_pci64(pci, &rom); - if (status != EFI_SUCCESS) - continue; - - if (data) - data->next = (unsigned long)rom; - else - params->hdr.setup_data = (unsigned long)rom; - - data = (struct setup_data *)rom; - - } -} - /* * There's no way to return an informative status from this function, * because any analysis (and printing of error messages) needs to be @@ -338,6 +133,9 @@ static void setup_efi_pci(struct boot_params *params) void **pci_handle = NULL; efi_guid_t pci_proto = EFI_PCI_IO_PROTOCOL_GUID; unsigned long size = 0; + unsigned long nr_pci; + struct setup_data *data; + int i; status = efi_call_early(locate_handle, EFI_LOCATE_BY_PROTOCOL, @@ -349,7 +147,7 @@ static void setup_efi_pci(struct boot_params *params) size, (void **)&pci_handle); if (status != EFI_SUCCESS) { - efi_printk(sys_table, "Failed to alloc mem for pci_handle\n"); + efi_printk(sys_table, "Failed to allocate memory for 'pci_handle'\n"); return; } @@ -361,10 +159,34 @@ static void setup_efi_pci(struct boot_params *params) if (status != EFI_SUCCESS) goto free_handle; - if (efi_early->is64) - setup_efi_pci64(params, pci_handle, size); - else - setup_efi_pci32(params, pci_handle, size); + data = (struct setup_data *)(unsigned long)params->hdr.setup_data; + + while (data && data->next) + data = (struct setup_data *)(unsigned long)data->next; + + nr_pci = size / (efi_is_64bit() ? sizeof(u64) : sizeof(u32)); + for (i = 0; i < nr_pci; i++) { + efi_pci_io_protocol_t *pci = NULL; + struct pci_setup_rom *rom; + + status = efi_call_early(handle_protocol, + efi_is_64bit() ? ((u64 *)pci_handle)[i] + : ((u32 *)pci_handle)[i], + &pci_proto, (void **)&pci); + if (status != EFI_SUCCESS || !pci) + continue; + + status = preserve_pci_rom_image(pci, &rom); + if (status != EFI_SUCCESS) + continue; + + if (data) + data->next = (unsigned long)rom; + else + params->hdr.setup_data = (unsigned long)rom; + + data = (struct setup_data *)rom; + } free_handle: efi_call_early(free_pool, pci_handle); @@ -395,8 +217,7 @@ static void retrieve_apple_device_properties(struct boot_params *boot_params) status = efi_call_early(allocate_pool, EFI_LOADER_DATA, size + sizeof(struct setup_data), &new); if (status != EFI_SUCCESS) { - efi_printk(sys_table, - "Failed to alloc mem for properties\n"); + efi_printk(sys_table, "Failed to allocate memory for 'properties'\n"); return; } @@ -412,18 +233,19 @@ static void retrieve_apple_device_properties(struct boot_params *boot_params) new->next = 0; data = (struct setup_data *)(unsigned long)boot_params->hdr.setup_data; - if (!data) + if (!data) { boot_params->hdr.setup_data = (unsigned long)new; - else { + } else { while (data->next) data = (struct setup_data *)(unsigned long)data->next; data->next = (unsigned long)new; } } +static const efi_char16_t apple[] = L"Apple"; + static void setup_quirks(struct boot_params *boot_params) { - efi_char16_t const apple[] = { 'A', 'p', 'p', 'l', 'e', 0 }; efi_char16_t *fw_vendor = (efi_char16_t *)(unsigned long) efi_table_attr(efi_system_table, fw_vendor, sys_table); @@ -433,81 +255,55 @@ static void setup_quirks(struct boot_params *boot_params) } } +/* + * See if we have Universal Graphics Adapter (UGA) protocol + */ static efi_status_t -setup_uga32(void **uga_handle, unsigned long size, u32 *width, u32 *height) +setup_uga(struct screen_info *si, efi_guid_t *uga_proto, unsigned long size) { - struct efi_uga_draw_protocol *uga = NULL, *first_uga; - efi_guid_t uga_proto = EFI_UGA_PROTOCOL_GUID; + efi_status_t status; + u32 width, height; + void **uga_handle = NULL; + efi_uga_draw_protocol_t *uga = NULL, *first_uga; unsigned long nr_ugas; - u32 *handles = (u32 *)uga_handle; - efi_status_t status = EFI_INVALID_PARAMETER; int i; - first_uga = NULL; - nr_ugas = size / sizeof(u32); - for (i = 0; i < nr_ugas; i++) { - efi_guid_t pciio_proto = EFI_PCI_IO_PROTOCOL_GUID; - u32 w, h, depth, refresh; - void *pciio; - u32 handle = handles[i]; - - status = efi_call_early(handle_protocol, handle, - &uga_proto, (void **)&uga); - if (status != EFI_SUCCESS) - continue; - - efi_call_early(handle_protocol, handle, &pciio_proto, &pciio); - - status = efi_early->call((unsigned long)uga->get_mode, uga, - &w, &h, &depth, &refresh); - if (status == EFI_SUCCESS && (!first_uga || pciio)) { - *width = w; - *height = h; - - /* - * Once we've found a UGA supporting PCIIO, - * don't bother looking any further. - */ - if (pciio) - break; - - first_uga = uga; - } - } + status = efi_call_early(allocate_pool, EFI_LOADER_DATA, + size, (void **)&uga_handle); + if (status != EFI_SUCCESS) + return status; - return status; -} + status = efi_call_early(locate_handle, + EFI_LOCATE_BY_PROTOCOL, + uga_proto, NULL, &size, uga_handle); + if (status != EFI_SUCCESS) + goto free_handle; -static efi_status_t -setup_uga64(void **uga_handle, unsigned long size, u32 *width, u32 *height) -{ - struct efi_uga_draw_protocol *uga = NULL, *first_uga; - efi_guid_t uga_proto = EFI_UGA_PROTOCOL_GUID; - unsigned long nr_ugas; - u64 *handles = (u64 *)uga_handle; - efi_status_t status = EFI_INVALID_PARAMETER; - int i; + height = 0; + width = 0; first_uga = NULL; - nr_ugas = size / sizeof(u64); + nr_ugas = size / (efi_is_64bit() ? sizeof(u64) : sizeof(u32)); for (i = 0; i < nr_ugas; i++) { efi_guid_t pciio_proto = EFI_PCI_IO_PROTOCOL_GUID; u32 w, h, depth, refresh; void *pciio; - u64 handle = handles[i]; + unsigned long handle = efi_is_64bit() ? ((u64 *)uga_handle)[i] + : ((u32 *)uga_handle)[i]; status = efi_call_early(handle_protocol, handle, - &uga_proto, (void **)&uga); + uga_proto, (void **)&uga); if (status != EFI_SUCCESS) continue; + pciio = NULL; efi_call_early(handle_protocol, handle, &pciio_proto, &pciio); - status = efi_early->call((unsigned long)uga->get_mode, uga, - &w, &h, &depth, &refresh); + status = efi_call_proto(efi_uga_draw_protocol, get_mode, uga, + &w, &h, &depth, &refresh); if (status == EFI_SUCCESS && (!first_uga || pciio)) { - *width = w; - *height = h; + width = w; + height = h; /* * Once we've found a UGA supporting PCIIO, @@ -520,59 +316,28 @@ setup_uga64(void **uga_handle, unsigned long size, u32 *width, u32 *height) } } - return status; -} - -/* - * See if we have Universal Graphics Adapter (UGA) protocol - */ -static efi_status_t setup_uga(struct screen_info *si, efi_guid_t *uga_proto, - unsigned long size) -{ - efi_status_t status; - u32 width, height; - void **uga_handle = NULL; - - status = efi_call_early(allocate_pool, EFI_LOADER_DATA, - size, (void **)&uga_handle); - if (status != EFI_SUCCESS) - return status; - - status = efi_call_early(locate_handle, - EFI_LOCATE_BY_PROTOCOL, - uga_proto, NULL, &size, uga_handle); - if (status != EFI_SUCCESS) - goto free_handle; - - height = 0; - width = 0; - - if (efi_early->is64) - status = setup_uga64(uga_handle, size, &width, &height); - else - status = setup_uga32(uga_handle, size, &width, &height); - if (!width && !height) goto free_handle; /* EFI framebuffer */ - si->orig_video_isVGA = VIDEO_TYPE_EFI; + si->orig_video_isVGA = VIDEO_TYPE_EFI; - si->lfb_depth = 32; - si->lfb_width = width; - si->lfb_height = height; + si->lfb_depth = 32; + si->lfb_width = width; + si->lfb_height = height; - si->red_size = 8; - si->red_pos = 16; - si->green_size = 8; - si->green_pos = 8; - si->blue_size = 8; - si->blue_pos = 0; - si->rsvd_size = 8; - si->rsvd_pos = 24; + si->red_size = 8; + si->red_pos = 16; + si->green_size = 8; + si->green_pos = 8; + si->blue_size = 8; + si->blue_pos = 0; + si->rsvd_size = 8; + si->rsvd_pos = 24; free_handle: efi_call_early(free_pool, uga_handle); + return status; } @@ -639,7 +404,7 @@ struct boot_params *make_boot_params(struct efi_config *c) if (sys_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) return NULL; - if (efi_early->is64) + if (efi_is_64bit()) setup_boot_services64(efi_early); else setup_boot_services32(efi_early); @@ -654,7 +419,7 @@ struct boot_params *make_boot_params(struct efi_config *c) status = efi_low_alloc(sys_table, 0x4000, 1, (unsigned long *)&boot_params); if (status != EFI_SUCCESS) { - efi_printk(sys_table, "Failed to alloc lowmem for boot params\n"); + efi_printk(sys_table, "Failed to allocate lowmem for boot params\n"); return NULL; } @@ -670,9 +435,9 @@ struct boot_params *make_boot_params(struct efi_config *c) * Fill out some of the header fields ourselves because the * EFI firmware loader doesn't load the first sector. */ - hdr->root_flags = 1; - hdr->vid_mode = 0xffff; - hdr->boot_flag = 0xAA55; + hdr->root_flags = 1; + hdr->vid_mode = 0xffff; + hdr->boot_flag = 0xAA55; hdr->type_of_loader = 0x21; @@ -680,6 +445,7 @@ struct boot_params *make_boot_params(struct efi_config *c) cmdline_ptr = efi_convert_cmdline(sys_table, image, &options_size); if (!cmdline_ptr) goto fail; + hdr->cmd_line_ptr = (unsigned long)cmdline_ptr; /* Fill in upper bits of command line address, NOP on 32 bit */ boot_params->ext_cmd_line_ptr = (u64)(unsigned long)cmdline_ptr >> 32; @@ -716,10 +482,12 @@ struct boot_params *make_boot_params(struct efi_config *c) boot_params->ext_ramdisk_size = (u64)ramdisk_size >> 32; return boot_params; + fail2: efi_free(sys_table, options_size, hdr->cmd_line_ptr); fail: efi_free(sys_table, 0x4000, (unsigned long)boot_params); + return NULL; } @@ -731,7 +499,7 @@ static void add_e820ext(struct boot_params *params, unsigned long size; e820ext->type = SETUP_E820_EXT; - e820ext->len = nr_entries * sizeof(struct boot_e820_entry); + e820ext->len = nr_entries * sizeof(struct boot_e820_entry); e820ext->next = 0; data = (struct setup_data *)(unsigned long)params->hdr.setup_data; @@ -745,8 +513,8 @@ static void add_e820ext(struct boot_params *params, params->hdr.setup_data = (unsigned long)e820ext; } -static efi_status_t setup_e820(struct boot_params *params, - struct setup_data *e820ext, u32 e820ext_size) +static efi_status_t +setup_e820(struct boot_params *params, struct setup_data *e820ext, u32 e820ext_size) { struct boot_e820_entry *entry = params->e820_table; struct efi_info *efi = ¶ms->efi_info; @@ -867,11 +635,10 @@ static efi_status_t alloc_e820ext(u32 nr_desc, struct setup_data **e820ext, } struct exit_boot_struct { - struct boot_params *boot_params; - struct efi_info *efi; - struct setup_data *e820ext; - __u32 e820ext_size; - bool is64; + struct boot_params *boot_params; + struct efi_info *efi; + struct setup_data *e820ext; + __u32 e820ext_size; }; static efi_status_t exit_boot_func(efi_system_table_t *sys_table_arg, @@ -898,25 +665,25 @@ static efi_status_t exit_boot_func(efi_system_table_t *sys_table_arg, first = false; } - signature = p->is64 ? EFI64_LOADER_SIGNATURE : EFI32_LOADER_SIGNATURE; + signature = efi_is_64bit() ? EFI64_LOADER_SIGNATURE + : EFI32_LOADER_SIGNATURE; memcpy(&p->efi->efi_loader_signature, signature, sizeof(__u32)); - p->efi->efi_systab = (unsigned long)sys_table_arg; - p->efi->efi_memdesc_size = *map->desc_size; - p->efi->efi_memdesc_version = *map->desc_ver; - p->efi->efi_memmap = (unsigned long)*map->map; - p->efi->efi_memmap_size = *map->map_size; + p->efi->efi_systab = (unsigned long)sys_table_arg; + p->efi->efi_memdesc_size = *map->desc_size; + p->efi->efi_memdesc_version = *map->desc_ver; + p->efi->efi_memmap = (unsigned long)*map->map; + p->efi->efi_memmap_size = *map->map_size; #ifdef CONFIG_X86_64 - p->efi->efi_systab_hi = (unsigned long)sys_table_arg >> 32; - p->efi->efi_memmap_hi = (unsigned long)*map->map >> 32; + p->efi->efi_systab_hi = (unsigned long)sys_table_arg >> 32; + p->efi->efi_memmap_hi = (unsigned long)*map->map >> 32; #endif return EFI_SUCCESS; } -static efi_status_t exit_boot(struct boot_params *boot_params, - void *handle, bool is64) +static efi_status_t exit_boot(struct boot_params *boot_params, void *handle) { unsigned long map_sz, key, desc_size, buff_size; efi_memory_desc_t *mem_map; @@ -927,17 +694,16 @@ static efi_status_t exit_boot(struct boot_params *boot_params, struct efi_boot_memmap map; struct exit_boot_struct priv; - map.map = &mem_map; - map.map_size = &map_sz; - map.desc_size = &desc_size; - map.desc_ver = &desc_version; - map.key_ptr = &key; - map.buff_size = &buff_size; - priv.boot_params = boot_params; - priv.efi = &boot_params->efi_info; - priv.e820ext = NULL; - priv.e820ext_size = 0; - priv.is64 = is64; + map.map = &mem_map; + map.map_size = &map_sz; + map.desc_size = &desc_size; + map.desc_ver = &desc_version; + map.key_ptr = &key; + map.buff_size = &buff_size; + priv.boot_params = boot_params; + priv.efi = &boot_params->efi_info; + priv.e820ext = NULL; + priv.e820ext_size = 0; /* Might as well exit boot services now */ status = efi_exit_boot_services(sys_table, handle, &map, &priv, @@ -945,10 +711,11 @@ static efi_status_t exit_boot(struct boot_params *boot_params, if (status != EFI_SUCCESS) return status; - e820ext = priv.e820ext; - e820ext_size = priv.e820ext_size; + e820ext = priv.e820ext; + e820ext_size = priv.e820ext_size; + /* Historic? */ - boot_params->alt_mem_k = 32 * 1024; + boot_params->alt_mem_k = 32 * 1024; status = setup_e820(boot_params, e820ext, e820ext_size); if (status != EFI_SUCCESS) @@ -961,8 +728,8 @@ static efi_status_t exit_boot(struct boot_params *boot_params, * On success we return a pointer to a boot_params structure, and NULL * on failure. */ -struct boot_params *efi_main(struct efi_config *c, - struct boot_params *boot_params) +struct boot_params * +efi_main(struct efi_config *c, struct boot_params *boot_params) { struct desc_ptr *gdt = NULL; efi_loaded_image_t *image; @@ -971,13 +738,12 @@ struct boot_params *efi_main(struct efi_config *c, struct desc_struct *desc; void *handle; efi_system_table_t *_table; - bool is64; + unsigned long cmdline_paddr; efi_early = c; _table = (efi_system_table_t *)(unsigned long)efi_early->table; handle = (void *)(unsigned long)efi_early->image_handle; - is64 = efi_early->is64; sys_table = _table; @@ -985,12 +751,21 @@ struct boot_params *efi_main(struct efi_config *c, if (sys_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) goto fail; - if (is64) + if (efi_is_64bit()) setup_boot_services64(efi_early); else setup_boot_services32(efi_early); /* + * make_boot_params() may have been called before efi_main(), in which + * case this is the second time we parse the cmdline. This is ok, + * parsing the cmdline multiple times does not have side-effects. + */ + cmdline_paddr = ((u64)hdr->cmd_line_ptr | + ((u64)boot_params->ext_cmd_line_ptr << 32)); + efi_parse_options((char *)cmdline_paddr); + + /* * If the boot loader gave us a value for secure_boot then we use that, * otherwise we ask the BIOS. */ @@ -1010,7 +785,7 @@ struct boot_params *efi_main(struct efi_config *c, status = efi_call_early(allocate_pool, EFI_LOADER_DATA, sizeof(*gdt), (void **)&gdt); if (status != EFI_SUCCESS) { - efi_printk(sys_table, "Failed to alloc mem for gdt structure\n"); + efi_printk(sys_table, "Failed to allocate memory for 'gdt' structure\n"); goto fail; } @@ -1018,7 +793,7 @@ struct boot_params *efi_main(struct efi_config *c, status = efi_low_alloc(sys_table, gdt->size, 8, (unsigned long *)&gdt->address); if (status != EFI_SUCCESS) { - efi_printk(sys_table, "Failed to alloc mem for gdt\n"); + efi_printk(sys_table, "Failed to allocate memory for 'gdt'\n"); goto fail; } @@ -1041,7 +816,7 @@ struct boot_params *efi_main(struct efi_config *c, hdr->code32_start = bzimage_addr; } - status = exit_boot(boot_params, handle, is64); + status = exit_boot(boot_params, handle); if (status != EFI_SUCCESS) { efi_printk(sys_table, "exit_boot() failed!\n"); goto fail; @@ -1055,19 +830,20 @@ struct boot_params *efi_main(struct efi_config *c, if (IS_ENABLED(CONFIG_X86_64)) { /* __KERNEL32_CS */ - desc->limit0 = 0xffff; - desc->base0 = 0x0000; - desc->base1 = 0x0000; - desc->type = SEG_TYPE_CODE | SEG_TYPE_EXEC_READ; - desc->s = DESC_TYPE_CODE_DATA; - desc->dpl = 0; - desc->p = 1; - desc->limit1 = 0xf; - desc->avl = 0; - desc->l = 0; - desc->d = SEG_OP_SIZE_32BIT; - desc->g = SEG_GRANULARITY_4KB; - desc->base2 = 0x00; + desc->limit0 = 0xffff; + desc->base0 = 0x0000; + desc->base1 = 0x0000; + desc->type = SEG_TYPE_CODE | SEG_TYPE_EXEC_READ; + desc->s = DESC_TYPE_CODE_DATA; + desc->dpl = 0; + desc->p = 1; + desc->limit1 = 0xf; + desc->avl = 0; + desc->l = 0; + desc->d = SEG_OP_SIZE_32BIT; + desc->g = SEG_GRANULARITY_4KB; + desc->base2 = 0x00; + desc++; } else { /* Second entry is unused on 32-bit */ @@ -1075,15 +851,16 @@ struct boot_params *efi_main(struct efi_config *c, } /* __KERNEL_CS */ - desc->limit0 = 0xffff; - desc->base0 = 0x0000; - desc->base1 = 0x0000; - desc->type = SEG_TYPE_CODE | SEG_TYPE_EXEC_READ; - desc->s = DESC_TYPE_CODE_DATA; - desc->dpl = 0; - desc->p = 1; - desc->limit1 = 0xf; - desc->avl = 0; + desc->limit0 = 0xffff; + desc->base0 = 0x0000; + desc->base1 = 0x0000; + desc->type = SEG_TYPE_CODE | SEG_TYPE_EXEC_READ; + desc->s = DESC_TYPE_CODE_DATA; + desc->dpl = 0; + desc->p = 1; + desc->limit1 = 0xf; + desc->avl = 0; + if (IS_ENABLED(CONFIG_X86_64)) { desc->l = 1; desc->d = 0; @@ -1091,41 +868,41 @@ struct boot_params *efi_main(struct efi_config *c, desc->l = 0; desc->d = SEG_OP_SIZE_32BIT; } - desc->g = SEG_GRANULARITY_4KB; - desc->base2 = 0x00; + desc->g = SEG_GRANULARITY_4KB; + desc->base2 = 0x00; desc++; /* __KERNEL_DS */ - desc->limit0 = 0xffff; - desc->base0 = 0x0000; - desc->base1 = 0x0000; - desc->type = SEG_TYPE_DATA | SEG_TYPE_READ_WRITE; - desc->s = DESC_TYPE_CODE_DATA; - desc->dpl = 0; - desc->p = 1; - desc->limit1 = 0xf; - desc->avl = 0; - desc->l = 0; - desc->d = SEG_OP_SIZE_32BIT; - desc->g = SEG_GRANULARITY_4KB; - desc->base2 = 0x00; + desc->limit0 = 0xffff; + desc->base0 = 0x0000; + desc->base1 = 0x0000; + desc->type = SEG_TYPE_DATA | SEG_TYPE_READ_WRITE; + desc->s = DESC_TYPE_CODE_DATA; + desc->dpl = 0; + desc->p = 1; + desc->limit1 = 0xf; + desc->avl = 0; + desc->l = 0; + desc->d = SEG_OP_SIZE_32BIT; + desc->g = SEG_GRANULARITY_4KB; + desc->base2 = 0x00; desc++; if (IS_ENABLED(CONFIG_X86_64)) { /* Task segment value */ - desc->limit0 = 0x0000; - desc->base0 = 0x0000; - desc->base1 = 0x0000; - desc->type = SEG_TYPE_TSS; - desc->s = 0; - desc->dpl = 0; - desc->p = 1; - desc->limit1 = 0x0; - desc->avl = 0; - desc->l = 0; - desc->d = 0; - desc->g = SEG_GRANULARITY_4KB; - desc->base2 = 0x00; + desc->limit0 = 0x0000; + desc->base0 = 0x0000; + desc->base1 = 0x0000; + desc->type = SEG_TYPE_TSS; + desc->s = 0; + desc->dpl = 0; + desc->p = 1; + desc->limit1 = 0x0; + desc->avl = 0; + desc->l = 0; + desc->d = 0; + desc->g = SEG_GRANULARITY_4KB; + desc->base2 = 0x00; desc++; } @@ -1135,5 +912,6 @@ struct boot_params *efi_main(struct efi_config *c, return boot_params; fail: efi_printk(sys_table, "efi_main() failed!\n"); + return NULL; } diff --git a/arch/x86/boot/compressed/eboot.h b/arch/x86/boot/compressed/eboot.h index e799dc5c6448..8297387c4676 100644 --- a/arch/x86/boot/compressed/eboot.h +++ b/arch/x86/boot/compressed/eboot.h @@ -12,22 +12,22 @@ #define DESC_TYPE_CODE_DATA (1 << 0) -struct efi_uga_draw_protocol_32 { +typedef struct { u32 get_mode; u32 set_mode; u32 blt; -}; +} efi_uga_draw_protocol_32_t; -struct efi_uga_draw_protocol_64 { +typedef struct { u64 get_mode; u64 set_mode; u64 blt; -}; +} efi_uga_draw_protocol_64_t; -struct efi_uga_draw_protocol { +typedef struct { void *get_mode; void *set_mode; void *blt; -}; +} efi_uga_draw_protocol_t; #endif /* BOOT_COMPRESSED_EBOOT_H */ diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index fc313e29fe2c..64037895b085 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -33,6 +33,7 @@ #include <asm/processor-flags.h> #include <asm/asm-offsets.h> #include <asm/bootparam.h> +#include "pgtable.h" /* * Locally defined symbols should be marked hidden: @@ -304,60 +305,121 @@ ENTRY(startup_64) /* Set up the stack */ leaq boot_stack_end(%rbx), %rsp -#ifdef CONFIG_X86_5LEVEL /* - * Check if we need to enable 5-level paging. - * RSI holds real mode data and need to be preserved across - * a function call. + * paging_prepare() and cleanup_trampoline() below can have GOT + * references. Adjust the table with address we are running at. + * + * Zero RAX for adjust_got: the GOT was not adjusted before; + * there's no adjustment to undo. */ - pushq %rsi - call l5_paging_required - popq %rsi + xorq %rax, %rax - /* If l5_paging_required() returned zero, we're done here. */ - cmpq $0, %rax - je lvl5 + /* + * Calculate the address the binary is loaded at and use it as + * a GOT adjustment. + */ + call 1f +1: popq %rdi + subq $1b, %rdi + + call adjust_got /* * At this point we are in long mode with 4-level paging enabled, - * but we want to enable 5-level paging. + * but we might want to enable 5-level paging or vice versa. + * + * The problem is that we cannot do it directly. Setting or clearing + * CR4.LA57 in long mode would trigger #GP. So we need to switch off + * long mode and paging first. * - * The problem is that we cannot do it directly. Setting LA57 in - * long mode would trigger #GP. So we need to switch off long mode - * first. + * We also need a trampoline in lower memory to switch over from + * 4- to 5-level paging for cases when the bootloader puts the kernel + * above 4G, but didn't enable 5-level paging for us. * - * NOTE: This is not going to work if bootloader put us above 4G - * limit. + * The same trampoline can be used to switch from 5- to 4-level paging + * mode, like when starting 4-level paging kernel via kexec() when + * original kernel worked in 5-level paging mode. * - * The first step is go into compatibility mode. + * For the trampoline, we need the top page table to reside in lower + * memory as we don't have a way to load 64-bit values into CR3 in + * 32-bit mode. + * + * We go though the trampoline even if we don't have to: if we're + * already in a desired paging mode. This way the trampoline code gets + * tested on every boot. */ - /* Clear additional page table */ - leaq lvl5_pgtable(%rbx), %rdi - xorq %rax, %rax - movq $(PAGE_SIZE/8), %rcx - rep stosq + /* Make sure we have GDT with 32-bit code segment */ + leaq gdt(%rip), %rax + movq %rax, gdt64+2(%rip) + lgdt gdt64(%rip) /* - * Setup current CR3 as the first and only entry in a new top level - * page table. + * paging_prepare() sets up the trampoline and checks if we need to + * enable 5-level paging. + * + * Address of the trampoline is returned in RAX. + * Non zero RDX on return means we need to enable 5-level paging. + * + * RSI holds real mode data and needs to be preserved across + * this function call. */ - movq %cr3, %rdi - leaq 0x7 (%rdi), %rax - movq %rax, lvl5_pgtable(%rbx) + pushq %rsi + movq %rsi, %rdi /* real mode address */ + call paging_prepare + popq %rsi + + /* Save the trampoline address in RCX */ + movq %rax, %rcx + + /* + * Load the address of trampoline_return() into RDI. + * It will be used by the trampoline to return to the main code. + */ + leaq trampoline_return(%rip), %rdi /* Switch to compatibility mode (CS.L = 0 CS.D = 1) via far return */ pushq $__KERNEL32_CS - leaq compatible_mode(%rip), %rax + leaq TRAMPOLINE_32BIT_CODE_OFFSET(%rax), %rax pushq %rax lretq -lvl5: -#endif +trampoline_return: + /* Restore the stack, the 32-bit trampoline uses its own stack */ + leaq boot_stack_end(%rbx), %rsp + + /* + * cleanup_trampoline() would restore trampoline memory. + * + * RDI is address of the page table to use instead of page table + * in trampoline memory (if required). + * + * RSI holds real mode data and needs to be preserved across + * this function call. + */ + pushq %rsi + leaq top_pgtable(%rbx), %rdi + call cleanup_trampoline + popq %rsi /* Zero EFLAGS */ pushq $0 popfq + /* + * Previously we've adjusted the GOT with address the binary was + * loaded at. Now we need to re-adjust for relocation address. + * + * Calculate the address the binary is loaded at, so that we can + * undo the previous GOT adjustment. + */ + call 1f +1: popq %rax + subq $1b, %rax + + /* The new adjustment is the relocation address */ + movq %rbx, %rdi + call adjust_got + /* * Copy the compressed kernel to the end of our buffer * where decompression in place becomes safe. @@ -459,19 +521,6 @@ relocated: rep stosq /* - * Adjust our own GOT - */ - leaq _got(%rip), %rdx - leaq _egot(%rip), %rcx -1: - cmpq %rcx, %rdx - jae 2f - addq %rbx, (%rdx) - addq $8, %rdx - jmp 1b -2: - -/* * Do the extraction, and jump to the new kernel.. */ pushq %rsi /* Save the real mode argument */ @@ -489,47 +538,104 @@ relocated: */ jmp *%rax +/* + * Adjust the global offset table + * + * RAX is the previous adjustment of the table to undo (use 0 if it's the + * first time we touch GOT). + * RDI is the new adjustment to apply. + */ +adjust_got: + /* Walk through the GOT adding the address to the entries */ + leaq _got(%rip), %rdx + leaq _egot(%rip), %rcx +1: + cmpq %rcx, %rdx + jae 2f + subq %rax, (%rdx) /* Undo previous adjustment */ + addq %rdi, (%rdx) /* Apply the new adjustment */ + addq $8, %rdx + jmp 1b +2: + ret + .code32 -#ifdef CONFIG_X86_5LEVEL -compatible_mode: - /* Setup data and stack segments */ +/* + * This is the 32-bit trampoline that will be copied over to low memory. + * + * RDI contains the return address (might be above 4G). + * ECX contains the base address of the trampoline memory. + * Non zero RDX on return means we need to enable 5-level paging. + */ +ENTRY(trampoline_32bit_src) + /* Set up data and stack segments */ movl $__KERNEL_DS, %eax movl %eax, %ds movl %eax, %ss + /* Set up new stack */ + leal TRAMPOLINE_32BIT_STACK_END(%ecx), %esp + /* Disable paging */ movl %cr0, %eax btrl $X86_CR0_PG_BIT, %eax movl %eax, %cr0 - /* Point CR3 to 5-level paging */ - leal lvl5_pgtable(%ebx), %eax - movl %eax, %cr3 + /* Check what paging mode we want to be in after the trampoline */ + cmpl $0, %edx + jz 1f - /* Enable PAE and LA57 mode */ + /* We want 5-level paging: don't touch CR3 if it already points to 5-level page tables */ movl %cr4, %eax - orl $(X86_CR4_PAE | X86_CR4_LA57), %eax + testl $X86_CR4_LA57, %eax + jnz 3f + jmp 2f +1: + /* We want 4-level paging: don't touch CR3 if it already points to 4-level page tables */ + movl %cr4, %eax + testl $X86_CR4_LA57, %eax + jz 3f +2: + /* Point CR3 to the trampoline's new top level page table */ + leal TRAMPOLINE_32BIT_PGTABLE_OFFSET(%ecx), %eax + movl %eax, %cr3 +3: + /* Enable PAE and LA57 (if required) paging modes */ + movl $X86_CR4_PAE, %eax + cmpl $0, %edx + jz 1f + orl $X86_CR4_LA57, %eax +1: movl %eax, %cr4 - /* Calculate address we are running at */ - call 1f -1: popl %edi - subl $1b, %edi + /* Calculate address of paging_enabled() once we are executing in the trampoline */ + leal paging_enabled - trampoline_32bit_src + TRAMPOLINE_32BIT_CODE_OFFSET(%ecx), %eax - /* Prepare stack for far return to Long Mode */ + /* Prepare the stack for far return to Long Mode */ pushl $__KERNEL_CS - leal lvl5(%edi), %eax - push %eax + pushl %eax - /* Enable paging back */ + /* Enable paging again */ movl $(X86_CR0_PG | X86_CR0_PE), %eax movl %eax, %cr0 lret -#endif + .code64 +paging_enabled: + /* Return from the trampoline */ + jmp *%rdi + + /* + * The trampoline code has a size limit. + * Make sure we fail to compile if the trampoline code grows + * beyond TRAMPOLINE_32BIT_CODE_SIZE bytes. + */ + .org trampoline_32bit_src + TRAMPOLINE_32BIT_CODE_SIZE + + .code32 no_longmode: - /* This isn't an x86-64 CPU so hang */ + /* This isn't an x86-64 CPU, so hang intentionally, we cannot continue */ 1: hlt jmp 1b @@ -537,6 +643,11 @@ no_longmode: #include "../../kernel/verify_cpu.S" .data +gdt64: + .word gdt_end - gdt + .long 0 + .word 0 + .quad 0 gdt: .word gdt_end - gdt .long gdt @@ -585,7 +696,10 @@ boot_stack_end: .balign 4096 pgtable: .fill BOOT_PGT_SIZE, 1, 0 -#ifdef CONFIG_X86_5LEVEL -lvl5_pgtable: + +/* + * The page table is going to be used instead of page table in the trampoline + * memory. + */ +top_pgtable: .fill PAGE_SIZE, 1, 0 -#endif diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index 8199a6187251..9ed9709d9947 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -23,11 +23,8 @@ * _ctype[] in lib/ctype.c is needed by isspace() of linux/ctype.h. * While both lib/ctype.c and lib/cmdline.c will bring EXPORT_SYMBOL * which is meaningless and will cause compiling error in some cases. - * So do not include linux/export.h and define EXPORT_SYMBOL(sym) - * as empty. */ -#define _LINUX_EXPORT_H -#define EXPORT_SYMBOL(sym) +#define __DISABLE_EXPORTS #include "misc.h" #include "error.h" @@ -46,8 +43,17 @@ #define STATIC #include <linux/decompress/mm.h> +#ifdef CONFIG_X86_5LEVEL +unsigned int __pgtable_l5_enabled; +unsigned int pgdir_shift __ro_after_init = 39; +unsigned int ptrs_per_p4d __ro_after_init = 1; +#endif + extern unsigned long get_cmd_line_ptr(void); +/* Used by PAGE_KERN* macros: */ +pteval_t __default_kernel_pte_mask __read_mostly = ~0; + /* Simplified build-specific string for starting entropy. */ static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@" LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION; @@ -93,7 +99,7 @@ static bool memmap_too_large; /* Store memory limit specified by "mem=nn[KMG]" or "memmap=nn[KMG]" */ -unsigned long long mem_limit = ULLONG_MAX; +static unsigned long long mem_limit = ULLONG_MAX; enum mem_avoid_index { @@ -206,7 +212,36 @@ static void mem_avoid_memmap(char *str) memmap_too_large = true; } -static int handle_mem_memmap(void) +/* Store the number of 1GB huge pages which users specified: */ +static unsigned long max_gb_huge_pages; + +static void parse_gb_huge_pages(char *param, char *val) +{ + static bool gbpage_sz; + char *p; + + if (!strcmp(param, "hugepagesz")) { + p = val; + if (memparse(p, &p) != PUD_SIZE) { + gbpage_sz = false; + return; + } + + if (gbpage_sz) + warn("Repeatedly set hugeTLB page size of 1G!\n"); + gbpage_sz = true; + return; + } + + if (!strcmp(param, "hugepages") && gbpage_sz) { + p = val; + max_gb_huge_pages = simple_strtoull(p, &p, 0); + return; + } +} + + +static void handle_mem_options(void) { char *args = (char *)get_cmd_line_ptr(); size_t len = strlen((char *)args); @@ -214,8 +249,9 @@ static int handle_mem_memmap(void) char *param, *val; u64 mem_size; - if (!strstr(args, "memmap=") && !strstr(args, "mem=")) - return 0; + if (!strstr(args, "memmap=") && !strstr(args, "mem=") && + !strstr(args, "hugepages")) + return; tmp_cmdline = malloc(len + 1); if (!tmp_cmdline) @@ -233,28 +269,29 @@ static int handle_mem_memmap(void) /* Stop at -- */ if (!val && strcmp(param, "--") == 0) { warn("Only '--' specified in cmdline"); - free(tmp_cmdline); - return -1; + goto out; } if (!strcmp(param, "memmap")) { mem_avoid_memmap(val); + } else if (strstr(param, "hugepages")) { + parse_gb_huge_pages(param, val); } else if (!strcmp(param, "mem")) { char *p = val; if (!strcmp(p, "nopentium")) continue; mem_size = memparse(p, &p); - if (mem_size == 0) { - free(tmp_cmdline); - return -EINVAL; - } + if (mem_size == 0) + goto out; + mem_limit = mem_size; } } +out: free(tmp_cmdline); - return 0; + return; } /* @@ -378,7 +415,7 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size, /* We don't need to set a mapping for setup_data. */ /* Mark the memmap regions we need to avoid */ - handle_mem_memmap(); + handle_mem_options(); #ifdef CONFIG_X86_VERBOSE_BOOTUP /* Make sure video RAM can be used. */ @@ -457,6 +494,60 @@ static void store_slot_info(struct mem_vector *region, unsigned long image_size) } } +/* + * Skip as many 1GB huge pages as possible in the passed region + * according to the number which users specified: + */ +static void +process_gb_huge_pages(struct mem_vector *region, unsigned long image_size) +{ + unsigned long addr, size = 0; + struct mem_vector tmp; + int i = 0; + + if (!max_gb_huge_pages) { + store_slot_info(region, image_size); + return; + } + + addr = ALIGN(region->start, PUD_SIZE); + /* Did we raise the address above the passed in memory entry? */ + if (addr < region->start + region->size) + size = region->size - (addr - region->start); + + /* Check how many 1GB huge pages can be filtered out: */ + while (size > PUD_SIZE && max_gb_huge_pages) { + size -= PUD_SIZE; + max_gb_huge_pages--; + i++; + } + + /* No good 1GB huge pages found: */ + if (!i) { + store_slot_info(region, image_size); + return; + } + + /* + * Skip those 'i'*1GB good huge pages, and continue checking and + * processing the remaining head or tail part of the passed region + * if available. + */ + + if (addr >= region->start + image_size) { + tmp.start = region->start; + tmp.size = addr - region->start; + store_slot_info(&tmp, image_size); + } + + size = region->size - (addr - region->start) - i * PUD_SIZE; + if (size >= image_size) { + tmp.start = addr + i * PUD_SIZE; + tmp.size = size; + store_slot_info(&tmp, image_size); + } +} + static unsigned long slots_fetch_random(void) { unsigned long slot; @@ -486,7 +577,6 @@ static void process_mem_region(struct mem_vector *entry, unsigned long image_size) { struct mem_vector region, overlap; - struct slot_area slot_area; unsigned long start_orig, end; struct mem_vector cur_entry; @@ -537,7 +627,7 @@ static void process_mem_region(struct mem_vector *entry, /* If nothing overlaps, store the region and return. */ if (!mem_avoid_overlap(®ion, &overlap)) { - store_slot_info(®ion, image_size); + process_gb_huge_pages(®ion, image_size); return; } @@ -547,7 +637,7 @@ static void process_mem_region(struct mem_vector *entry, beginning.start = region.start; beginning.size = overlap.start - region.start; - store_slot_info(&beginning, image_size); + process_gb_huge_pages(&beginning, image_size); } /* Return if overlap extends to or past end of region. */ @@ -723,6 +813,14 @@ void choose_random_location(unsigned long input, return; } +#ifdef CONFIG_X86_5LEVEL + if (__read_cr4() & X86_CR4_LA57) { + __pgtable_l5_enabled = 1; + pgdir_shift = 48; + ptrs_per_p4d = 512; + } +#endif + boot_params->hdr.loadflags |= KASLR_FLAG; /* Prepare to add new identity pagetables on demand. */ diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/kaslr_64.c index b5e5e02f8cde..748456c365f4 100644 --- a/arch/x86/boot/compressed/pagetable.c +++ b/arch/x86/boot/compressed/kaslr_64.c @@ -16,13 +16,6 @@ #define __pa(x) ((unsigned long)(x)) #define __va(x) ((void *)((unsigned long)(x))) -/* - * The pgtable.h and mm/ident_map.c includes make use of the SME related - * information which is not used in the compressed image support. Un-define - * the SME support to avoid any compile and link errors. - */ -#undef CONFIG_AMD_MEM_ENCRYPT - /* No PAGE_TABLE_ISOLATION support needed either: */ #undef CONFIG_PAGE_TABLE_ISOLATION @@ -76,6 +69,8 @@ static struct alloc_pgt_data pgt_data; /* The top level page table entry pointer. */ static unsigned long top_level_pgt; +phys_addr_t physical_mask = (1ULL << __PHYSICAL_MASK_SHIFT) - 1; + /* * Mapping information structure passed to kernel_ident_mapping_init(). * Due to relocation, pointers must be assigned at run time not build time. @@ -85,13 +80,17 @@ static struct x86_mapping_info mapping_info; /* Locates and clears a region for a new top level page table. */ void initialize_identity_maps(void) { - unsigned long sev_me_mask = get_sev_encryption_mask(); + /* If running as an SEV guest, the encryption mask is required. */ + set_sev_encryption_mask(); + + /* Exclude the encryption mask from __PHYSICAL_MASK */ + physical_mask &= ~sme_me_mask; /* Init mapping_info with run-time function/buffer pointers. */ mapping_info.alloc_pgt_page = alloc_pgt_page; mapping_info.context = &pgt_data; - mapping_info.page_flag = __PAGE_KERNEL_LARGE_EXEC | sev_me_mask; - mapping_info.kernpg_flag = _KERNPG_TABLE | sev_me_mask; + mapping_info.page_flag = __PAGE_KERNEL_LARGE_EXEC | sme_me_mask; + mapping_info.kernpg_flag = _KERNPG_TABLE; /* * It should be impossible for this not to already be true, diff --git a/arch/x86/boot/compressed/mem_encrypt.S b/arch/x86/boot/compressed/mem_encrypt.S index 54f5f6625a73..a480356e0ed8 100644 --- a/arch/x86/boot/compressed/mem_encrypt.S +++ b/arch/x86/boot/compressed/mem_encrypt.S @@ -25,20 +25,6 @@ ENTRY(get_sev_encryption_bit) push %ebx push %ecx push %edx - push %edi - - /* - * RIP-relative addressing is needed to access the encryption bit - * variable. Since we are running in 32-bit mode we need this call/pop - * sequence to get the proper relative addressing. - */ - call 1f -1: popl %edi - subl $1b, %edi - - movl enc_bit(%edi), %eax - cmpl $0, %eax - jge .Lsev_exit /* Check if running under a hypervisor */ movl $1, %eax @@ -69,15 +55,12 @@ ENTRY(get_sev_encryption_bit) movl %ebx, %eax andl $0x3f, %eax /* Return the encryption bit location */ - movl %eax, enc_bit(%edi) jmp .Lsev_exit .Lno_sev: xor %eax, %eax - movl %eax, enc_bit(%edi) .Lsev_exit: - pop %edi pop %edx pop %ecx pop %ebx @@ -88,9 +71,7 @@ ENTRY(get_sev_encryption_bit) ENDPROC(get_sev_encryption_bit) .code64 -ENTRY(get_sev_encryption_mask) - xor %rax, %rax - +ENTRY(set_sev_encryption_mask) #ifdef CONFIG_AMD_MEM_ENCRYPT push %rbp push %rdx @@ -101,9 +82,7 @@ ENTRY(get_sev_encryption_mask) testl %eax, %eax jz .Lno_sev_mask - xor %rdx, %rdx - bts %rax, %rdx /* Create the encryption mask */ - mov %rdx, %rax /* ... and return it */ + bts %rax, sme_me_mask(%rip) /* Create the encryption mask */ .Lno_sev_mask: movq %rbp, %rsp /* Restore original stack pointer */ @@ -112,9 +91,14 @@ ENTRY(get_sev_encryption_mask) pop %rbp #endif + xor %rax, %rax ret -ENDPROC(get_sev_encryption_mask) +ENDPROC(set_sev_encryption_mask) .data -enc_bit: - .int 0xffffffff + +#ifdef CONFIG_AMD_MEM_ENCRYPT + .balign 8 +GLOBAL(sme_me_mask) + .quad 0 +#endif diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 252fee320816..8dd1d5ccae58 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -14,6 +14,7 @@ #include "misc.h" #include "error.h" +#include "pgtable.h" #include "../string.h" #include "../voffset.h" @@ -169,16 +170,6 @@ void __puthex(unsigned long value) } } -static bool l5_supported(void) -{ - /* Check if leaf 7 is supported. */ - if (native_cpuid_eax(0) < 7) - return 0; - - /* Check if la57 is supported. */ - return native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31)); -} - #if CONFIG_X86_NEED_RELOCS static void handle_relocations(void *output, unsigned long output_len, unsigned long virt_addr) @@ -376,12 +367,6 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, console_init(); debug_putstr("early console in extract_kernel\n"); - if (IS_ENABLED(CONFIG_X86_5LEVEL) && !l5_supported()) { - error("This linux kernel as configured requires 5-level paging\n" - "This CPU does not support the required 'cr4.la57' feature\n" - "Unable to boot - please use a kernel appropriate for your CPU\n"); - } - free_mem_ptr = heap; /* Heap */ free_mem_end_ptr = heap + BOOT_HEAP_SIZE; @@ -392,6 +377,11 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, debug_putaddr(output_len); debug_putaddr(kernel_total_size); +#ifdef CONFIG_X86_64 + /* Report address of 32-bit trampoline */ + debug_putaddr(trampoline_32bit); +#endif + /* * The memory hole needed for the kernel is the larger of either * the entire decompressed kernel plus relocation table, or the diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 9d323dc6b159..a1d5918765f3 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -9,9 +9,13 @@ * paravirt and debugging variants are added.) */ #undef CONFIG_PARAVIRT +#undef CONFIG_PARAVIRT_XXL #undef CONFIG_PARAVIRT_SPINLOCKS #undef CONFIG_KASAN +/* cpu_feature_enabled() cannot be used this early */ +#define USE_EARLY_PGTABLE_L5 + #include <linux/linkage.h> #include <linux/screen_info.h> #include <linux/elf.h> @@ -109,6 +113,6 @@ static inline void console_init(void) { } #endif -unsigned long get_sev_encryption_mask(void); +void set_sev_encryption_mask(void); #endif diff --git a/arch/x86/boot/compressed/pgtable.h b/arch/x86/boot/compressed/pgtable.h new file mode 100644 index 000000000000..91f75638f6e6 --- /dev/null +++ b/arch/x86/boot/compressed/pgtable.h @@ -0,0 +1,20 @@ +#ifndef BOOT_COMPRESSED_PAGETABLE_H +#define BOOT_COMPRESSED_PAGETABLE_H + +#define TRAMPOLINE_32BIT_SIZE (2 * PAGE_SIZE) + +#define TRAMPOLINE_32BIT_PGTABLE_OFFSET 0 + +#define TRAMPOLINE_32BIT_CODE_OFFSET PAGE_SIZE +#define TRAMPOLINE_32BIT_CODE_SIZE 0x60 + +#define TRAMPOLINE_32BIT_STACK_END TRAMPOLINE_32BIT_SIZE + +#ifndef __ASSEMBLER__ + +extern unsigned long *trampoline_32bit; + +extern void trampoline_32bit_src(void *return_ptr); + +#endif /* __ASSEMBLER__ */ +#endif /* BOOT_COMPRESSED_PAGETABLE_H */ diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c index b4469a37e9a1..9e2157371491 100644 --- a/arch/x86/boot/compressed/pgtable_64.c +++ b/arch/x86/boot/compressed/pgtable_64.c @@ -1,4 +1,7 @@ +#include <asm/e820/types.h> #include <asm/processor.h> +#include "pgtable.h" +#include "../string.h" /* * __force_order is used by special_insns.h asm code to force instruction @@ -9,20 +12,180 @@ */ unsigned long __force_order; -int l5_paging_required(void) +#define BIOS_START_MIN 0x20000U /* 128K, less than this is insane */ +#define BIOS_START_MAX 0x9f000U /* 640K, absolute maximum */ + +struct paging_config { + unsigned long trampoline_start; + unsigned long l5_required; +}; + +/* Buffer to preserve trampoline memory */ +static char trampoline_save[TRAMPOLINE_32BIT_SIZE]; + +/* + * Trampoline address will be printed by extract_kernel() for debugging + * purposes. + * + * Avoid putting the pointer into .bss as it will be cleared between + * paging_prepare() and extract_kernel(). + */ +unsigned long *trampoline_32bit __section(.data); + +extern struct boot_params *boot_params; +int cmdline_find_option_bool(const char *option); + +static unsigned long find_trampoline_placement(void) +{ + unsigned long bios_start, ebda_start; + unsigned long trampoline_start; + struct boot_e820_entry *entry; + int i; + + /* + * Find a suitable spot for the trampoline. + * This code is based on reserve_bios_regions(). + */ + + ebda_start = *(unsigned short *)0x40e << 4; + bios_start = *(unsigned short *)0x413 << 10; + + if (bios_start < BIOS_START_MIN || bios_start > BIOS_START_MAX) + bios_start = BIOS_START_MAX; + + if (ebda_start > BIOS_START_MIN && ebda_start < bios_start) + bios_start = ebda_start; + + bios_start = round_down(bios_start, PAGE_SIZE); + + /* Find the first usable memory region under bios_start. */ + for (i = boot_params->e820_entries - 1; i >= 0; i--) { + entry = &boot_params->e820_table[i]; + + /* Skip all entries above bios_start. */ + if (bios_start <= entry->addr) + continue; + + /* Skip non-RAM entries. */ + if (entry->type != E820_TYPE_RAM) + continue; + + /* Adjust bios_start to the end of the entry if needed. */ + if (bios_start > entry->addr + entry->size) + bios_start = entry->addr + entry->size; + + /* Keep bios_start page-aligned. */ + bios_start = round_down(bios_start, PAGE_SIZE); + + /* Skip the entry if it's too small. */ + if (bios_start - TRAMPOLINE_32BIT_SIZE < entry->addr) + continue; + + break; + } + + /* Place the trampoline just below the end of low memory */ + return bios_start - TRAMPOLINE_32BIT_SIZE; +} + +struct paging_config paging_prepare(void *rmode) { - /* Check if leaf 7 is supported. */ + struct paging_config paging_config = {}; + + /* Initialize boot_params. Required for cmdline_find_option_bool(). */ + boot_params = rmode; + + /* + * Check if LA57 is desired and supported. + * + * There are several parts to the check: + * - if the kernel supports 5-level paging: CONFIG_X86_5LEVEL=y + * - if user asked to disable 5-level paging: no5lvl in cmdline + * - if the machine supports 5-level paging: + * + CPUID leaf 7 is supported + * + the leaf has the feature bit set + * + * That's substitute for boot_cpu_has() in early boot code. + */ + if (IS_ENABLED(CONFIG_X86_5LEVEL) && + !cmdline_find_option_bool("no5lvl") && + native_cpuid_eax(0) >= 7 && + (native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31)))) { + paging_config.l5_required = 1; + } + + paging_config.trampoline_start = find_trampoline_placement(); + + trampoline_32bit = (unsigned long *)paging_config.trampoline_start; + + /* Preserve trampoline memory */ + memcpy(trampoline_save, trampoline_32bit, TRAMPOLINE_32BIT_SIZE); - if (native_cpuid_eax(0) < 7) - return 0; + /* Clear trampoline memory first */ + memset(trampoline_32bit, 0, TRAMPOLINE_32BIT_SIZE); + + /* Copy trampoline code in place */ + memcpy(trampoline_32bit + TRAMPOLINE_32BIT_CODE_OFFSET / sizeof(unsigned long), + &trampoline_32bit_src, TRAMPOLINE_32BIT_CODE_SIZE); + + /* + * The code below prepares page table in trampoline memory. + * + * The new page table will be used by trampoline code for switching + * from 4- to 5-level paging or vice versa. + * + * If switching is not required, the page table is unused: trampoline + * code wouldn't touch CR3. + */ + + /* + * We are not going to use the page table in trampoline memory if we + * are already in the desired paging mode. + */ + if (paging_config.l5_required == !!(native_read_cr4() & X86_CR4_LA57)) + goto out; + + if (paging_config.l5_required) { + /* + * For 4- to 5-level paging transition, set up current CR3 as + * the first and the only entry in a new top-level page table. + */ + trampoline_32bit[TRAMPOLINE_32BIT_PGTABLE_OFFSET] = __native_read_cr3() | _PAGE_TABLE_NOENC; + } else { + unsigned long src; + + /* + * For 5- to 4-level paging transition, copy page table pointed + * by first entry in the current top-level page table as our + * new top-level page table. + * + * We cannot just point to the page table from trampoline as it + * may be above 4G. + */ + src = *(unsigned long *)__native_read_cr3() & PAGE_MASK; + memcpy(trampoline_32bit + TRAMPOLINE_32BIT_PGTABLE_OFFSET / sizeof(unsigned long), + (void *)src, PAGE_SIZE); + } + +out: + return paging_config; +} + +void cleanup_trampoline(void *pgtable) +{ + void *trampoline_pgtable; - /* Check if la57 is supported. */ - if (!(native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31)))) - return 0; + trampoline_pgtable = trampoline_32bit + TRAMPOLINE_32BIT_PGTABLE_OFFSET / sizeof(unsigned long); - /* Check if 5-level paging has already been enabled. */ - if (native_read_cr4() & X86_CR4_LA57) - return 0; + /* + * Move the top level page table out of trampoline memory, + * if it's there. + */ + if ((void *)__native_read_cr3() == trampoline_pgtable) { + memcpy(pgtable, trampoline_pgtable, PAGE_SIZE); + native_write_cr3((unsigned long)pgtable); + } - return 1; + /* Restore trampoline memory */ + memcpy(trampoline_32bit, trampoline_save, TRAMPOLINE_32BIT_SIZE); } |

