diff options
Diffstat (limited to 'arch/x86/kernel')
43 files changed, 907 insertions, 1058 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index bb8529275aab..8215e5652d97 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -35,7 +35,6 @@ obj-y += tsc.o io_delay.o rtc.o obj-y += pci-iommu_table.o obj-y += resource.o -obj-y += trampoline.o trampoline_$(BITS).o obj-y += process.o obj-y += i387.o xsave.o obj-y += ptrace.o @@ -48,7 +47,6 @@ obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += cpu/ obj-y += acpi/ obj-y += reboot.o -obj-$(CONFIG_X86_32) += reboot_32.o obj-$(CONFIG_X86_MSR) += msr.o obj-$(CONFIG_X86_CPUID) += cpuid.o obj-$(CONFIG_PCI) += early-quirks.o @@ -100,6 +98,7 @@ obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o obj-$(CONFIG_OF) += devicetree.o +obj-$(CONFIG_UPROBES) += uprobes.o ### # 64 bit specific files diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile index 6f35260bb3ef..163b22581472 100644 --- a/arch/x86/kernel/acpi/Makefile +++ b/arch/x86/kernel/acpi/Makefile @@ -1,14 +1,7 @@ -subdir- := realmode - obj-$(CONFIG_ACPI) += boot.o -obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_rm.o wakeup_$(BITS).o +obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o ifneq ($(CONFIG_ACPI_PROCESSOR),) obj-y += cstate.o endif -$(obj)/wakeup_rm.o: $(obj)/realmode/wakeup.bin - -$(obj)/realmode/wakeup.bin: FORCE - $(Q)$(MAKE) $(build)=$(obj)/realmode - diff --git a/arch/x86/kernel/acpi/realmode/.gitignore b/arch/x86/kernel/acpi/realmode/.gitignore deleted file mode 100644 index 58f1f48a58f8..000000000000 --- a/arch/x86/kernel/acpi/realmode/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -wakeup.bin -wakeup.elf -wakeup.lds diff --git a/arch/x86/kernel/acpi/realmode/Makefile b/arch/x86/kernel/acpi/realmode/Makefile deleted file mode 100644 index 6a564ac67ef5..000000000000 --- a/arch/x86/kernel/acpi/realmode/Makefile +++ /dev/null @@ -1,59 +0,0 @@ -# -# arch/x86/kernel/acpi/realmode/Makefile -# -# This file is subject to the terms and conditions of the GNU General Public -# License. See the file "COPYING" in the main directory of this archive -# for more details. -# - -always := wakeup.bin -targets := wakeup.elf wakeup.lds - -wakeup-y += wakeup.o wakemain.o video-mode.o copy.o bioscall.o regs.o - -# The link order of the video-*.o modules can matter. In particular, -# video-vga.o *must* be listed first, followed by video-vesa.o. -# Hardware-specific drivers should follow in the order they should be -# probed, and video-bios.o should typically be last. -wakeup-y += video-vga.o -wakeup-y += video-vesa.o -wakeup-y += video-bios.o - -targets += $(wakeup-y) - -bootsrc := $(src)/../../../boot - -# --------------------------------------------------------------------------- - -# How to compile the 16-bit code. Note we always compile for -march=i386, -# that way we can complain to the user if the CPU is insufficient. -# Compile with _SETUP since this is similar to the boot-time setup code. -KBUILD_CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D_WAKEUP -D__KERNEL__ \ - -I$(srctree)/$(bootsrc) \ - $(cflags-y) \ - -Wall -Wstrict-prototypes \ - -march=i386 -mregparm=3 \ - -include $(srctree)/$(bootsrc)/code16gcc.h \ - -fno-strict-aliasing -fomit-frame-pointer \ - $(call cc-option, -ffreestanding) \ - $(call cc-option, -fno-toplevel-reorder,\ - $(call cc-option, -fno-unit-at-a-time)) \ - $(call cc-option, -fno-stack-protector) \ - $(call cc-option, -mpreferred-stack-boundary=2) -KBUILD_CFLAGS += $(call cc-option, -m32) -KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ -GCOV_PROFILE := n - -WAKEUP_OBJS = $(addprefix $(obj)/,$(wakeup-y)) - -LDFLAGS_wakeup.elf := -T - -CPPFLAGS_wakeup.lds += -P -C - -$(obj)/wakeup.elf: $(obj)/wakeup.lds $(WAKEUP_OBJS) FORCE - $(call if_changed,ld) - -OBJCOPYFLAGS_wakeup.bin := -O binary - -$(obj)/wakeup.bin: $(obj)/wakeup.elf FORCE - $(call if_changed,objcopy) diff --git a/arch/x86/kernel/acpi/realmode/bioscall.S b/arch/x86/kernel/acpi/realmode/bioscall.S deleted file mode 100644 index f51eb0bb56ce..000000000000 --- a/arch/x86/kernel/acpi/realmode/bioscall.S +++ /dev/null @@ -1 +0,0 @@ -#include "../../../boot/bioscall.S" diff --git a/arch/x86/kernel/acpi/realmode/copy.S b/arch/x86/kernel/acpi/realmode/copy.S deleted file mode 100644 index dc59ebee69d8..000000000000 --- a/arch/x86/kernel/acpi/realmode/copy.S +++ /dev/null @@ -1 +0,0 @@ -#include "../../../boot/copy.S" diff --git a/arch/x86/kernel/acpi/realmode/regs.c b/arch/x86/kernel/acpi/realmode/regs.c deleted file mode 100644 index 6206033ba202..000000000000 --- a/arch/x86/kernel/acpi/realmode/regs.c +++ /dev/null @@ -1 +0,0 @@ -#include "../../../boot/regs.c" diff --git a/arch/x86/kernel/acpi/realmode/video-bios.c b/arch/x86/kernel/acpi/realmode/video-bios.c deleted file mode 100644 index 7deabc144a27..000000000000 --- a/arch/x86/kernel/acpi/realmode/video-bios.c +++ /dev/null @@ -1 +0,0 @@ -#include "../../../boot/video-bios.c" diff --git a/arch/x86/kernel/acpi/realmode/video-mode.c b/arch/x86/kernel/acpi/realmode/video-mode.c deleted file mode 100644 index 328ad209f113..000000000000 --- a/arch/x86/kernel/acpi/realmode/video-mode.c +++ /dev/null @@ -1 +0,0 @@ -#include "../../../boot/video-mode.c" diff --git a/arch/x86/kernel/acpi/realmode/video-vesa.c b/arch/x86/kernel/acpi/realmode/video-vesa.c deleted file mode 100644 index 9dbb9672226a..000000000000 --- a/arch/x86/kernel/acpi/realmode/video-vesa.c +++ /dev/null @@ -1 +0,0 @@ -#include "../../../boot/video-vesa.c" diff --git a/arch/x86/kernel/acpi/realmode/video-vga.c b/arch/x86/kernel/acpi/realmode/video-vga.c deleted file mode 100644 index bcc81255f374..000000000000 --- a/arch/x86/kernel/acpi/realmode/video-vga.c +++ /dev/null @@ -1 +0,0 @@ -#include "../../../boot/video-vga.c" diff --git a/arch/x86/kernel/acpi/realmode/wakemain.c b/arch/x86/kernel/acpi/realmode/wakemain.c deleted file mode 100644 index 883962d9eef2..000000000000 --- a/arch/x86/kernel/acpi/realmode/wakemain.c +++ /dev/null @@ -1,81 +0,0 @@ -#include "wakeup.h" -#include "boot.h" - -static void udelay(int loops) -{ - while (loops--) - io_delay(); /* Approximately 1 us */ -} - -static void beep(unsigned int hz) -{ - u8 enable; - - if (!hz) { - enable = 0x00; /* Turn off speaker */ - } else { - u16 div = 1193181/hz; - - outb(0xb6, 0x43); /* Ctr 2, squarewave, load, binary */ - io_delay(); - outb(div, 0x42); /* LSB of counter */ - io_delay(); - outb(div >> 8, 0x42); /* MSB of counter */ - io_delay(); - - enable = 0x03; /* Turn on speaker */ - } - inb(0x61); /* Dummy read of System Control Port B */ - io_delay(); - outb(enable, 0x61); /* Enable timer 2 output to speaker */ - io_delay(); -} - -#define DOT_HZ 880 -#define DASH_HZ 587 -#define US_PER_DOT 125000 - -/* Okay, this is totally silly, but it's kind of fun. */ -static void send_morse(const char *pattern) -{ - char s; - - while ((s = *pattern++)) { - switch (s) { - case '.': - beep(DOT_HZ); - udelay(US_PER_DOT); - beep(0); - udelay(US_PER_DOT); - break; - case '-': - beep(DASH_HZ); - udelay(US_PER_DOT * 3); - beep(0); - udelay(US_PER_DOT); - break; - default: /* Assume it's a space */ - udelay(US_PER_DOT * 3); - break; - } - } -} - -void main(void) -{ - /* Kill machine if structures are wrong */ - if (wakeup_header.real_magic != 0x12345678) - while (1); - - if (wakeup_header.realmode_flags & 4) - send_morse("...-"); - - if (wakeup_header.realmode_flags & 1) - asm volatile("lcallw $0xc000,$3"); - - if (wakeup_header.realmode_flags & 2) { - /* Need to call BIOS */ - probe_cards(0); - set_mode(wakeup_header.video_mode); - } -} diff --git a/arch/x86/kernel/acpi/realmode/wakeup.S b/arch/x86/kernel/acpi/realmode/wakeup.S deleted file mode 100644 index b4fd836e4053..000000000000 --- a/arch/x86/kernel/acpi/realmode/wakeup.S +++ /dev/null @@ -1,170 +0,0 @@ -/* - * ACPI wakeup real mode startup stub - */ -#include <asm/segment.h> -#include <asm/msr-index.h> -#include <asm/page_types.h> -#include <asm/pgtable_types.h> -#include <asm/processor-flags.h> -#include "wakeup.h" - - .code16 - .section ".jump", "ax" - .globl _start -_start: - cli - jmp wakeup_code - -/* This should match the structure in wakeup.h */ - .section ".header", "a" - .globl wakeup_header -wakeup_header: -video_mode: .short 0 /* Video mode number */ -pmode_return: .byte 0x66, 0xea /* ljmpl */ - .long 0 /* offset goes here */ - .short __KERNEL_CS -pmode_cr0: .long 0 /* Saved %cr0 */ -pmode_cr3: .long 0 /* Saved %cr3 */ -pmode_cr4: .long 0 /* Saved %cr4 */ -pmode_efer: .quad 0 /* Saved EFER */ -pmode_gdt: .quad 0 -pmode_misc_en: .quad 0 /* Saved MISC_ENABLE MSR */ -pmode_behavior: .long 0 /* Wakeup behavior flags */ -realmode_flags: .long 0 -real_magic: .long 0 -trampoline_segment: .word 0 -_pad1: .byte 0 -wakeup_jmp: .byte 0xea /* ljmpw */ -wakeup_jmp_off: .word 3f -wakeup_jmp_seg: .word 0 -wakeup_gdt: .quad 0, 0, 0 -signature: .long WAKEUP_HEADER_SIGNATURE - - .text - .code16 -wakeup_code: - cld - - /* Apparently some dimwit BIOS programmers don't know how to - program a PM to RM transition, and we might end up here with - junk in the data segment descriptor registers. The only way - to repair that is to go into PM and fix it ourselves... */ - movw $16, %cx - lgdtl %cs:wakeup_gdt - movl %cr0, %eax - orb $X86_CR0_PE, %al - movl %eax, %cr0 - jmp 1f -1: ljmpw $8, $2f -2: - movw %cx, %ds - movw %cx, %es - movw %cx, %ss - movw %cx, %fs - movw %cx, %gs - - andb $~X86_CR0_PE, %al - movl %eax, %cr0 - jmp wakeup_jmp -3: - /* Set up segments */ - movw %cs, %ax - movw %ax, %ds - movw %ax, %es - movw %ax, %ss - lidtl wakeup_idt - - movl $wakeup_stack_end, %esp - - /* Clear the EFLAGS */ - pushl $0 - popfl - - /* Check header signature... */ - movl signature, %eax - cmpl $WAKEUP_HEADER_SIGNATURE, %eax - jne bogus_real_magic - - /* Check we really have everything... */ - movl end_signature, %eax - cmpl $WAKEUP_END_SIGNATURE, %eax - jne bogus_real_magic - - /* Call the C code */ - calll main - - /* Restore MISC_ENABLE before entering protected mode, in case - BIOS decided to clear XD_DISABLE during S3. */ - movl pmode_behavior, %eax - btl $WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE, %eax - jnc 1f - - movl pmode_misc_en, %eax - movl pmode_misc_en + 4, %edx - movl $MSR_IA32_MISC_ENABLE, %ecx - wrmsr -1: - - /* Do any other stuff... */ - -#ifndef CONFIG_64BIT - /* This could also be done in C code... */ - movl pmode_cr3, %eax - movl %eax, %cr3 - - movl pmode_cr4, %ecx - jecxz 1f - movl %ecx, %cr4 -1: - movl pmode_efer, %eax - movl pmode_efer + 4, %edx - movl %eax, %ecx - orl %edx, %ecx - jz 1f - movl $MSR_EFER, %ecx - wrmsr -1: - - lgdtl pmode_gdt - - /* This really couldn't... */ - movl pmode_cr0, %eax - movl %eax, %cr0 - jmp pmode_return -#else - pushw $0 - pushw trampoline_segment - pushw $0 - lret -#endif - -bogus_real_magic: -1: - hlt - jmp 1b - - .data - .balign 8 - - /* This is the standard real-mode IDT */ -wakeup_idt: - .word 0xffff /* limit */ - .long 0 /* address */ - .word 0 - - .globl HEAP, heap_end -HEAP: - .long wakeup_heap -heap_end: - .long wakeup_stack - - .bss -wakeup_heap: - .space 2048 -wakeup_stack: - .space 2048 -wakeup_stack_end: - - .section ".signature","a" -end_signature: - .long WAKEUP_END_SIGNATURE diff --git a/arch/x86/kernel/acpi/realmode/wakeup.h b/arch/x86/kernel/acpi/realmode/wakeup.h deleted file mode 100644 index 97a29e1430e3..000000000000 --- a/arch/x86/kernel/acpi/realmode/wakeup.h +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Definitions for the wakeup data structure at the head of the - * wakeup code. - */ - -#ifndef ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H -#define ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H - -#ifndef __ASSEMBLY__ -#include <linux/types.h> - -/* This must match data at wakeup.S */ -struct wakeup_header { - u16 video_mode; /* Video mode number */ - u16 _jmp1; /* ljmpl opcode, 32-bit only */ - u32 pmode_entry; /* Protected mode resume point, 32-bit only */ - u16 _jmp2; /* CS value, 32-bit only */ - u32 pmode_cr0; /* Protected mode cr0 */ - u32 pmode_cr3; /* Protected mode cr3 */ - u32 pmode_cr4; /* Protected mode cr4 */ - u32 pmode_efer_low; /* Protected mode EFER */ - u32 pmode_efer_high; - u64 pmode_gdt; - u32 pmode_misc_en_low; /* Protected mode MISC_ENABLE */ - u32 pmode_misc_en_high; - u32 pmode_behavior; /* Wakeup routine behavior flags */ - u32 realmode_flags; - u32 real_magic; - u16 trampoline_segment; /* segment with trampoline code, 64-bit only */ - u8 _pad1; - u8 wakeup_jmp; - u16 wakeup_jmp_off; - u16 wakeup_jmp_seg; - u64 wakeup_gdt[3]; - u32 signature; /* To check we have correct structure */ -} __attribute__((__packed__)); - -extern struct wakeup_header wakeup_header; -#endif - -#define WAKEUP_HEADER_OFFSET 8 -#define WAKEUP_HEADER_SIGNATURE 0x51ee1111 -#define WAKEUP_END_SIGNATURE 0x65a22c82 - -/* Wakeup behavior bits */ -#define WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE 0 - -#endif /* ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H */ diff --git a/arch/x86/kernel/acpi/realmode/wakeup.lds.S b/arch/x86/kernel/acpi/realmode/wakeup.lds.S deleted file mode 100644 index d4f8010a5b1b..000000000000 --- a/arch/x86/kernel/acpi/realmode/wakeup.lds.S +++ /dev/null @@ -1,62 +0,0 @@ -/* - * wakeup.ld - * - * Linker script for the real-mode wakeup code - */ -#undef i386 -#include "wakeup.h" - -OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386") -OUTPUT_ARCH(i386) -ENTRY(_start) - -SECTIONS -{ - . = 0; - .jump : { - *(.jump) - } = 0x90909090 - - . = WAKEUP_HEADER_OFFSET; - .header : { - *(.header) - } - - . = ALIGN(16); - .text : { - *(.text*) - } = 0x90909090 - - . = ALIGN(16); - .rodata : { - *(.rodata*) - } - - .videocards : { - video_cards = .; - *(.videocards) - video_cards_end = .; - } - - . = ALIGN(16); - .data : { - *(.data*) - } - - . = ALIGN(16); - .bss : { - __bss_start = .; - *(.bss) - __bss_end = .; - } - - .signature : { - *(.signature) - } - - _end = .; - - /DISCARD/ : { - *(.note*) - } -} diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 146a49c763a4..95bf99de9058 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -14,8 +14,9 @@ #include <asm/desc.h> #include <asm/pgtable.h> #include <asm/cacheflush.h> +#include <asm/realmode.h> -#include "realmode/wakeup.h" +#include "../../realmode/rm/wakeup.h" #include "sleep.h" unsigned long acpi_realmode_flags; @@ -36,13 +37,9 @@ asmlinkage void acpi_enter_s3(void) */ int acpi_suspend_lowlevel(void) { - struct wakeup_header *header; - /* address in low memory of the wakeup routine. */ - char *acpi_realmode; + struct wakeup_header *header = + (struct wakeup_header *) __va(real_mode_header->wakeup_header); - acpi_realmode = TRAMPOLINE_SYM(acpi_wakeup_code); - - header = (struct wakeup_header *)(acpi_realmode + WAKEUP_HEADER_OFFSET); if (header->signature != WAKEUP_HEADER_SIGNATURE) { printk(KERN_ERR "wakeup header does not match\n"); return -EINVAL; @@ -50,27 +47,6 @@ int acpi_suspend_lowlevel(void) header->video_mode = saved_video_mode; - header->wakeup_jmp_seg = acpi_wakeup_address >> 4; - - /* - * Set up the wakeup GDT. We set these up as Big Real Mode, - * that is, with limits set to 4 GB. At least the Lenovo - * Thinkpad X61 is known to need this for the video BIOS - * initialization quirk to work; this is likely to also - * be the case for other laptops or integrated video devices. - */ - - /* GDT[0]: GDT self-pointer */ - header->wakeup_gdt[0] = - (u64)(sizeof(header->wakeup_gdt) - 1) + - ((u64)__pa(&header->wakeup_gdt) << 16); - /* GDT[1]: big real mode-like code segment */ - header->wakeup_gdt[1] = - GDT_ENTRY(0x809b, acpi_wakeup_address, 0xfffff); - /* GDT[2]: big real mode-like data segment */ - header->wakeup_gdt[2] = - GDT_ENTRY(0x8093, acpi_wakeup_address, 0xfffff); - #ifndef CONFIG_64BIT store_gdt((struct desc_ptr *)&header->pmode_gdt); @@ -95,7 +71,6 @@ int acpi_suspend_lowlevel(void) header->pmode_cr3 = (u32)__pa(&initial_page_table); saved_magic = 0x12345678; #else /* CONFIG_64BIT */ - header->trampoline_segment = trampoline_address() >> 4; #ifdef CONFIG_SMP stack_start = (unsigned long)temp_stack + sizeof(temp_stack); early_gdt_descr.address = diff --git a/arch/x86/kernel/acpi/sleep.h b/arch/x86/kernel/acpi/sleep.h index d68677a2a010..5653a5791ec9 100644 --- a/arch/x86/kernel/acpi/sleep.h +++ b/arch/x86/kernel/acpi/sleep.h @@ -2,8 +2,8 @@ * Variables and functions used by the code in sleep.c */ -#include <asm/trampoline.h> #include <linux/linkage.h> +#include <asm/realmode.h> extern unsigned long saved_video_mode; extern long saved_magic; diff --git a/arch/x86/kernel/acpi/wakeup_rm.S b/arch/x86/kernel/acpi/wakeup_rm.S deleted file mode 100644 index 63b8ab524f2c..000000000000 --- a/arch/x86/kernel/acpi/wakeup_rm.S +++ /dev/null @@ -1,12 +0,0 @@ -/* - * Wrapper script for the realmode binary as a transport object - * before copying to low memory. - */ -#include <asm/page_types.h> - - .section ".x86_trampoline","a" - .balign PAGE_SIZE - .globl acpi_wakeup_code -acpi_wakeup_code: - .incbin "arch/x86/kernel/acpi/realmode/wakeup.bin" - .size acpi_wakeup_code, .-acpi_wakeup_code diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c index 507ea58688e2..cd8b166a1735 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-apei.c +++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c @@ -42,7 +42,8 @@ void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err) struct mce m; /* Only corrected MC is reported */ - if (!corrected) + if (!corrected || !(mem_err->validation_bits & + CPER_MEM_VALID_PHYSICAL_ADDRESS)) return; mce_setup(&m); diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 0c82091b1652..413c2ced887c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -126,6 +126,16 @@ static struct severity { SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), USER ), + MCESEV( + KEEP, "HT thread notices Action required: instruction fetch error", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR), + MCGMASK(MCG_STATUS_EIPV, 0) + ), + MCESEV( + AR, "Action required: instruction fetch error", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR), + USER + ), #endif MCESEV( PANIC, "Action required: unknown MCACOD", @@ -165,15 +175,19 @@ static struct severity { }; /* - * If the EIPV bit is set, it means the saved IP is the - * instruction which caused the MCE. + * If mcgstatus indicated that ip/cs on the stack were + * no good, then "m->cs" will be zero and we will have + * to assume the worst case (IN_KERNEL) as we actually + * have no idea what we were executing when the machine + * check hit. + * If we do have a good "m->cs" (or a faked one in the + * case we were executing in VM86 mode) we can use it to + * distinguish an exception taken in user from from one + * taken in the kernel. */ static int error_context(struct mce *m) { - if (m->mcgstatus & MCG_STATUS_EIPV) - return (m->ip && (m->cs & 3) == 3) ? IN_USER : IN_KERNEL; - /* Unknown, assume kernel */ - return IN_KERNEL; + return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL; } int mce_severity(struct mce *m, int tolerant, char **msg) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 2afcbd253e1d..0a687fd185e6 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -437,6 +437,14 @@ static inline void mce_gather_info(struct mce *m, struct pt_regs *regs) if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) { m->ip = regs->ip; m->cs = regs->cs; + + /* + * When in VM86 mode make the cs look like ring 3 + * always. This is a lie, but it's better than passing + * the additional vm86 bit around everywhere. + */ + if (v8086_mode(regs)) + m->cs |= 3; } /* Use accurate RIP reporting if available. */ if (rip_msr) @@ -641,16 +649,18 @@ EXPORT_SYMBOL_GPL(machine_check_poll); * Do a quick check if any of the events requires a panic. * This decides if we keep the events around or clear them. */ -static int mce_no_way_out(struct mce *m, char **msg) +static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp) { - int i; + int i, ret = 0; for (i = 0; i < banks; i++) { m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); + if (m->status & MCI_STATUS_VAL) + __set_bit(i, validp); if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY) - return 1; + ret = 1; } - return 0; + return ret; } /* @@ -1013,6 +1023,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) */ int kill_it = 0; DECLARE_BITMAP(toclear, MAX_NR_BANKS); + DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); char *msg = "Unknown"; atomic_inc(&mce_entry); @@ -1027,7 +1038,8 @@ void do_machine_check(struct pt_regs *regs, long error_code) final = &__get_cpu_var(mces_seen); *final = m; - no_way_out = mce_no_way_out(&m, &msg); + memset(valid_banks, 0, sizeof(valid_banks)); + no_way_out = mce_no_way_out(&m, &msg, valid_banks); barrier(); @@ -1047,6 +1059,8 @@ void do_machine_check(struct pt_regs *regs, long error_code) order = mce_start(&no_way_out); for (i = 0; i < banks; i++) { __clear_bit(i, toclear); + if (!test_bit(i, valid_banks)) + continue; if (!mce_banks[i].ctl) continue; @@ -1237,15 +1251,15 @@ void mce_log_therm_throt_event(__u64 status) * poller finds an MCE, poll 2x faster. When the poller finds no more * errors, poll 2x slower (up to check_interval seconds). */ -static int check_interval = 5 * 60; /* 5 minutes */ +static unsigned long check_interval = 5 * 60; /* 5 minutes */ -static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */ +static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */ static DEFINE_PER_CPU(struct timer_list, mce_timer); -static void mce_start_timer(unsigned long data) +static void mce_timer_fn(unsigned long data) { - struct timer_list *t = &per_cpu(mce_timer, data); - int *n; + struct timer_list *t = &__get_cpu_var(mce_timer); + unsigned long iv; WARN_ON(smp_processor_id() != data); @@ -1258,13 +1272,14 @@ static void mce_start_timer(unsigned long data) * Alert userspace if needed. If we logged an MCE, reduce the * polling interval, otherwise increase the polling interval. */ - n = &__get_cpu_var(mce_next_interval); + iv = __this_cpu_read(mce_next_interval); if (mce_notify_irq()) - *n = max(*n/2, HZ/100); + iv = max(iv, (unsigned long) HZ/100); else - *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ)); + iv = min(iv * 2, round_jiffies_relative(check_interval * HZ)); + __this_cpu_write(mce_next_interval, iv); - t->expires = jiffies + *n; + t->expires = jiffies + iv; add_timer_on(t, smp_processor_id()); } @@ -1458,9 +1473,9 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) rdmsrl(msrs[i], val); /* CntP bit set? */ - if (val & BIT(62)) { - val &= ~BIT(62); - wrmsrl(msrs[i], val); + if (val & BIT_64(62)) { + val &= ~BIT_64(62); + wrmsrl(msrs[i], val); } } @@ -1542,17 +1557,17 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) static void __mcheck_cpu_init_timer(void) { struct timer_list *t = &__get_cpu_var(mce_timer); - int *n = &__get_cpu_var(mce_next_interval); + unsigned long iv = __this_cpu_read(mce_next_interval); - setup_timer(t, mce_start_timer, smp_processor_id()); + setup_timer(t, mce_timer_fn, smp_processor_id()); if (mce_ignore_ce) return; - *n = check_interval * HZ; - if (!*n) + __this_cpu_write(mce_next_interval, iv); + if (!iv) return; - t->expires = round_jiffies(jiffies + *n); + t->expires = round_jiffies(jiffies + iv); add_timer_on(t, smp_processor_id()); } @@ -2262,7 +2277,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_DOWN_FAILED_FROZEN: if (!mce_ignore_ce && check_interval) { t->expires = round_jiffies(jiffies + - __get_cpu_var(mce_next_interval)); + per_cpu(mce_next_interval, cpu)); add_timer_on(t, cpu); } smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index ac140c7be396..bdda2e6c673b 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -266,7 +266,7 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk, if (align > max_align) align = max_align; - sizek = 1 << align; + sizek = 1UL << align; if (debug_print) { char start_factor = 'K', size_factor = 'K'; unsigned long start_base, size_base; diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 62d61e9976eb..41857970517f 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -113,7 +113,9 @@ static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size, int x = e820x->nr_map; if (x >= ARRAY_SIZE(e820x->map)) { - printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); + printk(KERN_ERR "e820: too many entries; ignoring [mem %#010llx-%#010llx]\n", + (unsigned long long) start, + (unsigned long long) (start + size - 1)); return; } @@ -133,19 +135,19 @@ static void __init e820_print_type(u32 type) switch (type) { case E820_RAM: case E820_RESERVED_KERN: - printk(KERN_CONT "(usable)"); + printk(KERN_CONT "usable"); break; case E820_RESERVED: - printk(KERN_CONT "(reserved)"); + printk(KERN_CONT "reserved"); break; case E820_ACPI: - printk(KERN_CONT "(ACPI data)"); + printk(KERN_CONT "ACPI data"); break; case E820_NVS: - printk(KERN_CONT "(ACPI NVS)"); + printk(KERN_CONT "ACPI NVS"); break; case E820_UNUSABLE: - printk(KERN_CONT "(unusable)"); + printk(KERN_CONT "unusable"); break; default: printk(KERN_CONT "type %u", type); @@ -158,10 +160,10 @@ void __init e820_print_map(char *who) int i; for (i = 0; i < e820.nr_map; i++) { - printk(KERN_INFO " %s: %016Lx - %016Lx ", who, + printk(KERN_INFO "%s: [mem %#018Lx-%#018Lx] ", who, (unsigned long long) e820.map[i].addr, (unsigned long long) - (e820.map[i].addr + e820.map[i].size)); + (e820.map[i].addr + e820.map[i].size - 1)); e820_print_type(e820.map[i].type); printk(KERN_CONT "\n"); } @@ -428,9 +430,8 @@ static u64 __init __e820_update_range(struct e820map *e820x, u64 start, size = ULLONG_MAX - start; end = start + size; - printk(KERN_DEBUG "e820 update range: %016Lx - %016Lx ", - (unsigned long long) start, - (unsigned long long) end); + printk(KERN_DEBUG "e820: update [mem %#010Lx-%#010Lx] ", + (unsigned long long) start, (unsigned long long) (end - 1)); e820_print_type(old_type); printk(KERN_CONT " ==> "); e820_print_type(new_type); @@ -509,9 +510,8 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type, size = ULLONG_MAX - start; end = start + size; - printk(KERN_DEBUG "e820 remove range: %016Lx - %016Lx ", - (unsigned long long) start, - (unsigned long long) end); + printk(KERN_DEBUG "e820: remove [mem %#010Lx-%#010Lx] ", + (unsigned long long) start, (unsigned long long) (end - 1)); if (checktype) e820_print_type(old_type); printk(KERN_CONT "\n"); @@ -567,7 +567,7 @@ void __init update_e820(void) if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr_map)) return; e820.nr_map = nr_map; - printk(KERN_INFO "modified physical RAM map:\n"); + printk(KERN_INFO "e820: modified physical RAM map:\n"); e820_print_map("modified"); } static void __init update_e820_saved(void) @@ -637,8 +637,8 @@ __init void e820_setup_gap(void) if (!found) { gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024; printk(KERN_ERR - "PCI: Warning: Cannot find a gap in the 32bit address range\n" - "PCI: Unassigned devices with 32bit resource registers may break!\n"); + "e820: cannot find a gap in the 32bit address range\n" + "e820: PCI devices with unassigned 32bit BARs may break!\n"); } #endif @@ -648,8 +648,8 @@ __init void e820_setup_gap(void) pci_mem_start = gapstart; printk(KERN_INFO - "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n", - pci_mem_start, gapstart, gapsize); + "e820: [mem %#010lx-%#010lx] available for PCI devices\n", + gapstart, gapstart + gapsize - 1); } /** @@ -667,7 +667,7 @@ void __init parse_e820_ext(struct setup_data *sdata) extmap = (struct e820entry *)(sdata->data); __append_e820_map(extmap, entries); sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); - printk(KERN_INFO "extended physical RAM map:\n"); + printk(KERN_INFO "e820: extended physical RAM map:\n"); e820_print_map("extended"); } @@ -734,7 +734,7 @@ u64 __init early_reserve_e820(u64 size, u64 align) addr = __memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); if (addr) { e820_update_range_saved(addr, size, E820_RAM, E820_RESERVED); - printk(KERN_INFO "update e820_saved for early_reserve_e820\n"); + printk(KERN_INFO "e820: update e820_saved for early_reserve_e820\n"); update_e820_saved(); } @@ -784,7 +784,7 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type) if (last_pfn > max_arch_pfn) last_pfn = max_arch_pfn; - printk(KERN_INFO "last_pfn = %#lx max_arch_pfn = %#lx\n", + printk(KERN_INFO "e820: last_pfn = %#lx max_arch_pfn = %#lx\n", last_pfn, max_arch_pfn); return last_pfn; } @@ -888,7 +888,7 @@ void __init finish_e820_parsing(void) early_panic("Invalid user supplied memory map"); e820.nr_map = nr; - printk(KERN_INFO "user-defined physical RAM map:\n"); + printk(KERN_INFO "e820: user-defined physical RAM map:\n"); e820_print_map("user"); } } @@ -996,8 +996,9 @@ void __init e820_reserve_resources_late(void) end = MAX_RESOURCE_SIZE; if (start >= end) continue; - printk(KERN_DEBUG "reserve RAM buffer: %016llx - %016llx ", - start, end); + printk(KERN_DEBUG + "e820: reserve RAM buffer [mem %#010llx-%#010llx]\n", + start, end); reserve_region_with_split(&iomem_resource, start, end, "RAM buffer"); } @@ -1047,7 +1048,7 @@ void __init setup_memory_map(void) who = x86_init.resources.memory_setup(); memcpy(&e820_saved, &e820, sizeof(struct e820map)); - printk(KERN_INFO "BIOS-provided physical RAM map:\n"); + printk(KERN_INFO "e820: BIOS-provided physical RAM map:\n"); e820_print_map(who); } diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 51ff18616d50..c18f59d10101 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -14,7 +14,6 @@ #include <asm/sections.h> #include <asm/e820.h> #include <asm/page.h> -#include <asm/trampoline.h> #include <asm/apic.h> #include <asm/io_apic.h> #include <asm/bios_ebda.h> diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 3a3b779f41d3..037df57a99ac 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -24,7 +24,6 @@ #include <asm/sections.h> #include <asm/kdebug.h> #include <asm/e820.h> -#include <asm/trampoline.h> #include <asm/bios_ebda.h> static void __init zap_identity_mappings(void) diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 463c9797ca6a..d42ab17b7397 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -274,10 +274,7 @@ num_subarch_entries = (. - subarch_entries) / 4 * If cpu hotplug is not supported then this code can go in init section * which will be freed later */ - __CPUINIT - -#ifdef CONFIG_SMP ENTRY(startup_32_smp) cld movl $(__BOOT_DS),%eax @@ -288,7 +285,7 @@ ENTRY(startup_32_smp) movl pa(stack_start),%ecx movl %eax,%ss leal -__PAGE_OFFSET(%ecx),%esp -#endif /* CONFIG_SMP */ + default_entry: /* diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 7a40f2447321..94bf9cc2c7ee 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -139,10 +139,6 @@ ident_complete: /* Fixup phys_base */ addq %rbp, phys_base(%rip) - /* Fixup trampoline */ - addq %rbp, trampoline_level4_pgt + 0(%rip) - addq %rbp, trampoline_level4_pgt + (511*8)(%rip) - /* Due to ENTRY(), sometimes the empty space gets filled with * zeros. Better take a jmp than relying on empty space being * filled with 0x90 (nop) diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index ad0de0c2714e..1460a5df92f7 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -94,13 +94,18 @@ static int hpet_verbose; static int __init hpet_setup(char *str) { - if (str) { + while (str) { + char *next = strchr(str, ','); + + if (next) + *next++ = 0; if (!strncmp("disable", str, 7)) boot_hpet_disable = 1; if (!strncmp("force", str, 5)) hpet_force_user = 1; if (!strncmp("verbose", str, 7)) hpet_verbose = 1; + str = next; } return 1; } @@ -319,8 +324,6 @@ static void hpet_set_mode(enum clock_event_mode mode, now = hpet_readl(HPET_COUNTER); cmp = now + (unsigned int) delta; cfg = hpet_readl(HPET_Tn_CFG(timer)); - /* Make sure we use edge triggered interrupts */ - cfg &= ~HPET_TN_LEVEL; cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL | HPET_TN_32BIT; hpet_writel(cfg, HPET_Tn_CFG(timer)); @@ -787,15 +790,16 @@ static int hpet_clocksource_register(void) return 0; } +static u32 *hpet_boot_cfg; + /** * hpet_enable - Try to setup the HPET timer. Returns 1 on success. */ int __init hpet_enable(void) { - unsigned long hpet_period; - unsigned int id; + u32 hpet_period, cfg, id; u64 freq; - int i; + unsigned int i, last; if (!is_hpet_capable()) return 0; @@ -847,15 +851,45 @@ int __init hpet_enable(void) id = hpet_readl(HPET_ID); hpet_print_config(); + last = (id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT; + #ifdef CONFIG_HPET_EMULATE_RTC /* * The legacy routing mode needs at least two channels, tick timer * and the rtc emulation channel. */ - if (!(id & HPET_ID_NUMBER)) + if (!last) goto out_nohpet; #endif + cfg = hpet_readl(HPET_CFG); + hpet_boot_cfg = kmalloc((last + 2) * sizeof(*hpet_boot_cfg), + GFP_KERNEL); + if (hpet_boot_cfg) + *hpet_boot_cfg = cfg; + else + pr_warn("HPET initial state will not be saved\n"); + cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY); + hpet_writel(cfg, HPET_CFG); + if (cfg) + pr_warn("HPET: Unrecognized bits %#x set in global cfg\n", + cfg); + + for (i = 0; i <= last; ++i) { + cfg = hpet_readl(HPET_Tn_CFG(i)); + if (hpet_boot_cfg) + hpet_boot_cfg[i + 1] = cfg; + cfg &= ~(HPET_TN_ENABLE | HPET_TN_LEVEL | HPET_TN_FSB); + hpet_writel(cfg, HPET_Tn_CFG(i)); + cfg &= ~(HPET_TN_PERIODIC | HPET_TN_PERIODIC_CAP + | HPET_TN_64BIT_CAP | HPET_TN_32BIT | HPET_TN_ROUTE + | HPET_TN_FSB | HPET_TN_FSB_CAP); + if (cfg) + pr_warn("HPET: Unrecognized bits %#x set in cfg#%u\n", + cfg, i); + } + hpet_print_config(); + if (hpet_clocksource_register()) goto out_nohpet; @@ -923,14 +957,28 @@ fs_initcall(hpet_late_init); void hpet_disable(void) { if (is_hpet_capable() && hpet_virt_address) { - unsigned int cfg = hpet_readl(HPET_CFG); + unsigned int cfg = hpet_readl(HPET_CFG), id, last; - if (hpet_legacy_int_enabled) { + if (hpet_boot_cfg) + cfg = *hpet_boot_cfg; + else if (hpet_legacy_int_enabled) { cfg &= ~HPET_CFG_LEGACY; hpet_legacy_int_enabled = 0; } cfg &= ~HPET_CFG_ENABLE; hpet_writel(cfg, HPET_CFG); + + if (!hpet_boot_cfg) + return; + + id = hpet_readl(HPET_ID); + last = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT); + + for (id = 0; id <= last; ++id) + hpet_writel(hpet_boot_cfg[id + 1], HPET_Tn_CFG(id)); + + if (*hpet_boot_cfg & HPET_CFG_ENABLE) + hpet_writel(*hpet_boot_cfg, HPET_CFG); } } diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index f8492da65bfc..086eb58c6e80 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -22,6 +22,7 @@ #include <asm/msr.h> #include <asm/apic.h> #include <linux/percpu.h> +#include <linux/hardirq.h> #include <asm/x86_init.h> #include <asm/reboot.h> @@ -114,6 +115,25 @@ static void kvm_get_preset_lpj(void) preset_lpj = lpj; } +bool kvm_check_and_clear_guest_paused(void) +{ + bool ret = false; + struct pvclock_vcpu_time_info *src; + + /* + * per_cpu() is safe here because this function is only called from + * timer functions where preemption is already disabled. + */ + WARN_ON(!in_atomic()); + src = &__get_cpu_var(hv_clock); + if ((src->flags & PVCLOCK_GUEST_STOPPED) != 0) { + __this_cpu_and(hv_clock.flags, ~PVCLOCK_GUEST_STOPPED); + ret = true; + } + + return ret; +} + static struct clocksource kvm_clock = { .name = "kvm-clock", .read = kvm_clock_get_cycles, diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index b02d4dd6b8a3..d2b56489d70f 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -27,7 +27,6 @@ #include <asm/proto.h> #include <asm/bios_ebda.h> #include <asm/e820.h> -#include <asm/trampoline.h> #include <asm/setup.h> #include <asm/smp.h> @@ -568,8 +567,8 @@ static int __init smp_scan_config(unsigned long base, unsigned long length) struct mpf_intel *mpf; unsigned long mem; - apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n", - bp, length); + apic_printk(APIC_VERBOSE, "Scan for SMP in [mem %#010lx-%#010lx]\n", + base, base + length - 1); BUILD_BUG_ON(sizeof(*mpf) != 16); while (length > 0) { @@ -584,8 +583,10 @@ static int __init smp_scan_config(unsigned long base, unsigned long length) #endif mpf_found = mpf; - printk(KERN_INFO "found SMP MP-table at [%p] %llx\n", - mpf, (u64)virt_to_phys(mpf)); + printk(KERN_INFO "found SMP MP-table at [mem %#010llx-%#010llx] mapped at [%p]\n", + (unsigned long long) virt_to_phys(mpf), + (unsigned long long) virt_to_phys(mpf) + + sizeof(*mpf) - 1, mpf); mem = virt_to_phys(mpf); memblock_reserve(mem, sizeof(*mpf)); diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 3003250ac51d..62c9457ccd2f 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -100,14 +100,18 @@ void *dma_generic_alloc_coherent(struct device *dev, size_t size, struct dma_attrs *attrs) { unsigned long dma_mask; - struct page *page; + struct page *page = NULL; + unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT; dma_addr_t addr; dma_mask = dma_alloc_coherent_mask(dev, flag); flag |= __GFP_ZERO; again: - page = alloc_pages_node(dev_to_node(dev), flag, get_order(size)); + if (!(flag & GFP_ATOMIC)) + page = dma_alloc_from_contiguous(dev, count, get_order(size)); + if (!page) + page = alloc_pages_node(dev_to_node(dev), flag, get_order(size)); if (!page) return NULL; @@ -127,6 +131,16 @@ again: return page_address(page); } +void dma_generic_free_coherent(struct device *dev, size_t size, void *vaddr, + dma_addr_t dma_addr, struct dma_attrs *attrs) +{ + unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT; + struct page *page = virt_to_page(vaddr); + + if (!dma_release_from_contiguous(dev, page, count)) + free_pages((unsigned long)vaddr, get_order(size)); +} + /* * See <Documentation/x86/x86_64/boot-options.txt> for the iommu kernel * parameter documentation. diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index f96050685b46..871be4a84c7d 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c @@ -74,12 +74,6 @@ static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg, return nents; } -static void nommu_free_coherent(struct device *dev, size_t size, void *vaddr, - dma_addr_t dma_addr, struct dma_attrs *attrs) -{ - free_pages((unsigned long)vaddr, get_order(size)); -} - static void nommu_sync_single_for_device(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir) @@ -97,7 +91,7 @@ static void nommu_sync_sg_for_device(struct device *dev, struct dma_map_ops nommu_dma_ops = { .alloc = dma_generic_alloc_coherent, - .free = nommu_free_coherent, + .free = dma_generic_free_coherent, .map_sg = nommu_map_sg, .map_page = nommu_map_page, .sync_single_for_device = nommu_sync_single_for_device, diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 77215c23fba1..79c45af81604 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -24,6 +24,7 @@ #ifdef CONFIG_X86_32 # include <linux/ctype.h> # include <linux/mc146818rtc.h> +# include <asm/realmode.h> #else # include <asm/x86_init.h> #endif @@ -156,15 +157,10 @@ static int __init set_bios_reboot(const struct dmi_system_id *d) return 0; } -extern const unsigned char machine_real_restart_asm[]; -extern const u64 machine_real_restart_gdt[3]; - void machine_real_restart(unsigned int type) { - void *restart_va; - unsigned long restart_pa; - void (*restart_lowmem)(unsigned int); - u64 *lowmem_gdt; + void (*restart_lowmem)(unsigned int) = (void (*)(unsigned int)) + real_mode_header->machine_real_restart_asm; local_irq_disable(); @@ -195,21 +191,6 @@ void machine_real_restart(unsigned int type) * too. */ *((unsigned short *)0x472) = reboot_mode; - /* Patch the GDT in the low memory trampoline */ - lowmem_gdt = TRAMPOLINE_SYM(machine_real_restart_gdt); - - restart_va = TRAMPOLINE_SYM(machine_real_restart_asm); - restart_pa = virt_to_phys(restart_va); - restart_lowmem = (void (*)(unsigned int))restart_pa; - - /* GDT[0]: GDT self-pointer */ - lowmem_gdt[0] = - (u64)(sizeof(machine_real_restart_gdt) - 1) + - ((u64)virt_to_phys(lowmem_gdt) << 16); - /* GDT[1]: 64K real mode code segment */ - lowmem_gdt[1] = - GDT_ENTRY(0x009b, restart_pa, 0xffff); - /* Jump to the identity-mapped low memory code */ restart_lowmem(type); } diff --git a/arch/x86/kernel/reboot_32.S b/arch/x86/kernel/reboot_32.S deleted file mode 100644 index 1d5c46df0d78..000000000000 --- a/arch/x86/kernel/reboot_32.S +++ /dev/null @@ -1,135 +0,0 @@ -#include <linux/linkage.h> -#include <linux/init.h> -#include <asm/segment.h> -#include <asm/page_types.h> - -/* - * The following code and data reboots the machine by switching to real - * mode and jumping to the BIOS reset entry point, as if the CPU has - * really been reset. The previous version asked the keyboard - * controller to pulse the CPU reset line, which is more thorough, but - * doesn't work with at least one type of 486 motherboard. It is easy - * to stop this code working; hence the copious comments. - * - * This code is called with the restart type (0 = BIOS, 1 = APM) in %eax. - */ - .section ".x86_trampoline","a" - .balign 16 - .code32 -ENTRY(machine_real_restart_asm) -r_base = . - /* Get our own relocated address */ - call 1f -1: popl %ebx - subl $(1b - r_base), %ebx - - /* Compute the equivalent real-mode segment */ - movl %ebx, %ecx - shrl $4, %ecx - - /* Patch post-real-mode segment jump */ - movw (dispatch_table - r_base)(%ebx,%eax,2),%ax - movw %ax, (101f - r_base)(%ebx) - movw %cx, (102f - r_base)(%ebx) - - /* Set up the IDT for real mode. */ - lidtl (machine_real_restart_idt - r_base)(%ebx) - - /* - * Set up a GDT from which we can load segment descriptors for real - * mode. The GDT is not used in real mode; it is just needed here to - * prepare the descriptors. - */ - lgdtl (machine_real_restart_gdt - r_base)(%ebx) - - /* - * Load the data segment registers with 16-bit compatible values - */ - movl $16, %ecx - movl %ecx, %ds - movl %ecx, %es - movl %ecx, %fs - movl %ecx, %gs - movl %ecx, %ss - ljmpl $8, $1f - r_base - -/* - * This is 16-bit protected mode code to disable paging and the cache, - * switch to real mode and jump to the BIOS reset code. - * - * The instruction that switches to real mode by writing to CR0 must be - * followed immediately by a far jump instruction, which set CS to a - * valid value for real mode, and flushes the prefetch queue to avoid - * running instructions that have already been decoded in protected - * mode. - * - * Clears all the flags except ET, especially PG (paging), PE - * (protected-mode enable) and TS (task switch for coprocessor state - * save). Flushes the TLB after paging has been disabled. Sets CD and - * NW, to disable the cache on a 486, and invalidates the cache. This - * is more like the state of a 486 after reset. I don't know if - * something else should be done for other chips. - * - * More could be done here to set up the registers as if a CPU reset had - * occurred; hopefully real BIOSs don't assume much. This is not the - * actual BIOS entry point, anyway (that is at 0xfffffff0). - * - * Most of this work is probably excessive, but it is what is tested. - */ - .code16 -1: - xorl %ecx, %ecx - movl %cr0, %eax - andl $0x00000011, %eax - orl $0x60000000, %eax - movl %eax, %cr0 - movl %ecx, %cr3 - movl %cr0, %edx - andl $0x60000000, %edx /* If no cache bits -> no wbinvd */ - jz 2f - wbinvd -2: - andb $0x10, %al - movl %eax, %cr0 - .byte 0xea /* ljmpw */ -101: .word 0 /* Offset */ -102: .word 0 /* Segment */ - -bios: - ljmpw $0xf000, $0xfff0 - -apm: - movw $0x1000, %ax - movw %ax, %ss - movw $0xf000, %sp - movw $0x5307, %ax - movw $0x0001, %bx - movw $0x0003, %cx - int $0x15 - -END(machine_real_restart_asm) - - .balign 16 - /* These must match <asm/reboot.h */ -dispatch_table: - .word bios - r_base - .word apm - r_base -END(dispatch_table) - - .balign 16 -machine_real_restart_idt: - .word 0xffff /* Length - real mode default value */ - .long 0 /* Base - real mode default value */ -END(machine_real_restart_idt) - - .balign 16 -ENTRY(machine_real_restart_gdt) - .quad 0 /* Self-pointer, filled in by PM code */ - .quad 0 /* 16-bit code segment, filled in by PM code */ - /* - * 16-bit data segment with the selector value 16 = 0x10 and - * base value 0x100; since this is consistent with real mode - * semantics we don't have to reload the segments once CR0.PE = 0. - */ - .quad GDT_ENTRY(0x0093, 0x100, 0xffff) -END(machine_real_restart_gdt) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 366c688d619e..16be6dc14db1 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -49,6 +49,7 @@ #include <asm/pci-direct.h> #include <linux/init_ohci1394_dma.h> #include <linux/kvm_para.h> +#include <linux/dma-contiguous.h> #include <linux/errno.h> #include <linux/kernel.h> @@ -72,7 +73,7 @@ #include <asm/mtrr.h> #include <asm/apic.h> -#include <asm/trampoline.h> +#include <asm/realmode.h> #include <asm/e820.h> #include <asm/mpspec.h> #include <asm/setup.h> @@ -333,8 +334,8 @@ static void __init relocate_initrd(void) memblock_reserve(ramdisk_here, area_size); initrd_start = ramdisk_here + PAGE_OFFSET; initrd_end = initrd_start + ramdisk_size; - printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n", - ramdisk_here, ramdisk_here + ramdisk_size); + printk(KERN_INFO "Allocated new RAMDISK: [mem %#010llx-%#010llx]\n", + ramdisk_here, ramdisk_here + ramdisk_size - 1); q = (char *)initrd_start; @@ -365,8 +366,8 @@ static void __init relocate_initrd(void) /* high pages is not converted by early_res_to_bootmem */ ramdisk_image = boot_params.hdr.ramdisk_image; ramdisk_size = boot_params.hdr.ramdisk_size; - printk(KERN_INFO "Move RAMDISK from %016llx - %016llx to" - " %08llx - %08llx\n", + printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to" + " [mem %#010llx-%#010llx]\n", ramdisk_image, ramdisk_image + ramdisk_size - 1, ramdisk_here, ramdisk_here + ramdisk_size - 1); } @@ -391,8 +392,8 @@ static void __init reserve_initrd(void) ramdisk_size, end_of_lowmem>>1); } - printk(KERN_INFO "RAMDISK: %08llx - %08llx\n", ramdisk_image, - ramdisk_end); + printk(KERN_INFO "RAMDISK: [mem %#010llx-%#010llx]\n", ramdisk_image, + ramdisk_end - 1); if (ramdisk_end <= end_of_lowmem) { @@ -905,10 +906,10 @@ void __init setup_arch(char **cmdline_p) setup_bios_corruption_check(); #endif - printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n", - max_pfn_mapped<<PAGE_SHIFT); + printk(KERN_DEBUG "initial memory mapped: [mem 0x00000000-%#010lx]\n", + (max_pfn_mapped<<PAGE_SHIFT) - 1); - setup_trampolines(); + setup_real_mode(); init_gbpages(); @@ -925,6 +926,7 @@ void __init setup_arch(char **cmdline_p) } #endif memblock.current_limit = get_max_mapped(); + dma_contiguous_reserve(0); /* * NOTE: On x86-32, only from this point on, fixmaps are ready for use. @@ -966,6 +968,8 @@ void __init setup_arch(char **cmdline_p) if (boot_cpu_data.cpuid_level >= 0) { /* A CPU has %cr4 if and only if it has CPUID */ mmu_cr4_features = read_cr4(); + if (trampoline_cr4_features) + *trampoline_cr4_features = mmu_cr4_features; } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 9363b58b967c..2e937a5ad531 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -18,6 +18,7 @@ #include <linux/personality.h> #include <linux/uaccess.h> #include <linux/user-return-notifier.h> +#include <linux/uprobes.h> #include <asm/processor.h> #include <asm/ucontext.h> @@ -814,6 +815,11 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) mce_notify_process(); #endif /* CONFIG_X86_64 && CONFIG_X86_MCE */ + if (thread_info_flags & _TIF_UPROBE) { + clear_thread_flag(TIF_UPROBE); + uprobe_notify_resume(regs); + } + /* deal with pending signal delivery */ if (thread_info_flags & _TIF_SIGPENDING) do_signal(regs); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 433529e29be4..f56f96da77f5 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -57,7 +57,7 @@ #include <asm/nmi.h> #include <asm/irq.h> #include <asm/idle.h> -#include <asm/trampoline.h> +#include <asm/realmode.h> #include <asm/cpu.h> #include <asm/numa.h> #include <asm/pgtable.h> @@ -73,6 +73,8 @@ #include <asm/smpboot_hooks.h> #include <asm/i8259.h> +#include <asm/realmode.h> + /* State of each CPU */ DEFINE_PER_CPU(int, cpu_state) = { 0 }; @@ -660,8 +662,12 @@ static void __cpuinit announce_cpu(int cpu, int apicid) */ static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle) { + volatile u32 *trampoline_status = + (volatile u32 *) __va(real_mode_header->trampoline_status); + /* start_ip had better be page-aligned! */ + unsigned long start_ip = real_mode_header->trampoline_start; + unsigned long boot_error = 0; - unsigned long start_ip; int timeout; alternatives_smp_switch(1); @@ -684,9 +690,6 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle) initial_code = (unsigned long)start_secondary; stack_start = idle->thread.sp; - /* start_ip had better be page-aligned! */ - start_ip = trampoline_address(); - /* So we see what's up */ announce_cpu(cpu, apicid); @@ -749,8 +752,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle) pr_debug("CPU%d: has booted.\n", cpu); } else { boot_error = 1; - if (*(volatile u32 *)TRAMPOLINE_SYM(trampoline_status) - == 0xA5A5A5A5) + if (*trampoline_status == 0xA5A5A5A5) /* trampoline started but...? */ pr_err("CPU%d: Stuck ??\n", cpu); else @@ -776,7 +778,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle) } /* mark "stuck" area as not stuck */ - *(volatile u32 *)TRAMPOLINE_SYM(trampoline_status) = 0; + *trampoline_status = 0; if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { /* diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index 6410744ac5cb..f84fe00fad48 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -32,7 +32,7 @@ #include <linux/mm.h> #include <linux/tboot.h> -#include <asm/trampoline.h> +#include <asm/realmode.h> #include <asm/processor.h> #include <asm/bootparam.h> #include <asm/pgtable.h> @@ -44,7 +44,7 @@ #include <asm/e820.h> #include <asm/io.h> -#include "acpi/realmode/wakeup.h" +#include "../realmode/rm/wakeup.h" /* Global pointer to shared data; NULL means no measured launch. */ struct tboot *tboot __read_mostly; @@ -201,7 +201,8 @@ static int tboot_setup_sleep(void) add_mac_region(e820.map[i].addr, e820.map[i].size); } - tboot->acpi_sinfo.kernel_s3_resume_vector = acpi_wakeup_address; + tboot->acpi_sinfo.kernel_s3_resume_vector = + real_mode_header->wakeup_start; return 0; } diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c deleted file mode 100644 index a73b61055ad6..000000000000 --- a/arch/x86/kernel/trampoline.c +++ /dev/null @@ -1,42 +0,0 @@ -#include <linux/io.h> -#include <linux/memblock.h> - -#include <asm/trampoline.h> -#include <asm/cacheflush.h> -#include <asm/pgtable.h> - -unsigned char *x86_trampoline_base; - -void __init setup_trampolines(void) -{ - phys_addr_t mem; - size_t size = PAGE_ALIGN(x86_trampoline_end - x86_trampoline_start); - - /* Has to be in very low memory so we can execute real-mode AP code. */ - mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE); - if (!mem) - panic("Cannot allocate trampoline\n"); - - x86_trampoline_base = __va(mem); - memblock_reserve(mem, size); - - printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n", - x86_trampoline_base, (unsigned long long)mem, size); - - memcpy(x86_trampoline_base, x86_trampoline_start, size); -} - -/* - * setup_trampolines() gets called very early, to guarantee the - * availability of low memory. This is before the proper kernel page - * tables are set up, so we cannot set page permissions in that - * function. Thus, we use an arch_initcall instead. - */ -static int __init configure_trampolines(void) -{ - size_t size = PAGE_ALIGN(x86_trampoline_end - x86_trampoline_start); - - set_memory_x((unsigned long)x86_trampoline_base, size >> PAGE_SHIFT); - return 0; -} -arch_initcall(configure_trampolines); diff --git a/arch/x86/kernel/trampoline_32.S b/arch/x86/kernel/trampoline_32.S deleted file mode 100644 index 451c0a7ef7fd..000000000000 --- a/arch/x86/kernel/trampoline_32.S +++ /dev/null @@ -1,83 +0,0 @@ -/* - * - * Trampoline.S Derived from Setup.S by Linus Torvalds - * - * 4 Jan 1997 Michael Chastain: changed to gnu as. - * - * This is only used for booting secondary CPUs in SMP machine - * - * Entry: CS:IP point to the start of our code, we are - * in real mode with no stack, but the rest of the - * trampoline page to make our stack and everything else - * is a mystery. - * - * We jump into arch/x86/kernel/head_32.S. - * - * On entry to trampoline_data, the processor is in real mode - * with 16-bit addressing and 16-bit data. CS has some value - * and IP is zero. Thus, data addresses need to be absolute - * (no relocation) and are taken with regard to r_base. - * - * If you work on this file, check the object module with - * objdump --reloc to make sure there are no relocation - * entries except for: - * - * TYPE VALUE - * R_386_32 startup_32_smp - * R_386_32 boot_gdt - */ - -#include <linux/linkage.h> -#include <linux/init.h> -#include <asm/segment.h> -#include <asm/page_types.h> - -#ifdef CONFIG_SMP - - .section ".x86_trampoline","a" - .balign PAGE_SIZE - .code16 - -ENTRY(trampoline_data) -r_base = . - wbinvd # Needed for NUMA-Q should be harmless for others - mov %cs, %ax # Code and data in the same place - mov %ax, %ds - - cli # We should be safe anyway - - movl $0xA5A5A5A5, trampoline_status - r_base - # write marker for master knows we're running - - /* GDT tables in non default location kernel can be beyond 16MB and - * lgdt will not be able to load the address as in real mode default - * operand size is 16bit. Use lgdtl instead to force operand size - * to 32 bit. - */ - - lidtl boot_idt_descr - r_base # load idt with 0, 0 - lgdtl boot_gdt_descr - r_base # load gdt with whatever is appropriate - - xor %ax, %ax - inc %ax # protected mode (PE) bit - lmsw %ax # into protected mode - # flush prefetch and jump to startup_32_smp in arch/i386/kernel/head.S - ljmpl $__BOOT_CS, $(startup_32_smp-__PAGE_OFFSET) - - # These need to be in the same 64K segment as the above; - # hence we don't use the boot_gdt_descr defined in head.S -boot_gdt_descr: - .word __BOOT_DS + 7 # gdt limit - .long boot_gdt - __PAGE_OFFSET # gdt base - -boot_idt_descr: - .word 0 # idt limit = 0 - .long 0 # idt base = 0L - -ENTRY(trampoline_status) - .long 0 - -.globl trampoline_end -trampoline_end: - -#endif /* CONFIG_SMP */ diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S deleted file mode 100644 index 09ff51799e96..000000000000 --- a/arch/x86/kernel/trampoline_64.S +++ /dev/null @@ -1,171 +0,0 @@ -/* - * - * Trampoline.S Derived from Setup.S by Linus Torvalds - * - * 4 Jan 1997 Michael Chastain: changed to gnu as. - * 15 Sept 2005 Eric Biederman: 64bit PIC support - * - * Entry: CS:IP point to the start of our code, we are - * in real mode with no stack, but the rest of the - * trampoline page to make our stack and everything else - * is a mystery. - * - * On entry to trampoline_data, the processor is in real mode - * with 16-bit addressing and 16-bit data. CS has some value - * and IP is zero. Thus, data addresses need to be absolute - * (no relocation) and are taken with regard to r_base. - * - * With the addition of trampoline_level4_pgt this code can - * now enter a 64bit kernel that lives at arbitrary 64bit - * physical addresses. - * - * If you work on this file, check the object module with objdump - * --full-contents --reloc to make sure there are no relocation - * entries. - */ - -#include <linux/linkage.h> -#include <linux/init.h> -#include <asm/pgtable_types.h> -#include <asm/page_types.h> -#include <asm/msr.h> -#include <asm/segment.h> -#include <asm/processor-flags.h> - - .section ".x86_trampoline","a" - .balign PAGE_SIZE - .code16 - -ENTRY(trampoline_data) -r_base = . - cli # We should be safe anyway - wbinvd - mov %cs, %ax # Code and data in the same place - mov %ax, %ds - mov %ax, %es - mov %ax, %ss - - - movl $0xA5A5A5A5, trampoline_status - r_base - # write marker for master knows we're running - - # Setup stack - movw $(trampoline_stack_end - r_base), %sp - - call verify_cpu # Verify the cpu supports long mode - testl %eax, %eax # Check for return code - jnz no_longmode - - mov %cs, %ax - movzx %ax, %esi # Find the 32bit trampoline location - shll $4, %esi - - # Fixup the absolute vectors - leal (startup_32 - r_base)(%esi), %eax - movl %eax, startup_32_vector - r_base - leal (startup_64 - r_base)(%esi), %eax - movl %eax, startup_64_vector - r_base - leal (tgdt - r_base)(%esi), %eax - movl %eax, (tgdt + 2 - r_base) - - /* - * GDT tables in non default location kernel can be beyond 16MB and - * lgdt will not be able to load the address as in real mode default - * operand size is 16bit. Use lgdtl instead to force operand size - * to 32 bit. - */ - - lidtl tidt - r_base # load idt with 0, 0 - lgdtl tgdt - r_base # load gdt with whatever is appropriate - - mov $X86_CR0_PE, %ax # protected mode (PE) bit - lmsw %ax # into protected mode - - # flush prefetch and jump to startup_32 - ljmpl *(startup_32_vector - r_base) - - .code32 - .balign 4 -startup_32: - movl $__KERNEL_DS, %eax # Initialize the %ds segment register - movl %eax, %ds - - movl $X86_CR4_PAE, %eax - movl %eax, %cr4 # Enable PAE mode - - # Setup trampoline 4 level pagetables - leal (trampoline_level4_pgt - r_base)(%esi), %eax - movl %eax, %cr3 - - movl $MSR_EFER, %ecx - movl $(1 << _EFER_LME), %eax # Enable Long Mode - xorl %edx, %edx - wrmsr - - # Enable paging and in turn activate Long Mode - # Enable protected mode - movl $(X86_CR0_PG | X86_CR0_PE), %eax - movl %eax, %cr0 - - /* - * At this point we're in long mode but in 32bit compatibility mode - * with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn - * EFER.LMA = 1). Now we want to jump in 64bit mode, to do that we use - * the new gdt/idt that has __KERNEL_CS with CS.L = 1. - */ - ljmp *(startup_64_vector - r_base)(%esi) - - .code64 - .balign 4 -startup_64: - # Now jump into the kernel using virtual addresses - movq $secondary_startup_64, %rax - jmp *%rax - - .code16 -no_longmode: - hlt - jmp no_longmode -#include "verify_cpu.S" - - .balign 4 - # Careful these need to be in the same 64K segment as the above; -tidt: - .word 0 # idt limit = 0 - .word 0, 0 # idt base = 0L - - # Duplicate the global descriptor table - # so the kernel can live anywhere - .balign 4 -tgdt: - .short tgdt_end - tgdt # gdt limit - .long tgdt - r_base - .short 0 - .quad 0x00cf9b000000ffff # __KERNEL32_CS - .quad 0x00af9b000000ffff # __KERNEL_CS - .quad 0x00cf93000000ffff # __KERNEL_DS -tgdt_end: - - .balign 4 -startup_32_vector: - .long startup_32 - r_base - .word __KERNEL32_CS, 0 - - .balign 4 -startup_64_vector: - .long startup_64 - r_base - .word __KERNEL_CS, 0 - - .balign 4 -ENTRY(trampoline_status) - .long 0 - -trampoline_stack: - .org 0x1000 -trampoline_stack_end: -ENTRY(trampoline_level4_pgt) - .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE - .fill 510,8,0 - .quad level3_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE - -ENTRY(trampoline_end) diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c new file mode 100644 index 000000000000..dc4e910a7d96 --- /dev/null +++ b/arch/x86/kernel/uprobes.c @@ -0,0 +1,674 @@ +/* + * User-space Probes (UProbes) for x86 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2008-2011 + * Authors: + * Srikar Dronamraju + * Jim Keniston + */ +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/ptrace.h> +#include <linux/uprobes.h> +#include <linux/uaccess.h> + +#include <linux/kdebug.h> +#include <asm/processor.h> +#include <asm/insn.h> + +/* Post-execution fixups. */ + +/* No fixup needed */ +#define UPROBE_FIX_NONE 0x0 + +/* Adjust IP back to vicinity of actual insn */ +#define UPROBE_FIX_IP 0x1 + +/* Adjust the return address of a call insn */ +#define UPROBE_FIX_CALL 0x2 + +#define UPROBE_FIX_RIP_AX 0x8000 +#define UPROBE_FIX_RIP_CX 0x4000 + +#define UPROBE_TRAP_NR UINT_MAX + +/* Adaptations for mhiramat x86 decoder v14. */ +#define OPCODE1(insn) ((insn)->opcode.bytes[0]) +#define OPCODE2(insn) ((insn)->opcode.bytes[1]) +#define OPCODE3(insn) ((insn)->opcode.bytes[2]) +#define MODRM_REG(insn) X86_MODRM_REG(insn->modrm.value) + +#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\ + (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ + (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \ + (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \ + (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \ + << (row % 32)) + +/* + * Good-instruction tables for 32-bit apps. This is non-const and volatile + * to keep gcc from statically optimizing it out, as variable_test_bit makes + * some versions of gcc to think only *(unsigned long*) is used. + */ +static volatile u32 good_insns_32[256 / 32] = { + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* ---------------------------------------------- */ + W(0x00, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) | /* 00 */ + W(0x10, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) , /* 10 */ + W(0x20, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1) | /* 20 */ + W(0x30, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1) , /* 30 */ + W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */ + W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */ + W(0x60, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */ + W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */ + W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */ + W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */ + W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */ + W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */ + W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */ + W(0xd0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ + W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* e0 */ + W(0xf0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1) /* f0 */ + /* ---------------------------------------------- */ + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ +}; + +/* Using this for both 64-bit and 32-bit apps */ +static volatile u32 good_2byte_insns[256 / 32] = { + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* ---------------------------------------------- */ + W(0x00, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1) | /* 00 */ + W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* 10 */ + W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */ + W(0x30, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */ + W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */ + W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */ + W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 60 */ + W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) , /* 70 */ + W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */ + W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */ + W(0xa0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1) | /* a0 */ + W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* b0 */ + W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */ + W(0xd0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ + W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* e0 */ + W(0xf0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0) /* f0 */ + /* ---------------------------------------------- */ + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ +}; + +#ifdef CONFIG_X86_64 +/* Good-instruction tables for 64-bit apps */ +static volatile u32 good_insns_64[256 / 32] = { + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* ---------------------------------------------- */ + W(0x00, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) | /* 00 */ + W(0x10, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) , /* 10 */ + W(0x20, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) | /* 20 */ + W(0x30, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) , /* 30 */ + W(0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 40 */ + W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */ + W(0x60, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */ + W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */ + W(0x80, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */ + W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */ + W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */ + W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */ + W(0xc0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */ + W(0xd0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ + W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* e0 */ + W(0xf0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1) /* f0 */ + /* ---------------------------------------------- */ + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ +}; +#endif +#undef W + +/* + * opcodes we'll probably never support: + * + * 6c-6d, e4-e5, ec-ed - in + * 6e-6f, e6-e7, ee-ef - out + * cc, cd - int3, int + * cf - iret + * d6 - illegal instruction + * f1 - int1/icebp + * f4 - hlt + * fa, fb - cli, sti + * 0f - lar, lsl, syscall, clts, sysret, sysenter, sysexit, invd, wbinvd, ud2 + * + * invalid opcodes in 64-bit mode: + * + * 06, 0e, 16, 1e, 27, 2f, 37, 3f, 60-62, 82, c4-c5, d4-d5 + * 63 - we support this opcode in x86_64 but not in i386. + * + * opcodes we may need to refine support for: + * + * 0f - 2-byte instructions: For many of these instructions, the validity + * depends on the prefix and/or the reg field. On such instructions, we + * just consider the opcode combination valid if it corresponds to any + * valid instruction. + * + * 8f - Group 1 - only reg = 0 is OK + * c6-c7 - Group 11 - only reg = 0 is OK + * d9-df - fpu insns with some illegal encodings + * f2, f3 - repnz, repz prefixes. These are also the first byte for + * certain floating-point instructions, such as addsd. + * + * fe - Group 4 - only reg = 0 or 1 is OK + * ff - Group 5 - only reg = 0-6 is OK + * + * others -- Do we need to support these? + * + * 0f - (floating-point?) prefetch instructions + * 07, 17, 1f - pop es, pop ss, pop ds + * 26, 2e, 36, 3e - es:, cs:, ss:, ds: segment prefixes -- + * but 64 and 65 (fs: and gs:) seem to be used, so we support them + * 67 - addr16 prefix + * ce - into + * f0 - lock prefix + */ + +/* + * TODO: + * - Where necessary, examine the modrm byte and allow only valid instructions + * in the different Groups and fpu instructions. + */ + +static bool is_prefix_bad(struct insn *insn) +{ + int i; + + for (i = 0; i < insn->prefixes.nbytes; i++) { + switch (insn->prefixes.bytes[i]) { + case 0x26: /* INAT_PFX_ES */ + case 0x2E: /* INAT_PFX_CS */ + case 0x36: /* INAT_PFX_DS */ + case 0x3E: /* INAT_PFX_SS */ + case 0xF0: /* INAT_PFX_LOCK */ + return true; + } + } + return false; +} + +static int validate_insn_32bits(struct arch_uprobe *auprobe, struct insn *insn) +{ + insn_init(insn, auprobe->insn, false); + + /* Skip good instruction prefixes; reject "bad" ones. */ + insn_get_opcode(insn); + if (is_prefix_bad(insn)) + return -ENOTSUPP; + + if (test_bit(OPCODE1(insn), (unsigned long *)good_insns_32)) + return 0; + + if (insn->opcode.nbytes == 2) { + if (test_bit(OPCODE2(insn), (unsigned long *)good_2byte_insns)) + return 0; + } + + return -ENOTSUPP; +} + +/* + * Figure out which fixups arch_uprobe_post_xol() will need to perform, and + * annotate arch_uprobe->fixups accordingly. To start with, + * arch_uprobe->fixups is either zero or it reflects rip-related fixups. + */ +static void prepare_fixups(struct arch_uprobe *auprobe, struct insn *insn) +{ + bool fix_ip = true, fix_call = false; /* defaults */ + int reg; + + insn_get_opcode(insn); /* should be a nop */ + + switch (OPCODE1(insn)) { + case 0xc3: /* ret/lret */ + case 0xcb: + case 0xc2: + case 0xca: + /* ip is correct */ + fix_ip = false; + break; + case 0xe8: /* call relative - Fix return addr */ + fix_call = true; + break; + case 0x9a: /* call absolute - Fix return addr, not ip */ + fix_call = true; + fix_ip = false; + break; + case 0xff: + insn_get_modrm(insn); + reg = MODRM_REG(insn); + if (reg == 2 || reg == 3) { + /* call or lcall, indirect */ + /* Fix return addr; ip is correct. */ + fix_call = true; + fix_ip = false; + } else if (reg == 4 || reg == 5) { + /* jmp or ljmp, indirect */ + /* ip is correct. */ + fix_ip = false; + } + break; + case 0xea: /* jmp absolute -- ip is correct */ + fix_ip = false; + break; + default: + break; + } + if (fix_ip) + auprobe->fixups |= UPROBE_FIX_IP; + if (fix_call) + auprobe->fixups |= UPROBE_FIX_CALL; +} + +#ifdef CONFIG_X86_64 +/* + * If arch_uprobe->insn doesn't use rip-relative addressing, return + * immediately. Otherwise, rewrite the instruction so that it accesses + * its memory operand indirectly through a scratch register. Set + * arch_uprobe->fixups and arch_uprobe->rip_rela_target_address + * accordingly. (The contents of the scratch register will be saved + * before we single-step the modified instruction, and restored + * afterward.) + * + * We do this because a rip-relative instruction can access only a + * relatively small area (+/- 2 GB from the instruction), and the XOL + * area typically lies beyond that area. At least for instructions + * that store to memory, we can't execute the original instruction + * and "fix things up" later, because the misdirected store could be + * disastrous. + * + * Some useful facts about rip-relative instructions: + * + * - There's always a modrm byte. + * - There's never a SIB byte. + * - The displacement is always 4 bytes. + */ +static void +handle_riprel_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn) +{ + u8 *cursor; + u8 reg; + + if (mm->context.ia32_compat) + return; + + auprobe->rip_rela_target_address = 0x0; + if (!insn_rip_relative(insn)) + return; + + /* + * insn_rip_relative() would have decoded rex_prefix, modrm. + * Clear REX.b bit (extension of MODRM.rm field): + * we want to encode rax/rcx, not r8/r9. + */ + if (insn->rex_prefix.nbytes) { + cursor = auprobe->insn + insn_offset_rex_prefix(insn); + *cursor &= 0xfe; /* Clearing REX.B bit */ + } + + /* + * Point cursor at the modrm byte. The next 4 bytes are the + * displacement. Beyond the displacement, for some instructions, + * is the immediate operand. + */ + cursor = auprobe->insn + insn_offset_modrm(insn); + insn_get_length(insn); + + /* + * Convert from rip-relative addressing to indirect addressing + * via a scratch register. Change the r/m field from 0x5 (%rip) + * to 0x0 (%rax) or 0x1 (%rcx), and squeeze out the offset field. + */ + reg = MODRM_REG(insn); + if (reg == 0) { + /* + * The register operand (if any) is either the A register + * (%rax, %eax, etc.) or (if the 0x4 bit is set in the + * REX prefix) %r8. In any case, we know the C register + * is NOT the register operand, so we use %rcx (register + * #1) for the scratch register. + */ + auprobe->fixups = UPROBE_FIX_RIP_CX; + /* Change modrm from 00 000 101 to 00 000 001. */ + *cursor = 0x1; + } else { + /* Use %rax (register #0) for the scratch register. */ + auprobe->fixups = UPROBE_FIX_RIP_AX; + /* Change modrm from 00 xxx 101 to 00 xxx 000 */ + *cursor = (reg << 3); + } + + /* Target address = address of next instruction + (signed) offset */ + auprobe->rip_rela_target_address = (long)insn->length + insn->displacement.value; + + /* Displacement field is gone; slide immediate field (if any) over. */ + if (insn->immediate.nbytes) { + cursor++; + memmove(cursor, cursor + insn->displacement.nbytes, insn->immediate.nbytes); + } + return; +} + +static int validate_insn_64bits(struct arch_uprobe *auprobe, struct insn *insn) +{ + insn_init(insn, auprobe->insn, true); + + /* Skip good instruction prefixes; reject "bad" ones. */ + insn_get_opcode(insn); + if (is_prefix_bad(insn)) + return -ENOTSUPP; + + if (test_bit(OPCODE1(insn), (unsigned long *)good_insns_64)) + return 0; + + if (insn->opcode.nbytes == 2) { + if (test_bit(OPCODE2(insn), (unsigned long *)good_2byte_insns)) + return 0; + } + return -ENOTSUPP; +} + +static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn) +{ + if (mm->context.ia32_compat) + return validate_insn_32bits(auprobe, insn); + return validate_insn_64bits(auprobe, insn); +} +#else /* 32-bit: */ +static void handle_riprel_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn) +{ + /* No RIP-relative addressing on 32-bit */ +} + +static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn) +{ + return validate_insn_32bits(auprobe, insn); +} +#endif /* CONFIG_X86_64 */ + +/** + * arch_uprobe_analyze_insn - instruction analysis including validity and fixups. + * @mm: the probed address space. + * @arch_uprobe: the probepoint information. + * Return 0 on success or a -ve number on error. + */ +int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm) +{ + int ret; + struct insn insn; + + auprobe->fixups = 0; + ret = validate_insn_bits(auprobe, mm, &insn); + if (ret != 0) + return ret; + + handle_riprel_insn(auprobe, mm, &insn); + prepare_fixups(auprobe, &insn); + + return 0; +} + +#ifdef CONFIG_X86_64 +/* + * If we're emulating a rip-relative instruction, save the contents + * of the scratch register and store the target address in that register. + */ +static void +pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs, + struct arch_uprobe_task *autask) +{ + if (auprobe->fixups & UPROBE_FIX_RIP_AX) { + autask->saved_scratch_register = regs->ax; + regs->ax = current->utask->vaddr; + regs->ax += auprobe->rip_rela_target_address; + } else if (auprobe->fixups & UPROBE_FIX_RIP_CX) { + autask->saved_scratch_register = regs->cx; + regs->cx = current->utask->vaddr; + regs->cx += auprobe->rip_rela_target_address; + } +} +#else +static void +pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs, + struct arch_uprobe_task *autask) +{ + /* No RIP-relative addressing on 32-bit */ +} +#endif + +/* + * arch_uprobe_pre_xol - prepare to execute out of line. + * @auprobe: the probepoint information. + * @regs: reflects the saved user state of current task. + */ +int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + struct arch_uprobe_task *autask; + + autask = ¤t->utask->autask; + autask->saved_trap_nr = current->thread.trap_nr; + current->thread.trap_nr = UPROBE_TRAP_NR; + regs->ip = current->utask->xol_vaddr; + pre_xol_rip_insn(auprobe, regs, autask); + + return 0; +} + +/* + * This function is called by arch_uprobe_post_xol() to adjust the return + * address pushed by a call instruction executed out of line. + */ +static int adjust_ret_addr(unsigned long sp, long correction) +{ + int rasize, ncopied; + long ra = 0; + + if (is_ia32_task()) + rasize = 4; + else + rasize = 8; + + ncopied = copy_from_user(&ra, (void __user *)sp, rasize); + if (unlikely(ncopied)) + return -EFAULT; + + ra += correction; + ncopied = copy_to_user((void __user *)sp, &ra, rasize); + if (unlikely(ncopied)) + return -EFAULT; + + return 0; +} + +#ifdef CONFIG_X86_64 +static bool is_riprel_insn(struct arch_uprobe *auprobe) +{ + return ((auprobe->fixups & (UPROBE_FIX_RIP_AX | UPROBE_FIX_RIP_CX)) != 0); +} + +static void +handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, long *correction) +{ + if (is_riprel_insn(auprobe)) { + struct arch_uprobe_task *autask; + + autask = ¤t->utask->autask; + if (auprobe->fixups & UPROBE_FIX_RIP_AX) + regs->ax = autask->saved_scratch_register; + else + regs->cx = autask->saved_scratch_register; + + /* + * The original instruction includes a displacement, and so + * is 4 bytes longer than what we've just single-stepped. + * Fall through to handle stuff like "jmpq *...(%rip)" and + * "callq *...(%rip)". + */ + if (correction) + *correction += 4; + } +} +#else +static void +handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, long *correction) +{ + /* No RIP-relative addressing on 32-bit */ +} +#endif + +/* + * If xol insn itself traps and generates a signal(Say, + * SIGILL/SIGSEGV/etc), then detect the case where a singlestepped + * instruction jumps back to its own address. It is assumed that anything + * like do_page_fault/do_trap/etc sets thread.trap_nr != -1. + * + * arch_uprobe_pre_xol/arch_uprobe_post_xol save/restore thread.trap_nr, + * arch_uprobe_xol_was_trapped() simply checks that ->trap_nr is not equal to + * UPROBE_TRAP_NR == -1 set by arch_uprobe_pre_xol(). + */ +bool arch_uprobe_xol_was_trapped(struct task_struct *t) +{ + if (t->thread.trap_nr != UPROBE_TRAP_NR) + return true; + + return false; +} + +/* + * Called after single-stepping. To avoid the SMP problems that can + * occur when we temporarily put back the original opcode to + * single-step, we single-stepped a copy of the instruction. + * + * This function prepares to resume execution after the single-step. + * We have to fix things up as follows: + * + * Typically, the new ip is relative to the copied instruction. We need + * to make it relative to the original instruction (FIX_IP). Exceptions + * are return instructions and absolute or indirect jump or call instructions. + * + * If the single-stepped instruction was a call, the return address that + * is atop the stack is the address following the copied instruction. We + * need to make it the address following the original instruction (FIX_CALL). + * + * If the original instruction was a rip-relative instruction such as + * "movl %edx,0xnnnn(%rip)", we have instead executed an equivalent + * instruction using a scratch register -- e.g., "movl %edx,(%rax)". + * We need to restore the contents of the scratch register and adjust + * the ip, keeping in mind that the instruction we executed is 4 bytes + * shorter than the original instruction (since we squeezed out the offset + * field). (FIX_RIP_AX or FIX_RIP_CX) + */ +int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + struct uprobe_task *utask; + long correction; + int result = 0; + + WARN_ON_ONCE(current->thread.trap_nr != UPROBE_TRAP_NR); + + utask = current->utask; + current->thread.trap_nr = utask->autask.saved_trap_nr; + correction = (long)(utask->vaddr - utask->xol_vaddr); + handle_riprel_post_xol(auprobe, regs, &correction); + if (auprobe->fixups & UPROBE_FIX_IP) + regs->ip += correction; + + if (auprobe->fixups & UPROBE_FIX_CALL) + result = adjust_ret_addr(regs->sp, correction); + + return result; +} + +/* callback routine for handling exceptions. */ +int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, void *data) +{ + struct die_args *args = data; + struct pt_regs *regs = args->regs; + int ret = NOTIFY_DONE; + + /* We are only interested in userspace traps */ + if (regs && !user_mode_vm(regs)) + return NOTIFY_DONE; + + switch (val) { + case DIE_INT3: + if (uprobe_pre_sstep_notifier(regs)) + ret = NOTIFY_STOP; + + break; + + case DIE_DEBUG: + if (uprobe_post_sstep_notifier(regs)) + ret = NOTIFY_STOP; + + default: + break; + } + + return ret; +} + +/* + * This function gets called when XOL instruction either gets trapped or + * the thread has a fatal signal, so reset the instruction pointer to its + * probed address. + */ +void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + struct uprobe_task *utask = current->utask; + + current->thread.trap_nr = utask->autask.saved_trap_nr; + handle_riprel_post_xol(auprobe, regs, NULL); + instruction_pointer_set(regs, utask->vaddr); +} + +/* + * Skip these instructions as per the currently known x86 ISA. + * 0x66* { 0x90 | 0x0f 0x1f | 0x0f 0x19 | 0x87 0xc0 } + */ +bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + int i; + + for (i = 0; i < MAX_UINSN_BYTES; i++) { + if ((auprobe->insn[i] == 0x66)) + continue; + + if (auprobe->insn[i] == 0x90) + return true; + + if (i == (MAX_UINSN_BYTES - 1)) + break; + + if ((auprobe->insn[i] == 0x0f) && (auprobe->insn[i+1] == 0x1f)) + return true; + + if ((auprobe->insn[i] == 0x0f) && (auprobe->insn[i+1] == 0x19)) + return true; + + if ((auprobe->insn[i] == 0x87) && (auprobe->insn[i+1] == 0xc0)) + return true; + + break; + } + return false; +} diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 0f703f10901a..22a1530146a8 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -197,18 +197,6 @@ SECTIONS INIT_DATA_SECTION(16) - /* - * Code and data for a variety of lowlevel trampolines, to be - * copied into base memory (< 1 MiB) during initialization. - * Since it is copied early, the main copy can be discarded - * afterwards. - */ - .x86_trampoline : AT(ADDR(.x86_trampoline) - LOAD_OFFSET) { - x86_trampoline_start = .; - *(.x86_trampoline) - x86_trampoline_end = .; - } - .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { __x86_cpu_dev_start = .; *(.x86_cpu_dev.init) |