Diffstat (limited to 'arch/x86/kernel')
158 files changed, 11267 insertions, 10437 deletions
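The first hunk below replaces the separate Makefile_32/Makefile_64 includes with a single arch/x86/kernel/Makefile keyed on $(BITS). A minimal sketch of that naming pattern, not taken verbatim from the diff and assuming BITS is provided as 32 or 64 by the arch Makefile:

    # Sketch of the $(BITS)-suffixed object selection used by the unified Makefile.
    # Assumption: BITS is set to 32 or 64 before this file is read.
    obj-y                := process_$(BITS).o   # expands to process_32.o or process_64.o
    obj-$(CONFIG_X86_32) += sys_i386_32.o       # built only when CONFIG_X86_32=y
    obj-$(CONFIG_X86_64) += sys_x86_64.o        # built only when CONFIG_X86_64=y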
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 38573340b143..6f813009d44b 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -1,9 +1,91 @@
-ifeq ($(CONFIG_X86_32),y)
-include ${srctree}/arch/x86/kernel/Makefile_32
-else
-include ${srctree}/arch/x86/kernel/Makefile_64
+#
+# Makefile for the linux kernel.
+#
+
+extra-y := head_$(BITS).o init_task.o vmlinux.lds
+extra-$(CONFIG_X86_64) += head64.o
+
+CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
+CFLAGS_vsyscall_64.o := $(PROFILING) -g0
+
+obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o
+obj-y += traps_$(BITS).o irq_$(BITS).o
+obj-y += time_$(BITS).o ioport.o ldt.o
+obj-y += setup_$(BITS).o i8259_$(BITS).o
+obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
+obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
+obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o setup64.o
+obj-y += pci-dma_$(BITS).o bootflag.o e820_$(BITS).o
+obj-y += quirks.o i8237.o topology.o kdebugfs.o
+obj-y += alternative.o i8253.o
+obj-$(CONFIG_X86_64) += pci-nommu_64.o bugs_64.o
+obj-y += tsc_$(BITS).o io_delay.o rtc.o
+
+obj-y += i387.o
+obj-y += ptrace.o
+obj-y += ds.o
+obj-$(CONFIG_X86_32) += tls.o
+obj-$(CONFIG_IA32_EMULATION) += tls.o
+obj-y += step.o
+obj-$(CONFIG_STACKTRACE) += stacktrace.o
+obj-y += cpu/
+obj-y += acpi/
+obj-$(CONFIG_X86_BIOS_REBOOT) += reboot.o
+obj-$(CONFIG_X86_64) += reboot.o
+obj-$(CONFIG_MCA) += mca_32.o
+obj-$(CONFIG_X86_MSR) += msr.o
+obj-$(CONFIG_X86_CPUID) += cpuid.o
+obj-$(CONFIG_MICROCODE) += microcode.o
+obj-$(CONFIG_PCI) += early-quirks.o
+obj-$(CONFIG_APM) += apm_32.o
+obj-$(CONFIG_X86_SMP) += smp_$(BITS).o smpboot_$(BITS).o tsc_sync.o
+obj-$(CONFIG_X86_32_SMP) += smpcommon_32.o
+obj-$(CONFIG_X86_64_SMP) += smp_64.o smpboot_64.o tsc_sync.o
+obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o
+obj-$(CONFIG_X86_MPPARSE) += mpparse_$(BITS).o
+obj-$(CONFIG_X86_LOCAL_APIC) += apic_$(BITS).o nmi_$(BITS).o
+obj-$(CONFIG_X86_IO_APIC) += io_apic_$(BITS).o
+obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
+obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o
+obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
+obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
+obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
+obj-$(CONFIG_X86_SUMMIT_NUMA) += summit_32.o
+obj-$(CONFIG_X86_VSMP) += vsmp_64.o
+obj-$(CONFIG_KPROBES) += kprobes.o
+obj-$(CONFIG_MODULES) += module_$(BITS).o
+obj-$(CONFIG_ACPI_SRAT) += srat_32.o
+obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o
+obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o
+obj-$(CONFIG_VM86) += vm86_32.o
+obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
+
+obj-$(CONFIG_HPET_TIMER) += hpet.o
+
+obj-$(CONFIG_K8_NB) += k8.o
+obj-$(CONFIG_MGEODE_LX) += geode_32.o mfgpt_32.o
+obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o
+obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o
+
+obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o
+obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
+
+ifdef CONFIG_INPUT_PCSPKR
+obj-y += pcspeaker.o
 endif
 
-# Workaround to delete .lds files with make clean
-# The problem is that we do not enter Makefile_32 with make clean.
-clean-files := vsyscall*.lds vsyscall*.so
+obj-$(CONFIG_SCx200) += scx200_32.o
+
+###
+# 64 bit specific files
+ifeq ($(CONFIG_X86_64),y)
+ obj-y += genapic_64.o genapic_flat_64.o
+ obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o
+ obj-$(CONFIG_AUDIT) += audit_64.o
+ obj-$(CONFIG_PM) += suspend_64.o
+ obj-$(CONFIG_HIBERNATION) += suspend_asm_64.o
+
+ obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o
+ obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o
+ obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o
+endif
diff --git a/arch/x86/kernel/Makefile_32 b/arch/x86/kernel/Makefile_32
deleted file mode 100644
index a7bc93c27662..000000000000
--- a/arch/x86/kernel/Makefile_32
+++ /dev/null
@@ -1,88 +0,0 @@
-#
-# Makefile for the linux kernel.
-#
-
-extra-y := head_32.o init_task.o vmlinux.lds
-CPPFLAGS_vmlinux.lds += -Ui386
-
-obj-y := process_32.o signal_32.o entry_32.o traps_32.o irq_32.o \
- ptrace_32.o time_32.o ioport_32.o ldt_32.o setup_32.o i8259_32.o sys_i386_32.o \
- pci-dma_32.o i386_ksyms_32.o i387_32.o bootflag.o e820_32.o\
- quirks.o i8237.o topology.o alternative.o i8253.o tsc_32.o
-
-obj-$(CONFIG_STACKTRACE) += stacktrace.o
-obj-y += cpu/
-obj-y += acpi/
-obj-$(CONFIG_X86_BIOS_REBOOT) += reboot_32.o
-obj-$(CONFIG_MCA) += mca_32.o
-obj-$(CONFIG_X86_MSR) += msr.o
-obj-$(CONFIG_X86_CPUID) += cpuid.o
-obj-$(CONFIG_MICROCODE) += microcode.o
-obj-$(CONFIG_PCI) += early-quirks.o
-obj-$(CONFIG_APM) += apm_32.o
-obj-$(CONFIG_X86_SMP) += smp_32.o smpboot_32.o tsc_sync.o
-obj-$(CONFIG_SMP) += smpcommon_32.o
-obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_32.o
-obj-$(CONFIG_X86_MPPARSE) += mpparse_32.o
-obj-$(CONFIG_X86_LOCAL_APIC) += apic_32.o nmi_32.o
-obj-$(CONFIG_X86_IO_APIC) += io_apic_32.o
-obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
-obj-$(CONFIG_KEXEC) += machine_kexec_32.o relocate_kernel_32.o crash.o
-obj-$(CONFIG_CRASH_DUMP) += crash_dump_32.o
-obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
-obj-$(CONFIG_X86_SUMMIT_NUMA) += summit_32.o
-obj-$(CONFIG_KPROBES) += kprobes_32.o
-obj-$(CONFIG_MODULES) += module_32.o
-obj-y += sysenter_32.o vsyscall_32.o
-obj-$(CONFIG_ACPI_SRAT) += srat_32.o
-obj-$(CONFIG_EFI) += efi_32.o efi_stub_32.o
-obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o
-obj-$(CONFIG_VM86) += vm86_32.o
-obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
-obj-$(CONFIG_HPET_TIMER) += hpet.o
-obj-$(CONFIG_K8_NB) += k8.o
-obj-$(CONFIG_MGEODE_LX) += geode_32.o mfgpt_32.o
-
-obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o
-obj-$(CONFIG_PARAVIRT) += paravirt_32.o
-obj-y += pcspeaker.o
-
-obj-$(CONFIG_SCx200) += scx200_32.o
-
-# vsyscall_32.o contains the vsyscall DSO images as __initdata.
-# We must build both images before we can assemble it.
-# Note: kbuild does not track this dependency due to usage of .incbin
-$(obj)/vsyscall_32.o: $(obj)/vsyscall-int80_32.so $(obj)/vsyscall-sysenter_32.so
-targets += $(foreach F,int80 sysenter,vsyscall-$F_32.o vsyscall-$F_32.so)
-targets += vsyscall-note_32.o vsyscall_32.lds
-
-# The DSO images are built using a special linker script.
-quiet_cmd_syscall = SYSCALL $@ - cmd_syscall = $(CC) -m elf_i386 -nostdlib $(SYSCFLAGS_$(@F)) \ - -Wl,-T,$(filter-out FORCE,$^) -o $@ - -export CPPFLAGS_vsyscall_32.lds += -P -C -Ui386 - -vsyscall-flags = -shared -s -Wl,-soname=linux-gate.so.1 \ - $(call ld-option, -Wl$(comma)--hash-style=sysv) -SYSCFLAGS_vsyscall-sysenter_32.so = $(vsyscall-flags) -SYSCFLAGS_vsyscall-int80_32.so = $(vsyscall-flags) - -$(obj)/vsyscall-int80_32.so $(obj)/vsyscall-sysenter_32.so: \ -$(obj)/vsyscall-%.so: $(src)/vsyscall_32.lds \ - $(obj)/vsyscall-%.o $(obj)/vsyscall-note_32.o FORCE - $(call if_changed,syscall) - -# We also create a special relocatable object that should mirror the symbol -# table and layout of the linked DSO. With ld -R we can then refer to -# these symbols in the kernel code rather than hand-coded addresses. -extra-y += vsyscall-syms.o -$(obj)/built-in.o: $(obj)/vsyscall-syms.o -$(obj)/built-in.o: ld_flags += -R $(obj)/vsyscall-syms.o - -SYSCFLAGS_vsyscall-syms.o = -r -$(obj)/vsyscall-syms.o: $(src)/vsyscall_32.lds \ - $(obj)/vsyscall-sysenter_32.o $(obj)/vsyscall-note_32.o FORCE - $(call if_changed,syscall) - - diff --git a/arch/x86/kernel/Makefile_64 b/arch/x86/kernel/Makefile_64 deleted file mode 100644 index 5a88890d8ee9..000000000000 --- a/arch/x86/kernel/Makefile_64 +++ /dev/null @@ -1,45 +0,0 @@ -# -# Makefile for the linux kernel. -# - -extra-y := head_64.o head64.o init_task.o vmlinux.lds -CPPFLAGS_vmlinux.lds += -Ux86_64 -EXTRA_AFLAGS := -traditional - -obj-y := process_64.o signal_64.o entry_64.o traps_64.o irq_64.o \ - ptrace_64.o time_64.o ioport_64.o ldt_64.o setup_64.o i8259_64.o sys_x86_64.o \ - x8664_ksyms_64.o i387_64.o syscall_64.o vsyscall_64.o \ - setup64.o bootflag.o e820_64.o reboot_64.o quirks.o i8237.o \ - pci-dma_64.o pci-nommu_64.o alternative.o hpet.o tsc_64.o bugs_64.o \ - i8253.o - -obj-$(CONFIG_STACKTRACE) += stacktrace.o -obj-y += cpu/ -obj-y += acpi/ -obj-$(CONFIG_X86_MSR) += msr.o -obj-$(CONFIG_MICROCODE) += microcode.o -obj-$(CONFIG_X86_CPUID) += cpuid.o -obj-$(CONFIG_SMP) += smp_64.o smpboot_64.o trampoline_64.o tsc_sync.o -obj-y += apic_64.o nmi_64.o -obj-y += io_apic_64.o mpparse_64.o genapic_64.o genapic_flat_64.o -obj-$(CONFIG_KEXEC) += machine_kexec_64.o relocate_kernel_64.o crash.o -obj-$(CONFIG_CRASH_DUMP) += crash_dump_64.o -obj-$(CONFIG_PM) += suspend_64.o -obj-$(CONFIG_HIBERNATION) += suspend_asm_64.o -obj-$(CONFIG_EARLY_PRINTK) += early_printk.o -obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o -obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o -obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o -obj-$(CONFIG_KPROBES) += kprobes_64.o -obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o -obj-$(CONFIG_X86_VSMP) += vsmp_64.o -obj-$(CONFIG_K8_NB) += k8.o -obj-$(CONFIG_AUDIT) += audit_64.o - -obj-$(CONFIG_MODULES) += module_64.o -obj-$(CONFIG_PCI) += early-quirks.o - -obj-y += topology.o -obj-y += pcspeaker.o - -CFLAGS_vsyscall_64.o := $(PROFILING) -g0 diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile index 1351c3982ee4..19d3d6e9d09b 100644 --- a/arch/x86/kernel/acpi/Makefile +++ b/arch/x86/kernel/acpi/Makefile @@ -1,5 +1,5 @@ obj-$(CONFIG_ACPI) += boot.o -obj-$(CONFIG_ACPI_SLEEP) += sleep_$(BITS).o wakeup_$(BITS).o +obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o ifneq ($(CONFIG_ACPI_PROCESSOR),) obj-y += cstate.o processor.o diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c new file mode 100644 index 000000000000..6bc815cd8cb3 --- /dev/null +++ b/arch/x86/kernel/acpi/sleep.c @@ -0,0 +1,87 @@ +/* + 
* sleep.c - x86-specific ACPI sleep support. + * + * Copyright (C) 2001-2003 Patrick Mochel + * Copyright (C) 2001-2003 Pavel Machek <pavel@suse.cz> + */ + +#include <linux/acpi.h> +#include <linux/bootmem.h> +#include <linux/dmi.h> +#include <linux/cpumask.h> + +#include <asm/smp.h> + +/* address in low memory of the wakeup routine. */ +unsigned long acpi_wakeup_address = 0; +unsigned long acpi_realmode_flags; +extern char wakeup_start, wakeup_end; + +extern unsigned long acpi_copy_wakeup_routine(unsigned long); + +/** + * acpi_save_state_mem - save kernel state + * + * Create an identity mapped page table and copy the wakeup routine to + * low memory. + */ +int acpi_save_state_mem(void) +{ + if (!acpi_wakeup_address) { + printk(KERN_ERR "Could not allocate memory during boot, S3 disabled\n"); + return -ENOMEM; + } + memcpy((void *)acpi_wakeup_address, &wakeup_start, + &wakeup_end - &wakeup_start); + acpi_copy_wakeup_routine(acpi_wakeup_address); + + return 0; +} + +/* + * acpi_restore_state - undo effects of acpi_save_state_mem + */ +void acpi_restore_state_mem(void) +{ +} + + +/** + * acpi_reserve_bootmem - do _very_ early ACPI initialisation + * + * We allocate a page from the first 1MB of memory for the wakeup + * routine for when we come back from a sleep state. The + * runtime allocator allows specification of <16MB pages, but not + * <1MB pages. + */ +void __init acpi_reserve_bootmem(void) +{ + if ((&wakeup_end - &wakeup_start) > PAGE_SIZE*2) { + printk(KERN_ERR + "ACPI: Wakeup code way too big, S3 disabled.\n"); + return; + } + + acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE*2); + if (!acpi_wakeup_address) + printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n"); +} + + +static int __init acpi_sleep_setup(char *str) +{ + while ((str != NULL) && (*str != '\0')) { + if (strncmp(str, "s3_bios", 7) == 0) + acpi_realmode_flags |= 1; + if (strncmp(str, "s3_mode", 7) == 0) + acpi_realmode_flags |= 2; + if (strncmp(str, "s3_beep", 7) == 0) + acpi_realmode_flags |= 4; + str = strchr(str, ','); + if (str != NULL) + str += strspn(str, ", \t"); + } + return 1; +} + +__setup("acpi_sleep=", acpi_sleep_setup); diff --git a/arch/x86/kernel/acpi/sleep_32.c b/arch/x86/kernel/acpi/sleep_32.c index 10699489cfe7..63fe5525e026 100644 --- a/arch/x86/kernel/acpi/sleep_32.c +++ b/arch/x86/kernel/acpi/sleep_32.c @@ -12,76 +12,6 @@ #include <asm/smp.h> -/* address in low memory of the wakeup routine. */ -unsigned long acpi_wakeup_address = 0; -unsigned long acpi_realmode_flags; -extern char wakeup_start, wakeup_end; - -extern unsigned long FASTCALL(acpi_copy_wakeup_routine(unsigned long)); - -/** - * acpi_save_state_mem - save kernel state - * - * Create an identity mapped page table and copy the wakeup routine to - * low memory. - */ -int acpi_save_state_mem(void) -{ - if (!acpi_wakeup_address) - return 1; - memcpy((void *)acpi_wakeup_address, &wakeup_start, - &wakeup_end - &wakeup_start); - acpi_copy_wakeup_routine(acpi_wakeup_address); - - return 0; -} - -/* - * acpi_restore_state - undo effects of acpi_save_state_mem - */ -void acpi_restore_state_mem(void) -{ -} - -/** - * acpi_reserve_bootmem - do _very_ early ACPI initialisation - * - * We allocate a page from the first 1MB of memory for the wakeup - * routine for when we come back from a sleep state. The - * runtime allocator allows specification of <16MB pages, but not - * <1MB pages. 
- */ -void __init acpi_reserve_bootmem(void) -{ - if ((&wakeup_end - &wakeup_start) > PAGE_SIZE) { - printk(KERN_ERR - "ACPI: Wakeup code way too big, S3 disabled.\n"); - return; - } - - acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE); - if (!acpi_wakeup_address) - printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n"); -} - -static int __init acpi_sleep_setup(char *str) -{ - while ((str != NULL) && (*str != '\0')) { - if (strncmp(str, "s3_bios", 7) == 0) - acpi_realmode_flags |= 1; - if (strncmp(str, "s3_mode", 7) == 0) - acpi_realmode_flags |= 2; - if (strncmp(str, "s3_beep", 7) == 0) - acpi_realmode_flags |= 4; - str = strchr(str, ','); - if (str != NULL) - str += strspn(str, ", \t"); - } - return 1; -} - -__setup("acpi_sleep=", acpi_sleep_setup); - /* Ouch, we want to delete this. We already have better version in userspace, in s2ram from suspend.sf.net project */ static __init int reset_videomode_after_s3(const struct dmi_system_id *d) diff --git a/arch/x86/kernel/acpi/sleep_64.c b/arch/x86/kernel/acpi/sleep_64.c deleted file mode 100644 index da42de261ba8..000000000000 --- a/arch/x86/kernel/acpi/sleep_64.c +++ /dev/null @@ -1,117 +0,0 @@ -/* - * acpi.c - Architecture-Specific Low-Level ACPI Support - * - * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com> - * Copyright (C) 2001 Jun Nakajima <jun.nakajima@intel.com> - * Copyright (C) 2001 Patrick Mochel <mochel@osdl.org> - * Copyright (C) 2002 Andi Kleen, SuSE Labs (x86-64 port) - * Copyright (C) 2003 Pavel Machek, SuSE Labs - * - * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - */ - -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/types.h> -#include <linux/stddef.h> -#include <linux/slab.h> -#include <linux/pci.h> -#include <linux/bootmem.h> -#include <linux/acpi.h> -#include <linux/cpumask.h> - -#include <asm/mpspec.h> -#include <asm/io.h> -#include <asm/apic.h> -#include <asm/apicdef.h> -#include <asm/page.h> -#include <asm/pgtable.h> -#include <asm/pgalloc.h> -#include <asm/io_apic.h> -#include <asm/proto.h> -#include <asm/tlbflush.h> - -/* -------------------------------------------------------------------------- - Low-Level Sleep Support - -------------------------------------------------------------------------- */ - -/* address in low memory of the wakeup routine. */ -unsigned long acpi_wakeup_address = 0; -unsigned long acpi_realmode_flags; -extern char wakeup_start, wakeup_end; - -extern unsigned long acpi_copy_wakeup_routine(unsigned long); - -/** - * acpi_save_state_mem - save kernel state - * - * Create an identity mapped page table and copy the wakeup routine to - * low memory. 
- */ -int acpi_save_state_mem(void) -{ - memcpy((void *)acpi_wakeup_address, &wakeup_start, - &wakeup_end - &wakeup_start); - acpi_copy_wakeup_routine(acpi_wakeup_address); - - return 0; -} - -/* - * acpi_restore_state - */ -void acpi_restore_state_mem(void) -{ -} - -/** - * acpi_reserve_bootmem - do _very_ early ACPI initialisation - * - * We allocate a page in low memory for the wakeup - * routine for when we come back from a sleep state. The - * runtime allocator allows specification of <16M pages, but not - * <1M pages. - */ -void __init acpi_reserve_bootmem(void) -{ - acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE*2); - if ((&wakeup_end - &wakeup_start) > (PAGE_SIZE*2)) - printk(KERN_CRIT - "ACPI: Wakeup code way too big, will crash on attempt" - " to suspend\n"); -} - -static int __init acpi_sleep_setup(char *str) -{ - while ((str != NULL) && (*str != '\0')) { - if (strncmp(str, "s3_bios", 7) == 0) - acpi_realmode_flags |= 1; - if (strncmp(str, "s3_mode", 7) == 0) - acpi_realmode_flags |= 2; - if (strncmp(str, "s3_beep", 7) == 0) - acpi_realmode_flags |= 4; - str = strchr(str, ','); - if (str != NULL) - str += strspn(str, ", \t"); - } - return 1; -} - -__setup("acpi_sleep=", acpi_sleep_setup); - diff --git a/arch/x86/kernel/acpi/wakeup_32.S b/arch/x86/kernel/acpi/wakeup_32.S index 1e931aaf2ef6..f53e3277f8e5 100644 --- a/arch/x86/kernel/acpi/wakeup_32.S +++ b/arch/x86/kernel/acpi/wakeup_32.S @@ -1,4 +1,4 @@ -.text + .section .text.page_aligned #include <linux/linkage.h> #include <asm/segment.h> #include <asm/page.h> diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S index 5ed3bc5c61d7..2e1b9e0d0767 100644 --- a/arch/x86/kernel/acpi/wakeup_64.S +++ b/arch/x86/kernel/acpi/wakeup_64.S @@ -344,13 +344,13 @@ do_suspend_lowlevel: call save_processor_state movq $saved_context, %rax - movq %rsp, pt_regs_rsp(%rax) - movq %rbp, pt_regs_rbp(%rax) - movq %rsi, pt_regs_rsi(%rax) - movq %rdi, pt_regs_rdi(%rax) - movq %rbx, pt_regs_rbx(%rax) - movq %rcx, pt_regs_rcx(%rax) - movq %rdx, pt_regs_rdx(%rax) + movq %rsp, pt_regs_sp(%rax) + movq %rbp, pt_regs_bp(%rax) + movq %rsi, pt_regs_si(%rax) + movq %rdi, pt_regs_di(%rax) + movq %rbx, pt_regs_bx(%rax) + movq %rcx, pt_regs_cx(%rax) + movq %rdx, pt_regs_dx(%rax) movq %r8, pt_regs_r8(%rax) movq %r9, pt_regs_r9(%rax) movq %r10, pt_regs_r10(%rax) @@ -360,7 +360,7 @@ do_suspend_lowlevel: movq %r14, pt_regs_r14(%rax) movq %r15, pt_regs_r15(%rax) pushfq - popq pt_regs_eflags(%rax) + popq pt_regs_flags(%rax) movq $.L97, saved_rip(%rip) @@ -391,15 +391,15 @@ do_suspend_lowlevel: movq %rbx, %cr2 movq saved_context_cr0(%rax), %rbx movq %rbx, %cr0 - pushq pt_regs_eflags(%rax) + pushq pt_regs_flags(%rax) popfq - movq pt_regs_rsp(%rax), %rsp - movq pt_regs_rbp(%rax), %rbp - movq pt_regs_rsi(%rax), %rsi - movq pt_regs_rdi(%rax), %rdi - movq pt_regs_rbx(%rax), %rbx - movq pt_regs_rcx(%rax), %rcx - movq pt_regs_rdx(%rax), %rdx + movq pt_regs_sp(%rax), %rsp + movq pt_regs_bp(%rax), %rbp + movq pt_regs_si(%rax), %rsi + movq pt_regs_di(%rax), %rdi + movq pt_regs_bx(%rax), %rbx + movq pt_regs_cx(%rax), %rcx + movq pt_regs_dx(%rax), %rdx movq pt_regs_r8(%rax), %r8 movq pt_regs_r9(%rax), %r9 movq pt_regs_r10(%rax), %r10 diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index d6405e0842b5..45d79ea890ae 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -273,6 +273,7 @@ struct smp_alt_module { }; static LIST_HEAD(smp_alt_modules); static DEFINE_SPINLOCK(smp_alt); +static int 
smp_mode = 1; /* protected by smp_alt */ void alternatives_smp_module_add(struct module *mod, char *name, void *locks, void *locks_end, @@ -341,12 +342,13 @@ void alternatives_smp_switch(int smp) #ifdef CONFIG_LOCKDEP /* - * A not yet fixed binutils section handling bug prevents - * alternatives-replacement from working reliably, so turn - * it off: + * Older binutils section handling bug prevented + * alternatives-replacement from working reliably. + * + * If this still occurs then you should see a hang + * or crash shortly after this line: */ - printk("lockdep: not fixing up alternatives.\n"); - return; + printk("lockdep: fixing up alternatives.\n"); #endif if (noreplace_smp || smp_alt_once) @@ -354,21 +356,29 @@ void alternatives_smp_switch(int smp) BUG_ON(!smp && (num_online_cpus() > 1)); spin_lock_irqsave(&smp_alt, flags); - if (smp) { + + /* + * Avoid unnecessary switches because it forces JIT based VMs to + * throw away all cached translations, which can be quite costly. + */ + if (smp == smp_mode) { + /* nothing */ + } else if (smp) { printk(KERN_INFO "SMP alternatives: switching to SMP code\n"); - clear_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability); - clear_bit(X86_FEATURE_UP, cpu_data(0).x86_capability); + clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP); + clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP); list_for_each_entry(mod, &smp_alt_modules, next) alternatives_smp_lock(mod->locks, mod->locks_end, mod->text, mod->text_end); } else { printk(KERN_INFO "SMP alternatives: switching to UP code\n"); - set_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability); - set_bit(X86_FEATURE_UP, cpu_data(0).x86_capability); + set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP); + set_cpu_cap(&cpu_data(0), X86_FEATURE_UP); list_for_each_entry(mod, &smp_alt_modules, next) alternatives_smp_unlock(mod->locks, mod->locks_end, mod->text, mod->text_end); } + smp_mode = smp; spin_unlock_irqrestore(&smp_alt, flags); } @@ -431,8 +441,9 @@ void __init alternative_instructions(void) if (smp_alt_once) { if (1 == num_possible_cpus()) { printk(KERN_INFO "SMP alternatives: switching to UP code\n"); - set_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability); - set_bit(X86_FEATURE_UP, cpu_data(0).x86_capability); + set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP); + set_cpu_cap(&cpu_data(0), X86_FEATURE_UP); + alternatives_smp_unlock(__smp_locks, __smp_locks_end, _text, _etext); } @@ -440,7 +451,10 @@ void __init alternative_instructions(void) alternatives_smp_module_add(NULL, "core kernel", __smp_locks, __smp_locks_end, _text, _etext); - alternatives_smp_switch(0); + + /* Only switch to UP mode if we don't immediately boot others */ + if (num_possible_cpus() == 1 || setup_max_cpus <= 1) + alternatives_smp_switch(0); } #endif apply_paravirt(__parainstructions, __parainstructions_end); diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 5b6992799c9d..608152a2a05e 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -1,12 +1,12 @@ -/* +/* * Firmware replacement code. - * + * * Work around broken BIOSes that don't set an aperture or only set the - * aperture in the AGP bridge. - * If all fails map the aperture over some low memory. This is cheaper than - * doing bounce buffering. The memory is lost. This is done at early boot - * because only the bootmem allocator can allocate 32+MB. - * + * aperture in the AGP bridge. + * If all fails map the aperture over some low memory. This is cheaper than + * doing bounce buffering. The memory is lost. 
This is done at early boot + * because only the bootmem allocator can allocate 32+MB. + * * Copyright 2002 Andi Kleen, SuSE Labs. */ #include <linux/kernel.h> @@ -30,7 +30,7 @@ int gart_iommu_aperture_disabled __initdata = 0; int gart_iommu_aperture_allowed __initdata = 0; int fallback_aper_order __initdata = 1; /* 64MB */ -int fallback_aper_force __initdata = 0; +int fallback_aper_force __initdata = 0; int fix_aperture __initdata = 1; @@ -49,167 +49,270 @@ static void __init insert_aperture_resource(u32 aper_base, u32 aper_size) /* This code runs before the PCI subsystem is initialized, so just access the northbridge directly. */ -static u32 __init allocate_aperture(void) +static u32 __init allocate_aperture(void) { u32 aper_size; - void *p; + void *p; - if (fallback_aper_order > 7) - fallback_aper_order = 7; - aper_size = (32 * 1024 * 1024) << fallback_aper_order; + if (fallback_aper_order > 7) + fallback_aper_order = 7; + aper_size = (32 * 1024 * 1024) << fallback_aper_order; - /* - * Aperture has to be naturally aligned. This means an 2GB aperture won't - * have much chance of finding a place in the lower 4GB of memory. - * Unfortunately we cannot move it up because that would make the - * IOMMU useless. + /* + * Aperture has to be naturally aligned. This means a 2GB aperture + * won't have much chance of finding a place in the lower 4GB of + * memory. Unfortunately we cannot move it up because that would + * make the IOMMU useless. */ p = __alloc_bootmem_nopanic(aper_size, aper_size, 0); if (!p || __pa(p)+aper_size > 0xffffffff) { - printk("Cannot allocate aperture memory hole (%p,%uK)\n", - p, aper_size>>10); + printk(KERN_ERR + "Cannot allocate aperture memory hole (%p,%uK)\n", + p, aper_size>>10); if (p) free_bootmem(__pa(p), aper_size); return 0; } - printk("Mapping aperture over %d KB of RAM @ %lx\n", - aper_size >> 10, __pa(p)); + printk(KERN_INFO "Mapping aperture over %d KB of RAM @ %lx\n", + aper_size >> 10, __pa(p)); insert_aperture_resource((u32)__pa(p), aper_size); - return (u32)__pa(p); + + return (u32)__pa(p); } static int __init aperture_valid(u64 aper_base, u32 aper_size) -{ - if (!aper_base) - return 0; - if (aper_size < 64*1024*1024) { - printk("Aperture too small (%d MB)\n", aper_size>>20); +{ + if (!aper_base) return 0; - } + if (aper_base + aper_size > 0x100000000UL) { - printk("Aperture beyond 4GB. Ignoring.\n"); - return 0; + printk(KERN_ERR "Aperture beyond 4GB. Ignoring.\n"); + return 0; } if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) { - printk("Aperture pointing to e820 RAM. Ignoring.\n"); - return 0; - } + printk(KERN_ERR "Aperture pointing to e820 RAM. 
Ignoring.\n"); + return 0; + } + if (aper_size < 64*1024*1024) { + printk(KERN_ERR "Aperture too small (%d MB)\n", aper_size>>20); + return 0; + } + return 1; -} +} /* Find a PCI capability */ -static __u32 __init find_cap(int num, int slot, int func, int cap) -{ - u8 pos; +static __u32 __init find_cap(int num, int slot, int func, int cap) +{ int bytes; - if (!(read_pci_config_16(num,slot,func,PCI_STATUS) & PCI_STATUS_CAP_LIST)) + u8 pos; + + if (!(read_pci_config_16(num, slot, func, PCI_STATUS) & + PCI_STATUS_CAP_LIST)) return 0; - pos = read_pci_config_byte(num,slot,func,PCI_CAPABILITY_LIST); - for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) { + + pos = read_pci_config_byte(num, slot, func, PCI_CAPABILITY_LIST); + for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) { u8 id; - pos &= ~3; - id = read_pci_config_byte(num,slot,func,pos+PCI_CAP_LIST_ID); + + pos &= ~3; + id = read_pci_config_byte(num, slot, func, pos+PCI_CAP_LIST_ID); if (id == 0xff) break; - if (id == cap) - return pos; - pos = read_pci_config_byte(num,slot,func,pos+PCI_CAP_LIST_NEXT); - } + if (id == cap) + return pos; + pos = read_pci_config_byte(num, slot, func, + pos+PCI_CAP_LIST_NEXT); + } return 0; -} +} /* Read a standard AGPv3 bridge header */ static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order) -{ +{ u32 apsize; u32 apsizereg; int nbits; u32 aper_low, aper_hi; u64 aper; - printk("AGP bridge at %02x:%02x:%02x\n", num, slot, func); - apsizereg = read_pci_config_16(num,slot,func, cap + 0x14); + printk(KERN_INFO "AGP bridge at %02x:%02x:%02x\n", num, slot, func); + apsizereg = read_pci_config_16(num, slot, func, cap + 0x14); if (apsizereg == 0xffffffff) { - printk("APSIZE in AGP bridge unreadable\n"); + printk(KERN_ERR "APSIZE in AGP bridge unreadable\n"); return 0; } apsize = apsizereg & 0xfff; /* Some BIOS use weird encodings not in the AGPv3 table. */ - if (apsize & 0xff) - apsize |= 0xf00; + if (apsize & 0xff) + apsize |= 0xf00; nbits = hweight16(apsize); *order = 7 - nbits; if ((int)*order < 0) /* < 32MB */ *order = 0; - - aper_low = read_pci_config(num,slot,func, 0x10); - aper_hi = read_pci_config(num,slot,func,0x14); + + aper_low = read_pci_config(num, slot, func, 0x10); + aper_hi = read_pci_config(num, slot, func, 0x14); aper = (aper_low & ~((1<<22)-1)) | ((u64)aper_hi << 32); - printk("Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n", - aper, 32 << *order, apsizereg); + printk(KERN_INFO "Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n", + aper, 32 << *order, apsizereg); if (!aperture_valid(aper, (32*1024*1024) << *order)) - return 0; - return (u32)aper; -} - -/* Look for an AGP bridge. Windows only expects the aperture in the - AGP bridge and some BIOS forget to initialize the Northbridge too. - Work around this here. - - Do an PCI bus scan by hand because we're running before the PCI - subsystem. + return 0; + return (u32)aper; +} - All K8 AGP bridges are AGPv3 compliant, so we can do this scan - generically. It's probably overkill to always scan all slots because - the AGP bridges should be always an own bus on the HT hierarchy, - but do it here for future safety. */ +/* + * Look for an AGP bridge. Windows only expects the aperture in the + * AGP bridge and some BIOS forget to initialize the Northbridge too. + * Work around this here. + * + * Do an PCI bus scan by hand because we're running before the PCI + * subsystem. + * + * All K8 AGP bridges are AGPv3 compliant, so we can do this scan + * generically. 
It's probably overkill to always scan all slots because + * the AGP bridges should be always an own bus on the HT hierarchy, + * but do it here for future safety. + */ static __u32 __init search_agp_bridge(u32 *order, int *valid_agp) { int num, slot, func; /* Poor man's PCI discovery */ - for (num = 0; num < 256; num++) { - for (slot = 0; slot < 32; slot++) { - for (func = 0; func < 8; func++) { + for (num = 0; num < 256; num++) { + for (slot = 0; slot < 32; slot++) { + for (func = 0; func < 8; func++) { u32 class, cap; u8 type; - class = read_pci_config(num,slot,func, + class = read_pci_config(num, slot, func, PCI_CLASS_REVISION); if (class == 0xffffffff) - break; - - switch (class >> 16) { + break; + + switch (class >> 16) { case PCI_CLASS_BRIDGE_HOST: case PCI_CLASS_BRIDGE_OTHER: /* needed? */ /* AGP bridge? */ - cap = find_cap(num,slot,func,PCI_CAP_ID_AGP); + cap = find_cap(num, slot, func, + PCI_CAP_ID_AGP); if (!cap) break; - *valid_agp = 1; - return read_agp(num,slot,func,cap,order); - } - + *valid_agp = 1; + return read_agp(num, slot, func, cap, + order); + } + /* No multi-function device? */ - type = read_pci_config_byte(num,slot,func, + type = read_pci_config_byte(num, slot, func, PCI_HEADER_TYPE); if (!(type & 0x80)) break; - } - } + } + } } - printk("No AGP bridge found\n"); + printk(KERN_INFO "No AGP bridge found\n"); + return 0; } +static int gart_fix_e820 __initdata = 1; + +static int __init parse_gart_mem(char *p) +{ + if (!p) + return -EINVAL; + + if (!strncmp(p, "off", 3)) + gart_fix_e820 = 0; + else if (!strncmp(p, "on", 2)) + gart_fix_e820 = 1; + + return 0; +} +early_param("gart_fix_e820", parse_gart_mem); + +void __init early_gart_iommu_check(void) +{ + /* + * in case it is enabled before, esp for kexec/kdump, + * previous kernel already enable that. memset called + * by allocate_aperture/__alloc_bootmem_nopanic cause restart. + * or second kernel have different position for GART hole. and new + * kernel could use hole as RAM that is still used by GART set by + * first kernel + * or BIOS forget to put that in reserved. + * try to update e820 to make that region as reserved. 
+ */ + int fix, num; + u32 ctl; + u32 aper_size = 0, aper_order = 0, last_aper_order = 0; + u64 aper_base = 0, last_aper_base = 0; + int aper_enabled = 0, last_aper_enabled = 0; + + if (!early_pci_allowed()) + return; + + fix = 0; + for (num = 24; num < 32; num++) { + if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00))) + continue; + + ctl = read_pci_config(0, num, 3, 0x90); + aper_enabled = ctl & 1; + aper_order = (ctl >> 1) & 7; + aper_size = (32 * 1024 * 1024) << aper_order; + aper_base = read_pci_config(0, num, 3, 0x94) & 0x7fff; + aper_base <<= 25; + + if ((last_aper_order && aper_order != last_aper_order) || + (last_aper_base && aper_base != last_aper_base) || + (last_aper_enabled && aper_enabled != last_aper_enabled)) { + fix = 1; + break; + } + last_aper_order = aper_order; + last_aper_base = aper_base; + last_aper_enabled = aper_enabled; + } + + if (!fix && !aper_enabled) + return; + + if (!aper_base || !aper_size || aper_base + aper_size > 0x100000000UL) + fix = 1; + + if (gart_fix_e820 && !fix && aper_enabled) { + if (e820_any_mapped(aper_base, aper_base + aper_size, + E820_RAM)) { + /* reserved it, so we can resuse it in second kernel */ + printk(KERN_INFO "update e820 for GART\n"); + add_memory_region(aper_base, aper_size, E820_RESERVED); + update_e820(); + } + return; + } + + /* different nodes have different setting, disable them all at first*/ + for (num = 24; num < 32; num++) { + if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00))) + continue; + + ctl = read_pci_config(0, num, 3, 0x90); + ctl &= ~1; + write_pci_config(0, num, 3, 0x90, ctl); + } + +} + void __init gart_iommu_hole_init(void) -{ - int fix, num; +{ u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0; u64 aper_base, last_aper_base = 0; - int valid_agp = 0; + int fix, num, valid_agp = 0; + int node; if (gart_iommu_aperture_disabled || !fix_aperture || !early_pci_allowed()) @@ -218,24 +321,26 @@ void __init gart_iommu_hole_init(void) printk(KERN_INFO "Checking aperture...\n"); fix = 0; - for (num = 24; num < 32; num++) { + node = 0; + for (num = 24; num < 32; num++) { if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00))) continue; iommu_detected = 1; gart_iommu_aperture = 1; - aper_order = (read_pci_config(0, num, 3, 0x90) >> 1) & 7; - aper_size = (32 * 1024 * 1024) << aper_order; + aper_order = (read_pci_config(0, num, 3, 0x90) >> 1) & 7; + aper_size = (32 * 1024 * 1024) << aper_order; aper_base = read_pci_config(0, num, 3, 0x94) & 0x7fff; - aper_base <<= 25; + aper_base <<= 25; + + printk(KERN_INFO "Node %d: aperture @ %Lx size %u MB\n", + node, aper_base, aper_size >> 20); + node++; - printk("CPU %d: aperture @ %Lx size %u MB\n", num-24, - aper_base, aper_size>>20); - if (!aperture_valid(aper_base, aper_size)) { - fix = 1; - break; + fix = 1; + break; } if ((last_aper_order && aper_order != last_aper_order) || @@ -245,55 +350,64 @@ void __init gart_iommu_hole_init(void) } last_aper_order = aper_order; last_aper_base = aper_base; - } + } if (!fix && !fallback_aper_force) { if (last_aper_base) { unsigned long n = (32 * 1024 * 1024) << last_aper_order; + insert_aperture_resource((u32)last_aper_base, n); } - return; + return; } if (!fallback_aper_force) - aper_alloc = search_agp_bridge(&aper_order, &valid_agp); - - if (aper_alloc) { + aper_alloc = search_agp_bridge(&aper_order, &valid_agp); + + if (aper_alloc) { /* Got the aperture from the AGP bridge */ } else if (swiotlb && !valid_agp) { /* Do nothing */ } else if ((!no_iommu && end_pfn > MAX_DMA32_PFN) || force_iommu || valid_agp || - 
fallback_aper_force) { - printk("Your BIOS doesn't leave a aperture memory hole\n"); - printk("Please enable the IOMMU option in the BIOS setup\n"); - printk("This costs you %d MB of RAM\n", - 32 << fallback_aper_order); + fallback_aper_force) { + printk(KERN_ERR + "Your BIOS doesn't leave a aperture memory hole\n"); + printk(KERN_ERR + "Please enable the IOMMU option in the BIOS setup\n"); + printk(KERN_ERR + "This costs you %d MB of RAM\n", + 32 << fallback_aper_order); aper_order = fallback_aper_order; aper_alloc = allocate_aperture(); - if (!aper_alloc) { - /* Could disable AGP and IOMMU here, but it's probably - not worth it. But the later users cannot deal with - bad apertures and turning on the aperture over memory - causes very strange problems, so it's better to - panic early. */ + if (!aper_alloc) { + /* + * Could disable AGP and IOMMU here, but it's + * probably not worth it. But the later users + * cannot deal with bad apertures and turning + * on the aperture over memory causes very + * strange problems, so it's better to panic + * early. + */ panic("Not enough memory for aperture"); } - } else { - return; - } + } else { + return; + } /* Fix up the north bridges */ - for (num = 24; num < 32; num++) { + for (num = 24; num < 32; num++) { if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00))) - continue; - - /* Don't enable translation yet. That is done later. - Assume this BIOS didn't initialise the GART so - just overwrite all previous bits */ - write_pci_config(0, num, 3, 0x90, aper_order<<1); - write_pci_config(0, num, 3, 0x94, aper_alloc>>25); - } -} + continue; + + /* + * Don't enable translation yet. That is done later. + * Assume this BIOS didn't initialise the GART so + * just overwrite all previous bits + */ + write_pci_config(0, num, 3, 0x90, aper_order<<1); + write_pci_config(0, num, 3, 0x94, aper_alloc>>25); + } +} diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index edb5108e5d0e..35a568ea8400 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -43,12 +43,10 @@ #include <mach_apicdef.h> #include <mach_ipi.h> -#include "io_ports.h" - /* * Sanity check */ -#if (SPURIOUS_APIC_VECTOR & 0x0F) != 0x0F +#if ((SPURIOUS_APIC_VECTOR & 0x0F) != 0x0F) # error SPURIOUS_APIC_VECTOR definition error #endif @@ -57,7 +55,7 @@ * * -1=force-disable, +1=force-enable */ -static int enable_local_apic __initdata = 0; +static int enable_local_apic __initdata; /* Local APIC timer verification ok */ static int local_apic_timer_verify_ok; @@ -101,6 +99,8 @@ static DEFINE_PER_CPU(struct clock_event_device, lapic_events); /* Local APIC was disabled by the BIOS and enabled by the kernel */ static int enabled_via_apicbase; +static unsigned long apic_phys; + /* * Get the LAPIC version */ @@ -110,7 +110,7 @@ static inline int lapic_get_version(void) } /* - * Check, if the APIC is integrated or a seperate chip + * Check, if the APIC is integrated or a separate chip */ static inline int lapic_is_integrated(void) { @@ -135,9 +135,9 @@ void apic_wait_icr_idle(void) cpu_relax(); } -unsigned long safe_apic_wait_icr_idle(void) +u32 safe_apic_wait_icr_idle(void) { - unsigned long send_status; + u32 send_status; int timeout; timeout = 0; @@ -154,7 +154,7 @@ unsigned long safe_apic_wait_icr_idle(void) /** * enable_NMI_through_LVT0 - enable NMI through local vector table 0 */ -void enable_NMI_through_LVT0 (void * dummy) +void __cpuinit enable_NMI_through_LVT0(void) { unsigned int v = APIC_DM_NMI; @@ -379,8 +379,10 @@ void __init setup_boot_APIC_clock(void) */ if 
(local_apic_timer_disabled) { /* No broadcast on UP ! */ - if (num_possible_cpus() > 1) + if (num_possible_cpus() > 1) { + lapic_clockevent.mult = 1; setup_APIC_timer(); + } return; } @@ -434,7 +436,7 @@ void __init setup_boot_APIC_clock(void) "with PM Timer: %ldms instead of 100ms\n", (long)res); /* Correct the lapic counter value */ - res = (((u64) delta ) * pm_100ms); + res = (((u64) delta) * pm_100ms); do_div(res, deltapm); printk(KERN_INFO "APIC delta adjusted to PM-Timer: " "%lu (%ld)\n", (unsigned long) res, delta); @@ -472,6 +474,19 @@ void __init setup_boot_APIC_clock(void) local_apic_timer_verify_ok = 1; + /* + * Do a sanity check on the APIC calibration result + */ + if (calibration_result < (1000000 / HZ)) { + local_irq_enable(); + printk(KERN_WARNING + "APIC frequency too slow, disabling apic timer\n"); + /* No broadcast on UP ! */ + if (num_possible_cpus() > 1) + setup_APIC_timer(); + return; + } + /* We trust the pm timer based calibration */ if (!pm_referenced) { apic_printk(APIC_VERBOSE, "... verify APIC timer\n"); @@ -563,6 +578,9 @@ static void local_apic_timer_interrupt(void) return; } + /* + * the NMI deadlock-detector uses this. + */ per_cpu(irq_stat, cpu).apic_timer_irqs++; evt->event_handler(evt); @@ -576,8 +594,7 @@ static void local_apic_timer_interrupt(void) * [ if a single-CPU system runs an SMP kernel then we call the local * interrupt as well. Thus we cannot inline the local irq ... ] */ - -void fastcall smp_apic_timer_interrupt(struct pt_regs *regs) +void smp_apic_timer_interrupt(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); @@ -616,9 +633,14 @@ int setup_profiling_timer(unsigned int multiplier) */ void clear_local_APIC(void) { - int maxlvt = lapic_get_maxlvt(); - unsigned long v; + int maxlvt; + u32 v; + + /* APIC hasn't been mapped yet */ + if (!apic_phys) + return; + maxlvt = lapic_get_maxlvt(); /* * Masking an LVT entry can trigger a local APIC error * if the vector is zero. Mask LVTERR first to prevent this. @@ -976,7 +998,8 @@ void __cpuinit setup_local_APIC(void) value |= APIC_LVT_LEVEL_TRIGGER; apic_write_around(APIC_LVT1, value); - if (integrated && !esr_disable) { /* !82489DX */ + if (integrated && !esr_disable) { + /* !82489DX */ maxlvt = lapic_get_maxlvt(); if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ apic_write(APIC_ESR, 0); @@ -1020,7 +1043,7 @@ void __cpuinit setup_local_APIC(void) /* * Detect and initialize APIC */ -static int __init detect_init_APIC (void) +static int __init detect_init_APIC(void) { u32 h, l, features; @@ -1077,7 +1100,7 @@ static int __init detect_init_APIC (void) printk(KERN_WARNING "Could not enable APIC!\n"); return -1; } - set_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); + set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; /* The BIOS may have set up the APIC at some other address */ @@ -1104,8 +1127,6 @@ no_apic: */ void __init init_apic_mappings(void) { - unsigned long apic_phys; - /* * If no local APIC can be found then set up a fake all * zeroes page to simulate the local APIC and another @@ -1164,10 +1185,10 @@ fake_ioapic_page: * This initializes the IO-APIC and APIC hardware if this is * a UP kernel. 
*/ -int __init APIC_init_uniprocessor (void) +int __init APIC_init_uniprocessor(void) { if (enable_local_apic < 0) - clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); + clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); if (!smp_found_config && !cpu_has_apic) return -1; @@ -1179,7 +1200,7 @@ int __init APIC_init_uniprocessor (void) APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", boot_cpu_physical_apicid); - clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); + clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); return -1; } @@ -1210,50 +1231,6 @@ int __init APIC_init_uniprocessor (void) } /* - * APIC command line parameters - */ -static int __init parse_lapic(char *arg) -{ - enable_local_apic = 1; - return 0; -} -early_param("lapic", parse_lapic); - -static int __init parse_nolapic(char *arg) -{ - enable_local_apic = -1; - clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); - return 0; -} -early_param("nolapic", parse_nolapic); - -static int __init parse_disable_lapic_timer(char *arg) -{ - local_apic_timer_disabled = 1; - return 0; -} -early_param("nolapic_timer", parse_disable_lapic_timer); - -static int __init parse_lapic_timer_c2_ok(char *arg) -{ - local_apic_timer_c2_ok = 1; - return 0; -} -early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok); - -static int __init apic_set_verbosity(char *str) -{ - if (strcmp("debug", str) == 0) - apic_verbosity = APIC_DEBUG; - else if (strcmp("verbose", str) == 0) - apic_verbosity = APIC_VERBOSE; - return 1; -} - -__setup("apic=", apic_set_verbosity); - - -/* * Local APIC interrupts */ @@ -1306,7 +1283,7 @@ void smp_error_interrupt(struct pt_regs *regs) 6: Received illegal vector 7: Illegal register address */ - printk (KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n", + printk(KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n", smp_processor_id(), v , v1); irq_exit(); } @@ -1393,7 +1370,7 @@ void disconnect_bsp_APIC(int virt_wire_setup) value = apic_read(APIC_LVT0); value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | - APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED ); + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); apic_write_around(APIC_LVT0, value); @@ -1530,7 +1507,7 @@ static int lapic_resume(struct sys_device *dev) */ static struct sysdev_class lapic_sysclass = { - set_kset_name("lapic"), + .name = "lapic", .resume = lapic_resume, .suspend = lapic_suspend, }; @@ -1565,3 +1542,46 @@ device_initcall(init_lapic_sysfs); static void apic_pm_activate(void) { } #endif /* CONFIG_PM */ + +/* + * APIC command line parameters + */ +static int __init parse_lapic(char *arg) +{ + enable_local_apic = 1; + return 0; +} +early_param("lapic", parse_lapic); + +static int __init parse_nolapic(char *arg) +{ + enable_local_apic = -1; + clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); + return 0; +} +early_param("nolapic", parse_nolapic); + +static int __init parse_disable_lapic_timer(char *arg) +{ + local_apic_timer_disabled = 1; + return 0; +} +early_param("nolapic_timer", parse_disable_lapic_timer); + +static int __init parse_lapic_timer_c2_ok(char *arg) +{ + local_apic_timer_c2_ok = 1; + return 0; +} +early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok); + +static int __init apic_set_verbosity(char *str) +{ + if (strcmp("debug", str) == 0) + apic_verbosity = APIC_DEBUG; + else if (strcmp("verbose", str) == 0) + apic_verbosity = 
APIC_VERBOSE; + return 1; +} +__setup("apic=", apic_set_verbosity); + diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index f28ccb588fba..d8d03e09dea2 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -23,32 +23,37 @@ #include <linux/mc146818rtc.h> #include <linux/kernel_stat.h> #include <linux/sysdev.h> -#include <linux/module.h> #include <linux/ioport.h> #include <linux/clockchips.h> +#include <linux/acpi_pmtmr.h> +#include <linux/module.h> #include <asm/atomic.h> #include <asm/smp.h> #include <asm/mtrr.h> #include <asm/mpspec.h> +#include <asm/hpet.h> #include <asm/pgalloc.h> #include <asm/mach_apic.h> #include <asm/nmi.h> #include <asm/idle.h> #include <asm/proto.h> #include <asm/timex.h> -#include <asm/hpet.h> #include <asm/apic.h> -int apic_verbosity; int disable_apic_timer __cpuinitdata; static int apic_calibrate_pmtmr __initdata; +int disable_apic; -/* Local APIC timer works in C2? */ +/* Local APIC timer works in C2 */ int local_apic_timer_c2_ok; EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); -static struct resource *ioapic_resources; +/* + * Debug level, exported for io_apic.c + */ +int apic_verbosity; + static struct resource lapic_resource = { .name = "Local APIC", .flags = IORESOURCE_MEM | IORESOURCE_BUSY, @@ -60,10 +65,8 @@ static int lapic_next_event(unsigned long delta, struct clock_event_device *evt); static void lapic_timer_setup(enum clock_event_mode mode, struct clock_event_device *evt); - static void lapic_timer_broadcast(cpumask_t mask); - -static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen); +static void apic_pm_activate(void); static struct clock_event_device lapic_clockevent = { .name = "lapic", @@ -78,6 +81,150 @@ static struct clock_event_device lapic_clockevent = { }; static DEFINE_PER_CPU(struct clock_event_device, lapic_events); +static unsigned long apic_phys; + +/* + * Get the LAPIC version + */ +static inline int lapic_get_version(void) +{ + return GET_APIC_VERSION(apic_read(APIC_LVR)); +} + +/* + * Check, if the APIC is integrated or a seperate chip + */ +static inline int lapic_is_integrated(void) +{ + return 1; +} + +/* + * Check, whether this is a modern or a first generation APIC + */ +static int modern_apic(void) +{ + /* AMD systems use old APIC versions, so check the CPU */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && + boot_cpu_data.x86 >= 0xf) + return 1; + return lapic_get_version() >= 0x14; +} + +void apic_wait_icr_idle(void) +{ + while (apic_read(APIC_ICR) & APIC_ICR_BUSY) + cpu_relax(); +} + +u32 safe_apic_wait_icr_idle(void) +{ + u32 send_status; + int timeout; + + timeout = 0; + do { + send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; + if (!send_status) + break; + udelay(100); + } while (timeout++ < 1000); + + return send_status; +} + +/** + * enable_NMI_through_LVT0 - enable NMI through local vector table 0 + */ +void __cpuinit enable_NMI_through_LVT0(void) +{ + unsigned int v; + + /* unmask and set to NMI */ + v = APIC_DM_NMI; + apic_write(APIC_LVT0, v); +} + +/** + * lapic_get_maxlvt - get the maximum number of local vector table entries + */ +int lapic_get_maxlvt(void) +{ + unsigned int v, maxlvt; + + v = apic_read(APIC_LVR); + maxlvt = GET_APIC_MAXLVT(v); + return maxlvt; +} + +/* + * This function sets up the local APIC timer, with a timeout of + * 'clocks' APIC bus clock. During calibration we actually call + * this function twice on the boot CPU, once with a bogus timeout + * value, second time for real. 
The other (noncalibrating) CPUs + * call this function only once, with the real, calibrated value. + * + * We do reads before writes even if unnecessary, to get around the + * P5 APIC double write bug. + */ + +static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) +{ + unsigned int lvtt_value, tmp_value; + + lvtt_value = LOCAL_TIMER_VECTOR; + if (!oneshot) + lvtt_value |= APIC_LVT_TIMER_PERIODIC; + if (!irqen) + lvtt_value |= APIC_LVT_MASKED; + + apic_write(APIC_LVTT, lvtt_value); + + /* + * Divide PICLK by 16 + */ + tmp_value = apic_read(APIC_TDCR); + apic_write(APIC_TDCR, (tmp_value + & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) + | APIC_TDR_DIV_16); + + if (!oneshot) + apic_write(APIC_TMICT, clocks); +} + +/* + * Setup extended LVT, AMD specific (K8, family 10h) + * + * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and + * MCE interrupts are supported. Thus MCE offset must be set to 0. + */ + +#define APIC_EILVT_LVTOFF_MCE 0 +#define APIC_EILVT_LVTOFF_IBS 1 + +static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask) +{ + unsigned long reg = (lvt_off << 4) + APIC_EILVT0; + unsigned int v = (mask << 16) | (msg_type << 8) | vector; + + apic_write(reg, v); +} + +u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask) +{ + setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask); + return APIC_EILVT_LVTOFF_MCE; +} + +u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask) +{ + setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask); + return APIC_EILVT_LVTOFF_IBS; +} + +/* + * Program the next event, relative to now + */ static int lapic_next_event(unsigned long delta, struct clock_event_device *evt) { @@ -85,6 +232,9 @@ static int lapic_next_event(unsigned long delta, return 0; } +/* + * Setup the lapic timer in periodic or oneshot mode + */ static void lapic_timer_setup(enum clock_event_mode mode, struct clock_event_device *evt) { @@ -127,75 +277,261 @@ static void lapic_timer_broadcast(cpumask_t mask) #endif } -static void apic_pm_activate(void); +/* + * Setup the local APIC timer for this CPU. Copy the initilized values + * of the boot CPU and register the clock event in the framework. + */ +static void setup_APIC_timer(void) +{ + struct clock_event_device *levt = &__get_cpu_var(lapic_events); -void apic_wait_icr_idle(void) + memcpy(levt, &lapic_clockevent, sizeof(*levt)); + levt->cpumask = cpumask_of_cpu(smp_processor_id()); + + clockevents_register_device(levt); +} + +/* + * In this function we calibrate APIC bus clocks to the external + * timer. Unfortunately we cannot use jiffies and the timer irq + * to calibrate, since some later bootup code depends on getting + * the first irq? Ugh. + * + * We want to do the calibration only once since we + * want to have local timer irqs syncron. CPUs connected + * by the same APIC bus have the very same bus frequency. + * And we want to have irqs off anyways, no accidental + * APIC irq that way. + */ + +#define TICK_COUNT 100000000 + +static void __init calibrate_APIC_clock(void) { - while (apic_read(APIC_ICR) & APIC_ICR_BUSY) - cpu_relax(); + unsigned apic, apic_start; + unsigned long tsc, tsc_start; + int result; + + local_irq_disable(); + + /* + * Put whatever arbitrary (but long enough) timeout + * value into the APIC clock, we just want to get the + * counter running for calibration. + * + * No interrupt enable ! 
+ */ + __setup_APIC_LVTT(250000000, 0, 0); + + apic_start = apic_read(APIC_TMCCT); +#ifdef CONFIG_X86_PM_TIMER + if (apic_calibrate_pmtmr && pmtmr_ioport) { + pmtimer_wait(5000); /* 5ms wait */ + apic = apic_read(APIC_TMCCT); + result = (apic_start - apic) * 1000L / 5; + } else +#endif + { + rdtscll(tsc_start); + + do { + apic = apic_read(APIC_TMCCT); + rdtscll(tsc); + } while ((tsc - tsc_start) < TICK_COUNT && + (apic_start - apic) < TICK_COUNT); + + result = (apic_start - apic) * 1000L * tsc_khz / + (tsc - tsc_start); + } + + local_irq_enable(); + + printk(KERN_DEBUG "APIC timer calibration result %d\n", result); + + printk(KERN_INFO "Detected %d.%03d MHz APIC timer.\n", + result / 1000 / 1000, result / 1000 % 1000); + + /* Calculate the scaled math multiplication factor */ + lapic_clockevent.mult = div_sc(result, NSEC_PER_SEC, 32); + lapic_clockevent.max_delta_ns = + clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); + lapic_clockevent.min_delta_ns = + clockevent_delta2ns(0xF, &lapic_clockevent); + + calibration_result = result / HZ; } -unsigned int safe_apic_wait_icr_idle(void) +/* + * Setup the boot APIC + * + * Calibrate and verify the result. + */ +void __init setup_boot_APIC_clock(void) { - unsigned int send_status; - int timeout; + /* + * The local apic timer can be disabled via the kernel commandline. + * Register the lapic timer as a dummy clock event source on SMP + * systems, so the broadcast mechanism is used. On UP systems simply + * ignore it. + */ + if (disable_apic_timer) { + printk(KERN_INFO "Disabling APIC timer\n"); + /* No broadcast on UP ! */ + if (num_possible_cpus() > 1) { + lapic_clockevent.mult = 1; + setup_APIC_timer(); + } + return; + } - timeout = 0; - do { - send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; - if (!send_status) - break; - udelay(100); - } while (timeout++ < 1000); + printk(KERN_INFO "Using local APIC timer interrupts.\n"); + calibrate_APIC_clock(); - return send_status; + /* + * Do a sanity check on the APIC calibration result + */ + if (calibration_result < (1000000 / HZ)) { + printk(KERN_WARNING + "APIC frequency too slow, disabling apic timer\n"); + /* No broadcast on UP ! */ + if (num_possible_cpus() > 1) + setup_APIC_timer(); + return; + } + + /* + * If nmi_watchdog is set to IO_APIC, we need the + * PIT/HPET going. Otherwise register lapic as a dummy + * device. + */ + if (nmi_watchdog != NMI_IO_APIC) + lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; + else + printk(KERN_WARNING "APIC timer registered as dummy," + " due to nmi_watchdog=1!\n"); + + setup_APIC_timer(); } -void enable_NMI_through_LVT0 (void * dummy) +/* + * AMD C1E enabled CPUs have a real nasty problem: Some BIOSes set the + * C1E flag only in the secondary CPU, so when we detect the wreckage + * we already have enabled the boot CPU local apic timer. Check, if + * disable_apic_timer is set and the DUMMY flag is cleared. If yes, + * set the DUMMY flag again and force the broadcast mode in the + * clockevents layer. + */ +void __cpuinit check_boot_apic_timer_broadcast(void) { - unsigned int v; + if (!disable_apic_timer || + (lapic_clockevent.features & CLOCK_EVT_FEAT_DUMMY)) + return; - /* unmask and set to NMI */ - v = APIC_DM_NMI; - apic_write(APIC_LVT0, v); + printk(KERN_INFO "AMD C1E detected late. 
Force timer broadcast.\n"); + lapic_clockevent.features |= CLOCK_EVT_FEAT_DUMMY; + + local_irq_enable(); + clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE, &boot_cpu_id); + local_irq_disable(); } -int get_maxlvt(void) +void __cpuinit setup_secondary_APIC_clock(void) { - unsigned int v, maxlvt; + check_boot_apic_timer_broadcast(); + setup_APIC_timer(); +} - v = apic_read(APIC_LVR); - maxlvt = GET_APIC_MAXLVT(v); - return maxlvt; +/* + * The guts of the apic timer interrupt + */ +static void local_apic_timer_interrupt(void) +{ + int cpu = smp_processor_id(); + struct clock_event_device *evt = &per_cpu(lapic_events, cpu); + + /* + * Normally we should not be here till LAPIC has been initialized but + * in some cases like kdump, its possible that there is a pending LAPIC + * timer interrupt from previous kernel's context and is delivered in + * new kernel the moment interrupts are enabled. + * + * Interrupts are enabled early and LAPIC is setup much later, hence + * its possible that when we get here evt->event_handler is NULL. + * Check for event_handler being NULL and discard the interrupt as + * spurious. + */ + if (!evt->event_handler) { + printk(KERN_WARNING + "Spurious LAPIC timer interrupt on cpu %d\n", cpu); + /* Switch it off */ + lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt); + return; + } + + /* + * the NMI deadlock-detector uses this. + */ + add_pda(apic_timer_irqs, 1); + + evt->event_handler(evt); } /* - * 'what should we do if we get a hw irq event on an illegal vector'. - * each architecture has to answer this themselves. + * Local APIC timer interrupt. This is the most natural way for doing + * local interrupts, but local timer interrupts can be emulated by + * broadcast interrupts too. [in case the hw doesn't support APIC timers] + * + * [ if a single-CPU system runs an SMP kernel then we call the local + * interrupt as well. Thus we cannot inline the local irq ... ] */ -void ack_bad_irq(unsigned int irq) +void smp_apic_timer_interrupt(struct pt_regs *regs) { - printk("unexpected IRQ trap at vector %02x\n", irq); + struct pt_regs *old_regs = set_irq_regs(regs); + /* - * Currently unexpected vectors happen only on SMP and APIC. - * We _must_ ack these because every local APIC has only N - * irq slots per priority level, and a 'hanging, unacked' IRQ - * holds up an irq slot - in excessive cases (when multiple - * unexpected vectors occur) that might lock up the APIC - * completely. - * But don't ack when the APIC is disabled. -AK + * NOTE! We'd better ACK the irq immediately, + * because timer handling can be slow. */ - if (!disable_apic) - ack_APIC_irq(); + ack_APIC_irq(); + /* + * update_process_times() expects us to have done irq_enter(). + * Besides, if we don't timer interrupts ignore the global + * interrupt lock, which is the WrongThing (tm) to do. + */ + exit_idle(); + irq_enter(); + local_apic_timer_interrupt(); + irq_exit(); + set_irq_regs(old_regs); +} + +int setup_profiling_timer(unsigned int multiplier) +{ + return -EINVAL; } + +/* + * Local APIC start and shutdown + */ + +/** + * clear_local_APIC - shutdown the local APIC + * + * This is called, when a CPU is disabled and before rebooting, so the state of + * the local APIC has no dangling leftovers. Also used to cleanout any BIOS + * leftovers during boot. 
+ */ void clear_local_APIC(void) { - int maxlvt; - unsigned int v; + int maxlvt = lapic_get_maxlvt(); + u32 v; - maxlvt = get_maxlvt(); + /* APIC hasn't been mapped yet */ + if (!apic_phys) + return; + maxlvt = lapic_get_maxlvt(); /* * Masking an LVT entry can trigger a local APIC error * if the vector is zero. Mask LVTERR first to prevent this. @@ -233,45 +569,9 @@ void clear_local_APIC(void) apic_read(APIC_ESR); } -void disconnect_bsp_APIC(int virt_wire_setup) -{ - /* Go back to Virtual Wire compatibility mode */ - unsigned long value; - - /* For the spurious interrupt use vector F, and enable it */ - value = apic_read(APIC_SPIV); - value &= ~APIC_VECTOR_MASK; - value |= APIC_SPIV_APIC_ENABLED; - value |= 0xf; - apic_write(APIC_SPIV, value); - - if (!virt_wire_setup) { - /* - * For LVT0 make it edge triggered, active high, - * external and enabled - */ - value = apic_read(APIC_LVT0); - value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | - APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | - APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED ); - value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; - value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); - apic_write(APIC_LVT0, value); - } else { - /* Disable LVT0 */ - apic_write(APIC_LVT0, APIC_LVT_MASKED); - } - - /* For LVT1 make it edge triggered, active high, nmi and enabled */ - value = apic_read(APIC_LVT1); - value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | - APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | - APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); - value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; - value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); - apic_write(APIC_LVT1, value); -} - +/** + * disable_local_APIC - clear and disable the local APIC + */ void disable_local_APIC(void) { unsigned int value; @@ -333,7 +633,7 @@ int __init verify_local_APIC(void) reg1 = GET_APIC_VERSION(reg0); if (reg1 == 0x00 || reg1 == 0xff) return 0; - reg1 = get_maxlvt(); + reg1 = lapic_get_maxlvt(); if (reg1 < 0x02 || reg1 == 0xff) return 0; @@ -355,18 +655,20 @@ int __init verify_local_APIC(void) * compatibility mode, but most boxes are anymore. */ reg0 = apic_read(APIC_LVT0); - apic_printk(APIC_DEBUG,"Getting LVT0: %x\n", reg0); + apic_printk(APIC_DEBUG, "Getting LVT0: %x\n", reg0); reg1 = apic_read(APIC_LVT1); apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1); return 1; } +/** + * sync_Arb_IDs - synchronize APIC bus arbitration IDs + */ void __init sync_Arb_IDs(void) { /* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 */ - unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR)); - if (ver >= 0x14) /* P4 or higher */ + if (modern_apic()) return; /* @@ -418,9 +720,12 @@ void __init init_bsp_APIC(void) apic_write(APIC_LVT1, value); } -void __cpuinit setup_local_APIC (void) +/** + * setup_local_APIC - setup the local APIC + */ +void __cpuinit setup_local_APIC(void) { - unsigned int value, maxlvt; + unsigned int value; int i, j; value = apic_read(APIC_LVR); @@ -516,30 +821,217 @@ void __cpuinit setup_local_APIC (void) else value = APIC_DM_NMI | APIC_LVT_MASKED; apic_write(APIC_LVT1, value); +} - { - unsigned oldvalue; - maxlvt = get_maxlvt(); - oldvalue = apic_read(APIC_ESR); - value = ERROR_APIC_VECTOR; // enables sending errors - apic_write(APIC_LVTERR, value); - /* - * spec says clear errors after enabling vector. 
- */ - if (maxlvt > 3) - apic_write(APIC_ESR, 0); - value = apic_read(APIC_ESR); - if (value != oldvalue) - apic_printk(APIC_VERBOSE, - "ESR value after enabling vector: %08x, after %08x\n", - oldvalue, value); - } +void __cpuinit lapic_setup_esr(void) +{ + unsigned maxlvt = lapic_get_maxlvt(); + + apic_write(APIC_LVTERR, ERROR_APIC_VECTOR); + /* + * spec says clear errors after enabling vector. + */ + if (maxlvt > 3) + apic_write(APIC_ESR, 0); +} +void __cpuinit end_local_APIC_setup(void) +{ + lapic_setup_esr(); nmi_watchdog_default(); setup_apic_nmi_watchdog(NULL); apic_pm_activate(); } +/* + * Detect and enable local APICs on non-SMP boards. + * Original code written by Keir Fraser. + * On AMD64 we trust the BIOS - if it says no APIC it is likely + * not correctly set up (usually the APIC timer won't work etc.) + */ +static int __init detect_init_APIC(void) +{ + if (!cpu_has_apic) { + printk(KERN_INFO "No local APIC present\n"); + return -1; + } + + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; + boot_cpu_id = 0; + return 0; +} + +/** + * init_apic_mappings - initialize APIC mappings + */ +void __init init_apic_mappings(void) +{ + /* + * If no local APIC can be found then set up a fake all + * zeroes page to simulate the local APIC and another + * one for the IO-APIC. + */ + if (!smp_found_config && detect_init_APIC()) { + apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); + apic_phys = __pa(apic_phys); + } else + apic_phys = mp_lapic_addr; + + set_fixmap_nocache(FIX_APIC_BASE, apic_phys); + apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", + APIC_BASE, apic_phys); + + /* Put local APIC into the resource map. */ + lapic_resource.start = apic_phys; + lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1; + insert_resource(&iomem_resource, &lapic_resource); + + /* + * Fetch the APIC ID of the BSP in case we have a + * default configuration (or the MP table is broken). + */ + boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID)); +} + +/* + * This initializes the IO-APIC and APIC hardware if this is + * a UP kernel. + */ +int __init APIC_init_uniprocessor(void) +{ + if (disable_apic) { + printk(KERN_INFO "Apic disabled\n"); + return -1; + } + if (!cpu_has_apic) { + disable_apic = 1; + printk(KERN_INFO "Apic disabled by BIOS\n"); + return -1; + } + + verify_local_APIC(); + + phys_cpu_present_map = physid_mask_of_physid(boot_cpu_id); + apic_write(APIC_ID, SET_APIC_ID(boot_cpu_id)); + + setup_local_APIC(); + + /* + * Now enable IO-APICs, actually call clear_IO_APIC + * We need clear_IO_APIC before enabling vector on BP + */ + if (!skip_ioapic_setup && nr_ioapics) + enable_IO_APIC(); + + end_local_APIC_setup(); + + if (smp_found_config && !skip_ioapic_setup && nr_ioapics) + setup_IO_APIC(); + else + nr_ioapics = 0; + setup_boot_APIC_clock(); + check_nmi_watchdog(); + return 0; +} + +/* + * Local APIC interrupts + */ + +/* + * This interrupt should _never_ happen with our APIC/SMP architecture + */ +asmlinkage void smp_spurious_interrupt(void) +{ + unsigned int v; + exit_idle(); + irq_enter(); + /* + * Check if this really is a spurious interrupt and ACK it + * if it is a vectored one. Just in case... + * Spurious interrupts should not be ACKed. 
+ */ + v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1)); + if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) + ack_APIC_irq(); + + add_pda(irq_spurious_count, 1); + irq_exit(); +} + +/* + * This interrupt should never happen with our APIC/SMP architecture + */ +asmlinkage void smp_error_interrupt(void) +{ + unsigned int v, v1; + + exit_idle(); + irq_enter(); + /* First tickle the hardware, only then report what went on. -- REW */ + v = apic_read(APIC_ESR); + apic_write(APIC_ESR, 0); + v1 = apic_read(APIC_ESR); + ack_APIC_irq(); + atomic_inc(&irq_err_count); + + /* Here is what the APIC error bits mean: + 0: Send CS error + 1: Receive CS error + 2: Send accept error + 3: Receive accept error + 4: Reserved + 5: Send illegal vector + 6: Received illegal vector + 7: Illegal register address + */ + printk(KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n", + smp_processor_id(), v , v1); + irq_exit(); +} + +void disconnect_bsp_APIC(int virt_wire_setup) +{ + /* Go back to Virtual Wire compatibility mode */ + unsigned long value; + + /* For the spurious interrupt use vector F, and enable it */ + value = apic_read(APIC_SPIV); + value &= ~APIC_VECTOR_MASK; + value |= APIC_SPIV_APIC_ENABLED; + value |= 0xf; + apic_write(APIC_SPIV, value); + + if (!virt_wire_setup) { + /* + * For LVT0 make it edge triggered, active high, + * external and enabled + */ + value = apic_read(APIC_LVT0); + value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); + apic_write(APIC_LVT0, value); + } else { + /* Disable LVT0 */ + apic_write(APIC_LVT0, APIC_LVT_MASKED); + } + + /* For LVT1 make it edge triggered, active high, nmi and enabled */ + value = apic_read(APIC_LVT1); + value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); + apic_write(APIC_LVT1, value); +} + +/* + * Power management + */ #ifdef CONFIG_PM static struct { @@ -571,7 +1063,7 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state) if (!apic_pm_state.active) return 0; - maxlvt = get_maxlvt(); + maxlvt = lapic_get_maxlvt(); apic_pm_state.apic_id = apic_read(APIC_ID); apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI); @@ -605,7 +1097,7 @@ static int lapic_resume(struct sys_device *dev) if (!apic_pm_state.active) return 0; - maxlvt = get_maxlvt(); + maxlvt = lapic_get_maxlvt(); local_irq_save(flags); rdmsr(MSR_IA32_APICBASE, l, h); @@ -639,14 +1131,14 @@ static int lapic_resume(struct sys_device *dev) } static struct sysdev_class lapic_sysclass = { - set_kset_name("lapic"), + .name = "lapic", .resume = lapic_resume, .suspend = lapic_suspend, }; static struct sys_device device_lapic = { - .id = 0, - .cls = &lapic_sysclass, + .id = 0, + .cls = &lapic_sysclass, }; static void __cpuinit apic_pm_activate(void) @@ -657,9 +1149,11 @@ static void __cpuinit apic_pm_activate(void) static int __init init_lapic_sysfs(void) { int error; + if (!cpu_has_apic) return 0; /* XXX: remove suspend/resume procs if !apic_pm_state.active? 
*/ + error = sysdev_class_register(&lapic_sysclass); if (!error) error = sysdev_register(&device_lapic); @@ -673,423 +1167,6 @@ static void apic_pm_activate(void) { } #endif /* CONFIG_PM */ -static int __init apic_set_verbosity(char *str) -{ - if (str == NULL) { - skip_ioapic_setup = 0; - ioapic_force = 1; - return 0; - } - if (strcmp("debug", str) == 0) - apic_verbosity = APIC_DEBUG; - else if (strcmp("verbose", str) == 0) - apic_verbosity = APIC_VERBOSE; - else { - printk(KERN_WARNING "APIC Verbosity level %s not recognised" - " use apic=verbose or apic=debug\n", str); - return -EINVAL; - } - - return 0; -} -early_param("apic", apic_set_verbosity); - -/* - * Detect and enable local APICs on non-SMP boards. - * Original code written by Keir Fraser. - * On AMD64 we trust the BIOS - if it says no APIC it is likely - * not correctly set up (usually the APIC timer won't work etc.) - */ - -static int __init detect_init_APIC (void) -{ - if (!cpu_has_apic) { - printk(KERN_INFO "No local APIC present\n"); - return -1; - } - - mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; - boot_cpu_id = 0; - return 0; -} - -#ifdef CONFIG_X86_IO_APIC -static struct resource * __init ioapic_setup_resources(void) -{ -#define IOAPIC_RESOURCE_NAME_SIZE 11 - unsigned long n; - struct resource *res; - char *mem; - int i; - - if (nr_ioapics <= 0) - return NULL; - - n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource); - n *= nr_ioapics; - - mem = alloc_bootmem(n); - res = (void *)mem; - - if (mem != NULL) { - memset(mem, 0, n); - mem += sizeof(struct resource) * nr_ioapics; - - for (i = 0; i < nr_ioapics; i++) { - res[i].name = mem; - res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; - sprintf(mem, "IOAPIC %u", i); - mem += IOAPIC_RESOURCE_NAME_SIZE; - } - } - - ioapic_resources = res; - - return res; -} - -static int __init ioapic_insert_resources(void) -{ - int i; - struct resource *r = ioapic_resources; - - if (!r) { - printk("IO APIC resources could be not be allocated.\n"); - return -1; - } - - for (i = 0; i < nr_ioapics; i++) { - insert_resource(&iomem_resource, r); - r++; - } - - return 0; -} - -/* Insert the IO APIC resources after PCI initialization has occured to handle - * IO APICS that are mapped in on a BAR in PCI space. */ -late_initcall(ioapic_insert_resources); -#endif - -void __init init_apic_mappings(void) -{ - unsigned long apic_phys; - - /* - * If no local APIC can be found then set up a fake all - * zeroes page to simulate the local APIC and another - * one for the IO-APIC. - */ - if (!smp_found_config && detect_init_APIC()) { - apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); - apic_phys = __pa(apic_phys); - } else - apic_phys = mp_lapic_addr; - - set_fixmap_nocache(FIX_APIC_BASE, apic_phys); - apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", - APIC_BASE, apic_phys); - - /* Put local APIC into the resource map. */ - lapic_resource.start = apic_phys; - lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1; - insert_resource(&iomem_resource, &lapic_resource); - - /* - * Fetch the APIC ID of the BSP in case we have a - * default configuration (or the MP table is broken). 
- */ - boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID)); - - { - unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; - int i; - struct resource *ioapic_res; - - ioapic_res = ioapic_setup_resources(); - for (i = 0; i < nr_ioapics; i++) { - if (smp_found_config) { - ioapic_phys = mp_ioapics[i].mpc_apicaddr; - } else { - ioapic_phys = (unsigned long) - alloc_bootmem_pages(PAGE_SIZE); - ioapic_phys = __pa(ioapic_phys); - } - set_fixmap_nocache(idx, ioapic_phys); - apic_printk(APIC_VERBOSE, - "mapped IOAPIC to %016lx (%016lx)\n", - __fix_to_virt(idx), ioapic_phys); - idx++; - - if (ioapic_res != NULL) { - ioapic_res->start = ioapic_phys; - ioapic_res->end = ioapic_phys + (4 * 1024) - 1; - ioapic_res++; - } - } - } -} - -/* - * This function sets up the local APIC timer, with a timeout of - * 'clocks' APIC bus clock. During calibration we actually call - * this function twice on the boot CPU, once with a bogus timeout - * value, second time for real. The other (noncalibrating) CPUs - * call this function only once, with the real, calibrated value. - * - * We do reads before writes even if unnecessary, to get around the - * P5 APIC double write bug. - */ - -static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) -{ - unsigned int lvtt_value, tmp_value; - - lvtt_value = LOCAL_TIMER_VECTOR; - if (!oneshot) - lvtt_value |= APIC_LVT_TIMER_PERIODIC; - if (!irqen) - lvtt_value |= APIC_LVT_MASKED; - - apic_write(APIC_LVTT, lvtt_value); - - /* - * Divide PICLK by 16 - */ - tmp_value = apic_read(APIC_TDCR); - apic_write(APIC_TDCR, (tmp_value - & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) - | APIC_TDR_DIV_16); - - if (!oneshot) - apic_write(APIC_TMICT, clocks); -} - -static void setup_APIC_timer(void) -{ - struct clock_event_device *levt = &__get_cpu_var(lapic_events); - - memcpy(levt, &lapic_clockevent, sizeof(*levt)); - levt->cpumask = cpumask_of_cpu(smp_processor_id()); - - clockevents_register_device(levt); -} - -/* - * In this function we calibrate APIC bus clocks to the external - * timer. Unfortunately we cannot use jiffies and the timer irq - * to calibrate, since some later bootup code depends on getting - * the first irq? Ugh. - * - * We want to do the calibration only once since we - * want to have local timer irqs syncron. CPUs connected - * by the same APIC bus have the very same bus frequency. - * And we want to have irqs off anyways, no accidental - * APIC irq that way. - */ - -#define TICK_COUNT 100000000 - -static void __init calibrate_APIC_clock(void) -{ - unsigned apic, apic_start; - unsigned long tsc, tsc_start; - int result; - - local_irq_disable(); - - /* - * Put whatever arbitrary (but long enough) timeout - * value into the APIC clock, we just want to get the - * counter running for calibration. - * - * No interrupt enable ! 
- */ - __setup_APIC_LVTT(250000000, 0, 0); - - apic_start = apic_read(APIC_TMCCT); -#ifdef CONFIG_X86_PM_TIMER - if (apic_calibrate_pmtmr && pmtmr_ioport) { - pmtimer_wait(5000); /* 5ms wait */ - apic = apic_read(APIC_TMCCT); - result = (apic_start - apic) * 1000L / 5; - } else -#endif - { - rdtscll(tsc_start); - - do { - apic = apic_read(APIC_TMCCT); - rdtscll(tsc); - } while ((tsc - tsc_start) < TICK_COUNT && - (apic_start - apic) < TICK_COUNT); - - result = (apic_start - apic) * 1000L * tsc_khz / - (tsc - tsc_start); - } - - local_irq_enable(); - - printk(KERN_DEBUG "APIC timer calibration result %d\n", result); - - printk(KERN_INFO "Detected %d.%03d MHz APIC timer.\n", - result / 1000 / 1000, result / 1000 % 1000); - - /* Calculate the scaled math multiplication factor */ - lapic_clockevent.mult = div_sc(result, NSEC_PER_SEC, 32); - lapic_clockevent.max_delta_ns = - clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); - lapic_clockevent.min_delta_ns = - clockevent_delta2ns(0xF, &lapic_clockevent); - - calibration_result = result / HZ; -} - -void __init setup_boot_APIC_clock (void) -{ - /* - * The local apic timer can be disabled via the kernel commandline. - * Register the lapic timer as a dummy clock event source on SMP - * systems, so the broadcast mechanism is used. On UP systems simply - * ignore it. - */ - if (disable_apic_timer) { - printk(KERN_INFO "Disabling APIC timer\n"); - /* No broadcast on UP ! */ - if (num_possible_cpus() > 1) - setup_APIC_timer(); - return; - } - - printk(KERN_INFO "Using local APIC timer interrupts.\n"); - calibrate_APIC_clock(); - - /* - * If nmi_watchdog is set to IO_APIC, we need the - * PIT/HPET going. Otherwise register lapic as a dummy - * device. - */ - if (nmi_watchdog != NMI_IO_APIC) - lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; - else - printk(KERN_WARNING "APIC timer registered as dummy," - " due to nmi_watchdog=1!\n"); - - setup_APIC_timer(); -} - -/* - * AMD C1E enabled CPUs have a real nasty problem: Some BIOSes set the - * C1E flag only in the secondary CPU, so when we detect the wreckage - * we already have enabled the boot CPU local apic timer. Check, if - * disable_apic_timer is set and the DUMMY flag is cleared. If yes, - * set the DUMMY flag again and force the broadcast mode in the - * clockevents layer. - */ -void __cpuinit check_boot_apic_timer_broadcast(void) -{ - if (!disable_apic_timer || - (lapic_clockevent.features & CLOCK_EVT_FEAT_DUMMY)) - return; - - printk(KERN_INFO "AMD C1E detected late. Force timer broadcast.\n"); - lapic_clockevent.features |= CLOCK_EVT_FEAT_DUMMY; - - local_irq_enable(); - clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE, &boot_cpu_id); - local_irq_disable(); -} - -void __cpuinit setup_secondary_APIC_clock(void) -{ - check_boot_apic_timer_broadcast(); - setup_APIC_timer(); -} - -int setup_profiling_timer(unsigned int multiplier) -{ - return -EINVAL; -} - -void setup_APIC_extended_lvt(unsigned char lvt_off, unsigned char vector, - unsigned char msg_type, unsigned char mask) -{ - unsigned long reg = (lvt_off << 4) + K8_APIC_EXT_LVT_BASE; - unsigned int v = (mask << 16) | (msg_type << 8) | vector; - apic_write(reg, v); -} - -/* - * Local timer interrupt handler. It does both profiling and - * process statistics/rescheduling. - * - * We do profiling in every local tick, statistics/rescheduling - * happen only every 'profiling multiplier' ticks. The default - * multiplier is 1 and it can be changed by writing the new multiplier - * value into /proc/profile. 
- */ - -void smp_local_timer_interrupt(void) -{ - int cpu = smp_processor_id(); - struct clock_event_device *evt = &per_cpu(lapic_events, cpu); - - /* - * Normally we should not be here till LAPIC has been initialized but - * in some cases like kdump, its possible that there is a pending LAPIC - * timer interrupt from previous kernel's context and is delivered in - * new kernel the moment interrupts are enabled. - * - * Interrupts are enabled early and LAPIC is setup much later, hence - * its possible that when we get here evt->event_handler is NULL. - * Check for event_handler being NULL and discard the interrupt as - * spurious. - */ - if (!evt->event_handler) { - printk(KERN_WARNING - "Spurious LAPIC timer interrupt on cpu %d\n", cpu); - /* Switch it off */ - lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt); - return; - } - - /* - * the NMI deadlock-detector uses this. - */ - add_pda(apic_timer_irqs, 1); - - evt->event_handler(evt); -} - -/* - * Local APIC timer interrupt. This is the most natural way for doing - * local interrupts, but local timer interrupts can be emulated by - * broadcast interrupts too. [in case the hw doesn't support APIC timers] - * - * [ if a single-CPU system runs an SMP kernel then we call the local - * interrupt as well. Thus we cannot inline the local irq ... ] - */ -void smp_apic_timer_interrupt(struct pt_regs *regs) -{ - struct pt_regs *old_regs = set_irq_regs(regs); - - /* - * NOTE! We'd better ACK the irq immediately, - * because timer handling can be slow. - */ - ack_APIC_irq(); - /* - * update_process_times() expects us to have done irq_enter(). - * Besides, if we don't timer interrupts ignore the global - * interrupt lock, which is the WrongThing (tm) to do. - */ - exit_idle(); - irq_enter(); - smp_local_timer_interrupt(); - irq_exit(); - set_irq_regs(old_regs); -} - /* * apic_is_clustered_box() -- Check if we can expect good TSC * @@ -1103,21 +1180,34 @@ __cpuinit int apic_is_clustered_box(void) { int i, clusters, zeros; unsigned id; + u16 *bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr; DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS); bitmap_zero(clustermap, NUM_APIC_CLUSTERS); for (i = 0; i < NR_CPUS; i++) { - id = bios_cpu_apicid[i]; + /* are we being called early in kernel startup? */ + if (bios_cpu_apicid) { + id = bios_cpu_apicid[i]; + } + else if (i < nr_cpu_ids) { + if (cpu_present(i)) + id = per_cpu(x86_bios_cpu_apicid, i); + else + continue; + } + else + break; + if (id != BAD_APICID) __set_bit(APIC_CLUSTERID(id), clustermap); } /* Problem: Partially populated chassis may not have CPUs in some of * the APIC clusters they have been allocated. Only present CPUs have - * bios_cpu_apicid entries, thus causing zeroes in the bitmap. Since - * clusters are allocated sequentially, count zeros only if they are - * bounded by ones. + * x86_bios_cpu_apicid entries, thus causing zeroes in the bitmap. + * Since clusters are allocated sequentially, count zeros only if + * they are bounded by ones. */ clusters = 0; zeros = 0; @@ -1138,96 +1228,33 @@ __cpuinit int apic_is_clustered_box(void) } /* - * This interrupt should _never_ happen with our APIC/SMP architecture - */ -asmlinkage void smp_spurious_interrupt(void) -{ - unsigned int v; - exit_idle(); - irq_enter(); - /* - * Check if this really is a spurious interrupt and ACK it - * if it is a vectored one. Just in case... - * Spurious interrupts should not be ACKed. 
- */ - v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1)); - if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) - ack_APIC_irq(); - - add_pda(irq_spurious_count, 1); - irq_exit(); -} - -/* - * This interrupt should never happen with our APIC/SMP architecture + * APIC command line parameters */ - -asmlinkage void smp_error_interrupt(void) -{ - unsigned int v, v1; - - exit_idle(); - irq_enter(); - /* First tickle the hardware, only then report what went on. -- REW */ - v = apic_read(APIC_ESR); - apic_write(APIC_ESR, 0); - v1 = apic_read(APIC_ESR); - ack_APIC_irq(); - atomic_inc(&irq_err_count); - - /* Here is what the APIC error bits mean: - 0: Send CS error - 1: Receive CS error - 2: Send accept error - 3: Receive accept error - 4: Reserved - 5: Send illegal vector - 6: Received illegal vector - 7: Illegal register address - */ - printk (KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n", - smp_processor_id(), v , v1); - irq_exit(); -} - -int disable_apic; - -/* - * This initializes the IO-APIC and APIC hardware if this is - * a UP kernel. - */ -int __init APIC_init_uniprocessor (void) +static int __init apic_set_verbosity(char *str) { - if (disable_apic) { - printk(KERN_INFO "Apic disabled\n"); - return -1; + if (str == NULL) { + skip_ioapic_setup = 0; + ioapic_force = 1; + return 0; } - if (!cpu_has_apic) { - disable_apic = 1; - printk(KERN_INFO "Apic disabled by BIOS\n"); - return -1; + if (strcmp("debug", str) == 0) + apic_verbosity = APIC_DEBUG; + else if (strcmp("verbose", str) == 0) + apic_verbosity = APIC_VERBOSE; + else { + printk(KERN_WARNING "APIC Verbosity level %s not recognised" + " use apic=verbose or apic=debug\n", str); + return -EINVAL; } - verify_local_APIC(); - - phys_cpu_present_map = physid_mask_of_physid(boot_cpu_id); - apic_write(APIC_ID, SET_APIC_ID(boot_cpu_id)); - - setup_local_APIC(); - - if (smp_found_config && !skip_ioapic_setup && nr_ioapics) - setup_IO_APIC(); - else - nr_ioapics = 0; - setup_boot_APIC_clock(); - check_nmi_watchdog(); return 0; } +early_param("apic", apic_set_verbosity); static __init int setup_disableapic(char *str) { disable_apic = 1; - clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); + clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); return 0; } early_param("disableapic", setup_disableapic); diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index af045ca0f653..d4438ef296d8 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -227,6 +227,7 @@ #include <linux/dmi.h> #include <linux/suspend.h> #include <linux/kthread.h> +#include <linux/jiffies.h> #include <asm/system.h> #include <asm/uaccess.h> @@ -235,8 +236,6 @@ #include <asm/paravirt.h> #include <asm/reboot.h> -#include "io_ports.h" - #if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT) extern int (*console_blank_hook)(int); #endif @@ -324,7 +323,7 @@ extern int (*console_blank_hook)(int); /* * Ignore suspend events for this amount of time after a resume */ -#define DEFAULT_BOUNCE_INTERVAL (3 * HZ) +#define DEFAULT_BOUNCE_INTERVAL (3 * HZ) /* * Maximum number of events stored @@ -336,7 +335,7 @@ extern int (*console_blank_hook)(int); */ struct apm_user { int magic; - struct apm_user * next; + struct apm_user *next; unsigned int suser: 1; unsigned int writer: 1; unsigned int reader: 1; @@ -372,44 +371,44 @@ struct apm_user { static struct { unsigned long offset; unsigned short segment; -} apm_bios_entry; -static int clock_slowed; -static int idle_threshold __read_mostly = DEFAULT_IDLE_THRESHOLD; -static int idle_period __read_mostly = 
DEFAULT_IDLE_PERIOD; -static int set_pm_idle; -static int suspends_pending; -static int standbys_pending; -static int ignore_sys_suspend; -static int ignore_normal_resume; -static int bounce_interval __read_mostly = DEFAULT_BOUNCE_INTERVAL; - -static int debug __read_mostly; -static int smp __read_mostly; -static int apm_disabled = -1; +} apm_bios_entry; +static int clock_slowed; +static int idle_threshold __read_mostly = DEFAULT_IDLE_THRESHOLD; +static int idle_period __read_mostly = DEFAULT_IDLE_PERIOD; +static int set_pm_idle; +static int suspends_pending; +static int standbys_pending; +static int ignore_sys_suspend; +static int ignore_normal_resume; +static int bounce_interval __read_mostly = DEFAULT_BOUNCE_INTERVAL; + +static int debug __read_mostly; +static int smp __read_mostly; +static int apm_disabled = -1; #ifdef CONFIG_SMP -static int power_off; +static int power_off; #else -static int power_off = 1; +static int power_off = 1; #endif #ifdef CONFIG_APM_REAL_MODE_POWER_OFF -static int realmode_power_off = 1; +static int realmode_power_off = 1; #else -static int realmode_power_off; +static int realmode_power_off; #endif #ifdef CONFIG_APM_ALLOW_INTS -static int allow_ints = 1; +static int allow_ints = 1; #else -static int allow_ints; +static int allow_ints; #endif -static int broken_psr; +static int broken_psr; static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue); static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue); -static struct apm_user * user_list; +static struct apm_user *user_list; static DEFINE_SPINLOCK(user_list_lock); -static const struct desc_struct bad_bios_desc = { 0, 0x00409200 }; +static const struct desc_struct bad_bios_desc = { { { 0, 0x00409200 } } }; -static const char driver_version[] = "1.16ac"; /* no spaces */ +static const char driver_version[] = "1.16ac"; /* no spaces */ static struct task_struct *kapmd_task; @@ -417,7 +416,7 @@ static struct task_struct *kapmd_task; * APM event names taken from the APM 1.2 specification. These are * the message codes that the BIOS uses to tell us about events */ -static const char * const apm_event_name[] = { +static const char * const apm_event_name[] = { "system standby", "system suspend", "normal resume", @@ -435,14 +434,14 @@ static const char * const apm_event_name[] = { typedef struct lookup_t { int key; - char * msg; + char *msg; } lookup_t; /* * The BIOS returns a set of standard error codes in AX when the * carry flag is set. */ - + static const lookup_t error_table[] = { /* N/A { APM_SUCCESS, "Operation succeeded" }, */ { APM_DISABLED, "Power management disabled" }, @@ -472,24 +471,25 @@ static const lookup_t error_table[] = { * Write a meaningful log entry to the kernel log in the event of * an APM error. 
*/ - + static void apm_error(char *str, int err) { - int i; + int i; for (i = 0; i < ERROR_COUNT; i++) - if (error_table[i].key == err) break; + if (error_table[i].key == err) + break; if (i < ERROR_COUNT) printk(KERN_NOTICE "apm: %s: %s\n", str, error_table[i].msg); else printk(KERN_NOTICE "apm: %s: unknown error code %#2.2x\n", - str, err); + str, err); } /* * Lock APM functionality to physical CPU 0 */ - + #ifdef CONFIG_SMP static cpumask_t apm_save_cpus(void) @@ -511,7 +511,7 @@ static inline void apm_restore_cpus(cpumask_t mask) /* * No CPU lockdown needed on a uniprocessor */ - + #define apm_save_cpus() (current->cpus_allowed) #define apm_restore_cpus(x) (void)(x) @@ -590,7 +590,7 @@ static inline void apm_irq_restore(unsigned long flags) * code is returned in AH (bits 8-15 of eax) and this function * returns non-zero. */ - + static u8 apm_bios_call(u32 func, u32 ebx_in, u32 ecx_in, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, u32 *esi) { @@ -602,7 +602,7 @@ static u8 apm_bios_call(u32 func, u32 ebx_in, u32 ecx_in, struct desc_struct *gdt; cpus = apm_save_cpus(); - + cpu = get_cpu(); gdt = get_cpu_gdt_table(cpu); save_desc_40 = gdt[0x40 / 8]; @@ -616,7 +616,7 @@ static u8 apm_bios_call(u32 func, u32 ebx_in, u32 ecx_in, gdt[0x40 / 8] = save_desc_40; put_cpu(); apm_restore_cpus(cpus); - + return *eax & 0xff; } @@ -645,7 +645,7 @@ static u8 apm_bios_call_simple(u32 func, u32 ebx_in, u32 ecx_in, u32 *eax) struct desc_struct *gdt; cpus = apm_save_cpus(); - + cpu = get_cpu(); gdt = get_cpu_gdt_table(cpu); save_desc_40 = gdt[0x40 / 8]; @@ -680,7 +680,7 @@ static u8 apm_bios_call_simple(u32 func, u32 ebx_in, u32 ecx_in, u32 *eax) static int apm_driver_version(u_short *val) { - u32 eax; + u32 eax; if (apm_bios_call_simple(APM_FUNC_VERSION, 0, *val, &eax)) return (eax >> 8) & 0xff; @@ -704,16 +704,16 @@ static int apm_driver_version(u_short *val) * that APM 1.2 is in use. If no messges are pending the value 0x80 * is returned (No power management events pending). */ - + static int apm_get_event(apm_event_t *event, apm_eventinfo_t *info) { - u32 eax; - u32 ebx; - u32 ecx; - u32 dummy; + u32 eax; + u32 ebx; + u32 ecx; + u32 dummy; if (apm_bios_call(APM_FUNC_GET_EVENT, 0, 0, &eax, &ebx, &ecx, - &dummy, &dummy)) + &dummy, &dummy)) return (eax >> 8) & 0xff; *event = ebx; if (apm_info.connection_version < 0x0102) @@ -736,10 +736,10 @@ static int apm_get_event(apm_event_t *event, apm_eventinfo_t *info) * The state holds the state to transition to, which may in fact * be an acceptance of a BIOS requested state change. */ - + static int set_power_state(u_short what, u_short state) { - u32 eax; + u32 eax; if (apm_bios_call_simple(APM_FUNC_SET_STATE, what, state, &eax)) return (eax >> 8) & 0xff; @@ -752,7 +752,7 @@ static int set_power_state(u_short what, u_short state) * * Transition the entire system into a new APM power state. */ - + static int set_system_power_state(u_short state) { return set_power_state(APM_DEVICE_ALL, state); @@ -766,13 +766,13 @@ static int set_system_power_state(u_short state) * to handle the idle request. On a success the function returns 1 * if the BIOS did clock slowing or 0 otherwise. */ - + static int apm_do_idle(void) { - u32 eax; - u8 ret = 0; - int idled = 0; - int polling; + u32 eax; + u8 ret = 0; + int idled = 0; + int polling; polling = !!(current_thread_info()->status & TS_POLLING); if (polling) { @@ -799,10 +799,9 @@ static int apm_do_idle(void) /* This always fails on some SMP boards running UP kernels. * Only report the failure the first 5 times. 
*/ - if (++t < 5) - { + if (++t < 5) { printk(KERN_DEBUG "apm_do_idle failed (%d)\n", - (eax >> 8) & 0xff); + (eax >> 8) & 0xff); t = jiffies; } return -1; @@ -814,15 +813,15 @@ static int apm_do_idle(void) /** * apm_do_busy - inform the BIOS the CPU is busy * - * Request that the BIOS brings the CPU back to full performance. + * Request that the BIOS brings the CPU back to full performance. */ - + static void apm_do_busy(void) { - u32 dummy; + u32 dummy; if (clock_slowed || ALWAYS_CALL_BUSY) { - (void) apm_bios_call_simple(APM_FUNC_BUSY, 0, 0, &dummy); + (void)apm_bios_call_simple(APM_FUNC_BUSY, 0, 0, &dummy); clock_slowed = 0; } } @@ -833,15 +832,15 @@ static void apm_do_busy(void) * power management - we probably want * to conserve power. */ -#define IDLE_CALC_LIMIT (HZ * 100) -#define IDLE_LEAKY_MAX 16 +#define IDLE_CALC_LIMIT (HZ * 100) +#define IDLE_LEAKY_MAX 16 static void (*original_pm_idle)(void) __read_mostly; /** * apm_cpu_idle - cpu idling for APM capable Linux * - * This is the idling function the kernel executes when APM is available. It + * This is the idling function the kernel executes when APM is available. It * tries to do BIOS powermanagement based on the average system idle time. * Furthermore it calls the system default idle routine. */ @@ -882,7 +881,8 @@ recalc: t = jiffies; switch (apm_do_idle()) { - case 0: apm_idle_done = 1; + case 0: + apm_idle_done = 1; if (t != jiffies) { if (bucket) { bucket = IDLE_LEAKY_MAX; @@ -893,7 +893,8 @@ recalc: continue; } break; - case 1: apm_idle_done = 1; + case 1: + apm_idle_done = 1; break; default: /* BIOS refused */ break; @@ -921,10 +922,10 @@ recalc: * the SMP call on CPU0 as some systems will only honour this call * on their first cpu. */ - + static void apm_power_off(void) { - unsigned char po_bios_call[] = { + unsigned char po_bios_call[] = { 0xb8, 0x00, 0x10, /* movw $0x1000,ax */ 0x8e, 0xd0, /* movw ax,ss */ 0xbc, 0x00, 0xf0, /* movw $0xf000,sp */ @@ -935,13 +936,12 @@ static void apm_power_off(void) }; /* Some bioses don't like being called from CPU != 0 */ - if (apm_info.realmode_power_off) - { + if (apm_info.realmode_power_off) { (void)apm_save_cpus(); machine_real_restart(po_bios_call, sizeof(po_bios_call)); + } else { + (void)set_system_power_state(APM_STATE_OFF); } - else - (void) set_system_power_state(APM_STATE_OFF); } #ifdef CONFIG_APM_DO_ENABLE @@ -950,17 +950,17 @@ static void apm_power_off(void) * apm_enable_power_management - enable BIOS APM power management * @enable: enable yes/no * - * Enable or disable the APM BIOS power services. + * Enable or disable the APM BIOS power services. */ - + static int apm_enable_power_management(int enable) { - u32 eax; + u32 eax; if ((enable == 0) && (apm_info.bios.flags & APM_BIOS_DISENGAGED)) return APM_NOT_ENGAGED; if (apm_bios_call_simple(APM_FUNC_ENABLE_PM, APM_DEVICE_BALL, - enable, &eax)) + enable, &eax)) return (eax >> 8) & 0xff; if (enable) apm_info.bios.flags &= ~APM_BIOS_DISABLED; @@ -983,19 +983,19 @@ static int apm_enable_power_management(int enable) * if reported is a lifetime in secodnds/minutes at current powwer * consumption. 
*/ - + static int apm_get_power_status(u_short *status, u_short *bat, u_short *life) { - u32 eax; - u32 ebx; - u32 ecx; - u32 edx; - u32 dummy; + u32 eax; + u32 ebx; + u32 ecx; + u32 edx; + u32 dummy; if (apm_info.get_power_status_broken) return APM_32_UNSUPPORTED; if (apm_bios_call(APM_FUNC_GET_STATUS, APM_DEVICE_ALL, 0, - &eax, &ebx, &ecx, &edx, &dummy)) + &eax, &ebx, &ecx, &edx, &dummy)) return (eax >> 8) & 0xff; *status = ebx; *bat = ecx; @@ -1011,11 +1011,11 @@ static int apm_get_power_status(u_short *status, u_short *bat, u_short *life) static int apm_get_battery_status(u_short which, u_short *status, u_short *bat, u_short *life, u_short *nbat) { - u32 eax; - u32 ebx; - u32 ecx; - u32 edx; - u32 esi; + u32 eax; + u32 ebx; + u32 ecx; + u32 edx; + u32 esi; if (apm_info.connection_version < 0x0102) { /* pretend we only have one battery. */ @@ -1026,7 +1026,7 @@ static int apm_get_battery_status(u_short which, u_short *status, } if (apm_bios_call(APM_FUNC_GET_STATUS, (0x8000 | (which)), 0, &eax, - &ebx, &ecx, &edx, &esi)) + &ebx, &ecx, &edx, &esi)) return (eax >> 8) & 0xff; *status = ebx; *bat = ecx; @@ -1044,10 +1044,10 @@ static int apm_get_battery_status(u_short which, u_short *status, * Activate or deactive power management on either a specific device * or the entire system (%APM_DEVICE_ALL). */ - + static int apm_engage_power_management(u_short device, int enable) { - u32 eax; + u32 eax; if ((enable == 0) && (device == APM_DEVICE_ALL) && (apm_info.bios.flags & APM_BIOS_DISABLED)) @@ -1074,7 +1074,7 @@ static int apm_engage_power_management(u_short device, int enable) * all video devices. Typically the BIOS will do laptop backlight and * monitor powerdown for us. */ - + static int apm_console_blank(int blank) { int error = APM_NOT_ENGAGED; /* silence gcc */ @@ -1126,7 +1126,7 @@ static apm_event_t get_queued_event(struct apm_user *as) static void queue_event(apm_event_t event, struct apm_user *sender) { - struct apm_user * as; + struct apm_user *as; spin_lock(&user_list_lock); if (user_list == NULL) @@ -1174,11 +1174,11 @@ static void reinit_timer(void) spin_lock_irqsave(&i8253_lock, flags); /* set the clock to HZ */ - outb_p(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */ + outb_pit(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */ udelay(10); - outb_p(LATCH & 0xff, PIT_CH0); /* LSB */ + outb_pit(LATCH & 0xff, PIT_CH0); /* LSB */ udelay(10); - outb(LATCH >> 8, PIT_CH0); /* MSB */ + outb_pit(LATCH >> 8, PIT_CH0); /* MSB */ udelay(10); spin_unlock_irqrestore(&i8253_lock, flags); #endif @@ -1186,7 +1186,7 @@ static void reinit_timer(void) static int suspend(int vetoable) { - int err; + int err; struct apm_user *as; if (pm_send_all(PM_SUSPEND, (void *)3)) { @@ -1239,7 +1239,7 @@ static int suspend(int vetoable) static void standby(void) { - int err; + int err; local_irq_disable(); device_power_down(PMSG_SUSPEND); @@ -1256,8 +1256,8 @@ static void standby(void) static apm_event_t get_event(void) { - int error; - apm_event_t event = APM_NO_EVENTS; /* silence gcc */ + int error; + apm_event_t event = APM_NO_EVENTS; /* silence gcc */ apm_eventinfo_t info; static int notified; @@ -1275,9 +1275,9 @@ static apm_event_t get_event(void) static void check_events(void) { - apm_event_t event; - static unsigned long last_resume; - static int ignore_bounce; + apm_event_t event; + static unsigned long last_resume; + static int ignore_bounce; while ((event = get_event()) != 0) { if (debug) { @@ -1289,7 +1289,7 @@ static void check_events(void) "event 0x%02x\n", event); } if (ignore_bounce - && 
((jiffies - last_resume) > bounce_interval)) + && (time_after(jiffies, last_resume + bounce_interval))) ignore_bounce = 0; switch (event) { @@ -1357,7 +1357,7 @@ static void check_events(void) /* * We are not allowed to reject a critical suspend. */ - (void) suspend(0); + (void)suspend(0); break; } } @@ -1365,12 +1365,12 @@ static void check_events(void) static void apm_event_handler(void) { - static int pending_count = 4; - int err; + static int pending_count = 4; + int err; if ((standbys_pending > 0) || (suspends_pending > 0)) { if ((apm_info.connection_version > 0x100) && - (pending_count-- <= 0)) { + (pending_count-- <= 0)) { pending_count = 4; if (debug) printk(KERN_DEBUG "apm: setting state busy\n"); @@ -1418,9 +1418,9 @@ static int check_apm_user(struct apm_user *as, const char *func) static ssize_t do_read(struct file *fp, char __user *buf, size_t count, loff_t *ppos) { - struct apm_user * as; - int i; - apm_event_t event; + struct apm_user *as; + int i; + apm_event_t event; as = fp->private_data; if (check_apm_user(as, "read")) @@ -1459,9 +1459,9 @@ static ssize_t do_read(struct file *fp, char __user *buf, size_t count, loff_t * return 0; } -static unsigned int do_poll(struct file *fp, poll_table * wait) +static unsigned int do_poll(struct file *fp, poll_table *wait) { - struct apm_user * as; + struct apm_user *as; as = fp->private_data; if (check_apm_user(as, "poll")) @@ -1472,10 +1472,10 @@ static unsigned int do_poll(struct file *fp, poll_table * wait) return 0; } -static int do_ioctl(struct inode * inode, struct file *filp, +static int do_ioctl(struct inode *inode, struct file *filp, u_int cmd, u_long arg) { - struct apm_user * as; + struct apm_user *as; as = filp->private_data; if (check_apm_user(as, "ioctl")) @@ -1515,9 +1515,9 @@ static int do_ioctl(struct inode * inode, struct file *filp, return 0; } -static int do_release(struct inode * inode, struct file * filp) +static int do_release(struct inode *inode, struct file *filp) { - struct apm_user * as; + struct apm_user *as; as = filp->private_data; if (check_apm_user(as, "release")) @@ -1533,11 +1533,11 @@ static int do_release(struct inode * inode, struct file * filp) if (suspends_pending <= 0) (void) suspend(1); } - spin_lock(&user_list_lock); + spin_lock(&user_list_lock); if (user_list == as) user_list = as->next; else { - struct apm_user * as1; + struct apm_user *as1; for (as1 = user_list; (as1 != NULL) && (as1->next != as); @@ -1553,9 +1553,9 @@ static int do_release(struct inode * inode, struct file * filp) return 0; } -static int do_open(struct inode * inode, struct file * filp) +static int do_open(struct inode *inode, struct file *filp) { - struct apm_user * as; + struct apm_user *as; as = kmalloc(sizeof(*as), GFP_KERNEL); if (as == NULL) { @@ -1569,7 +1569,7 @@ static int do_open(struct inode * inode, struct file * filp) as->suspends_read = as->standbys_read = 0; /* * XXX - this is a tiny bit broken, when we consider BSD - * process accounting. If the device is opened by root, we + * process accounting. If the device is opened by root, we * instantly flag that we used superuser privs. 
Who knows, * we might close the device immediately without doing a * privileged operation -- cevans @@ -1652,16 +1652,16 @@ static int proc_apm_show(struct seq_file *m, void *v) 8) min = minutes; sec = seconds */ seq_printf(m, "%s %d.%d 0x%02x 0x%02x 0x%02x 0x%02x %d%% %d %s\n", - driver_version, - (apm_info.bios.version >> 8) & 0xff, - apm_info.bios.version & 0xff, - apm_info.bios.flags, - ac_line_status, - battery_status, - battery_flag, - percentage, - time_units, - units); + driver_version, + (apm_info.bios.version >> 8) & 0xff, + apm_info.bios.version & 0xff, + apm_info.bios.flags, + ac_line_status, + battery_status, + battery_flag, + percentage, + time_units, + units); return 0; } @@ -1684,8 +1684,8 @@ static int apm(void *unused) unsigned short cx; unsigned short dx; int error; - char * power_stat; - char * bat_stat; + char *power_stat; + char *bat_stat; #ifdef CONFIG_SMP /* 2002/08/01 - WT @@ -1744,23 +1744,41 @@ static int apm(void *unused) } } - if (debug && (num_online_cpus() == 1 || smp )) { + if (debug && (num_online_cpus() == 1 || smp)) { error = apm_get_power_status(&bx, &cx, &dx); if (error) printk(KERN_INFO "apm: power status not available\n"); else { switch ((bx >> 8) & 0xff) { - case 0: power_stat = "off line"; break; - case 1: power_stat = "on line"; break; - case 2: power_stat = "on backup power"; break; - default: power_stat = "unknown"; break; + case 0: + power_stat = "off line"; + break; + case 1: + power_stat = "on line"; + break; + case 2: + power_stat = "on backup power"; + break; + default: + power_stat = "unknown"; + break; } switch (bx & 0xff) { - case 0: bat_stat = "high"; break; - case 1: bat_stat = "low"; break; - case 2: bat_stat = "critical"; break; - case 3: bat_stat = "charging"; break; - default: bat_stat = "unknown"; break; + case 0: + bat_stat = "high"; + break; + case 1: + bat_stat = "low"; + break; + case 2: + bat_stat = "critical"; + break; + case 3: + bat_stat = "charging"; + break; + default: + bat_stat = "unknown"; + break; } printk(KERN_INFO "apm: AC %s, battery status %s, battery life ", @@ -1777,8 +1795,8 @@ static int apm(void *unused) printk("unknown\n"); else printk("%d %s\n", dx & 0x7fff, - (dx & 0x8000) ? - "minutes" : "seconds"); + (dx & 0x8000) ? + "minutes" : "seconds"); } } } @@ -1803,7 +1821,7 @@ static int apm(void *unused) #ifndef MODULE static int __init apm_setup(char *str) { - int invert; + int invert; while ((str != NULL) && (*str != '\0')) { if (strncmp(str, "off", 3) == 0) @@ -1828,14 +1846,13 @@ static int __init apm_setup(char *str) if ((strncmp(str, "power-off", 9) == 0) || (strncmp(str, "power_off", 9) == 0)) power_off = !invert; - if (strncmp(str, "smp", 3) == 0) - { + if (strncmp(str, "smp", 3) == 0) { smp = !invert; idle_threshold = 100; } if ((strncmp(str, "allow-ints", 10) == 0) || (strncmp(str, "allow_ints", 10) == 0)) - apm_info.allow_ints = !invert; + apm_info.allow_ints = !invert; if ((strncmp(str, "broken-psr", 10) == 0) || (strncmp(str, "broken_psr", 10) == 0)) apm_info.get_power_status_broken = !invert; @@ -1881,7 +1898,8 @@ static int __init print_if_true(const struct dmi_system_id *d) */ static int __init broken_ps2_resume(const struct dmi_system_id *d) { - printk(KERN_INFO "%s machine detected. Mousepad Resume Bug workaround hopefully not needed.\n", d->ident); + printk(KERN_INFO "%s machine detected. 
Mousepad Resume Bug " + "workaround hopefully not needed.\n", d->ident); return 0; } @@ -1890,7 +1908,8 @@ static int __init set_realmode_power_off(const struct dmi_system_id *d) { if (apm_info.realmode_power_off == 0) { apm_info.realmode_power_off = 1; - printk(KERN_INFO "%s bios detected. Using realmode poweroff only.\n", d->ident); + printk(KERN_INFO "%s bios detected. " + "Using realmode poweroff only.\n", d->ident); } return 0; } @@ -1900,7 +1919,8 @@ static int __init set_apm_ints(const struct dmi_system_id *d) { if (apm_info.allow_ints == 0) { apm_info.allow_ints = 1; - printk(KERN_INFO "%s machine detected. Enabling interrupts during APM calls.\n", d->ident); + printk(KERN_INFO "%s machine detected. " + "Enabling interrupts during APM calls.\n", d->ident); } return 0; } @@ -1910,7 +1930,8 @@ static int __init apm_is_horked(const struct dmi_system_id *d) { if (apm_info.disabled == 0) { apm_info.disabled = 1; - printk(KERN_INFO "%s machine detected. Disabling APM.\n", d->ident); + printk(KERN_INFO "%s machine detected. " + "Disabling APM.\n", d->ident); } return 0; } @@ -1919,7 +1940,8 @@ static int __init apm_is_horked_d850md(const struct dmi_system_id *d) { if (apm_info.disabled == 0) { apm_info.disabled = 1; - printk(KERN_INFO "%s machine detected. Disabling APM.\n", d->ident); + printk(KERN_INFO "%s machine detected. " + "Disabling APM.\n", d->ident); printk(KERN_INFO "This bug is fixed in bios P15 which is available for \n"); printk(KERN_INFO "download from support.intel.com \n"); } @@ -1931,7 +1953,8 @@ static int __init apm_likes_to_melt(const struct dmi_system_id *d) { if (apm_info.forbid_idle == 0) { apm_info.forbid_idle = 1; - printk(KERN_INFO "%s machine detected. Disabling APM idle calls.\n", d->ident); + printk(KERN_INFO "%s machine detected. 
" + "Disabling APM idle calls.\n", d->ident); } return 0; } @@ -1954,7 +1977,8 @@ static int __init apm_likes_to_melt(const struct dmi_system_id *d) static int __init broken_apm_power(const struct dmi_system_id *d) { apm_info.get_power_status_broken = 1; - printk(KERN_WARNING "BIOS strings suggest APM bugs, disabling power status reporting.\n"); + printk(KERN_WARNING "BIOS strings suggest APM bugs, " + "disabling power status reporting.\n"); return 0; } @@ -1965,7 +1989,8 @@ static int __init broken_apm_power(const struct dmi_system_id *d) static int __init swab_apm_power_in_minutes(const struct dmi_system_id *d) { apm_info.get_power_status_swabinminutes = 1; - printk(KERN_WARNING "BIOS strings suggest APM reports battery life in minutes and wrong byte order.\n"); + printk(KERN_WARNING "BIOS strings suggest APM reports battery life " + "in minutes and wrong byte order.\n"); return 0; } @@ -1990,8 +2015,8 @@ static struct dmi_system_id __initdata apm_dmi_table[] = { apm_is_horked, "Dell Inspiron 2500", { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 2500"), - DMI_MATCH(DMI_BIOS_VENDOR,"Phoenix Technologies LTD"), - DMI_MATCH(DMI_BIOS_VERSION,"A11"), }, + DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "A11"), }, }, { /* Allow interrupts during suspend on Dell Inspiron laptops*/ set_apm_ints, "Dell Inspiron", { @@ -2014,15 +2039,15 @@ static struct dmi_system_id __initdata apm_dmi_table[] = { apm_is_horked, "Dell Dimension 4100", { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), DMI_MATCH(DMI_PRODUCT_NAME, "XPS-Z"), - DMI_MATCH(DMI_BIOS_VENDOR,"Intel Corp."), - DMI_MATCH(DMI_BIOS_VERSION,"A11"), }, + DMI_MATCH(DMI_BIOS_VENDOR, "Intel Corp."), + DMI_MATCH(DMI_BIOS_VERSION, "A11"), }, }, { /* Allow interrupts during suspend on Compaq Laptops*/ set_apm_ints, "Compaq 12XL125", { DMI_MATCH(DMI_SYS_VENDOR, "Compaq"), DMI_MATCH(DMI_PRODUCT_NAME, "Compaq PC"), DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), - DMI_MATCH(DMI_BIOS_VERSION,"4.06"), }, + DMI_MATCH(DMI_BIOS_VERSION, "4.06"), }, }, { /* Allow interrupts during APM or the clock goes slow */ set_apm_ints, "ASUSTeK", @@ -2064,15 +2089,15 @@ static struct dmi_system_id __initdata apm_dmi_table[] = { apm_is_horked, "Sharp PC-PJ/AX", { DMI_MATCH(DMI_SYS_VENDOR, "SHARP"), DMI_MATCH(DMI_PRODUCT_NAME, "PC-PJ/AX"), - DMI_MATCH(DMI_BIOS_VENDOR,"SystemSoft"), - DMI_MATCH(DMI_BIOS_VERSION,"Version R2.08"), }, + DMI_MATCH(DMI_BIOS_VENDOR, "SystemSoft"), + DMI_MATCH(DMI_BIOS_VERSION, "Version R2.08"), }, }, { /* APM crashes */ apm_is_horked, "Dell Inspiron 2500", { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 2500"), - DMI_MATCH(DMI_BIOS_VENDOR,"Phoenix Technologies LTD"), - DMI_MATCH(DMI_BIOS_VERSION,"A11"), }, + DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "A11"), }, }, { /* APM idle hangs */ apm_likes_to_melt, "Jabil AMD", @@ -2203,11 +2228,11 @@ static int __init apm_init(void) return -ENODEV; } printk(KERN_INFO - "apm: BIOS version %d.%d Flags 0x%02x (Driver version %s)\n", - ((apm_info.bios.version >> 8) & 0xff), - (apm_info.bios.version & 0xff), - apm_info.bios.flags, - driver_version); + "apm: BIOS version %d.%d Flags 0x%02x (Driver version %s)\n", + ((apm_info.bios.version >> 8) & 0xff), + (apm_info.bios.version & 0xff), + apm_info.bios.flags, + driver_version); if ((apm_info.bios.flags & APM_32_BIT_SUPPORT) == 0) { printk(KERN_INFO "apm: no 32 bit BIOS 
support\n"); return -ENODEV; @@ -2312,9 +2337,9 @@ static int __init apm_init(void) } wake_up_process(kapmd_task); - if (num_online_cpus() > 1 && !smp ) { + if (num_online_cpus() > 1 && !smp) { printk(KERN_NOTICE - "apm: disabled - APM is not SMP safe (power off active).\n"); + "apm: disabled - APM is not SMP safe (power off active).\n"); return 0; } @@ -2339,7 +2364,7 @@ static int __init apm_init(void) static void __exit apm_exit(void) { - int error; + int error; if (set_pm_idle) { pm_idle = original_pm_idle; diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 0e45981b2dd7..afd84463b712 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -38,15 +38,15 @@ void foo(void); void foo(void) { - OFFSET(SIGCONTEXT_eax, sigcontext, eax); - OFFSET(SIGCONTEXT_ebx, sigcontext, ebx); - OFFSET(SIGCONTEXT_ecx, sigcontext, ecx); - OFFSET(SIGCONTEXT_edx, sigcontext, edx); - OFFSET(SIGCONTEXT_esi, sigcontext, esi); - OFFSET(SIGCONTEXT_edi, sigcontext, edi); - OFFSET(SIGCONTEXT_ebp, sigcontext, ebp); - OFFSET(SIGCONTEXT_esp, sigcontext, esp); - OFFSET(SIGCONTEXT_eip, sigcontext, eip); + OFFSET(IA32_SIGCONTEXT_ax, sigcontext, ax); + OFFSET(IA32_SIGCONTEXT_bx, sigcontext, bx); + OFFSET(IA32_SIGCONTEXT_cx, sigcontext, cx); + OFFSET(IA32_SIGCONTEXT_dx, sigcontext, dx); + OFFSET(IA32_SIGCONTEXT_si, sigcontext, si); + OFFSET(IA32_SIGCONTEXT_di, sigcontext, di); + OFFSET(IA32_SIGCONTEXT_bp, sigcontext, bp); + OFFSET(IA32_SIGCONTEXT_sp, sigcontext, sp); + OFFSET(IA32_SIGCONTEXT_ip, sigcontext, ip); BLANK(); OFFSET(CPUINFO_x86, cpuinfo_x86, x86); @@ -70,39 +70,38 @@ void foo(void) OFFSET(TI_cpu, thread_info, cpu); BLANK(); - OFFSET(GDS_size, Xgt_desc_struct, size); - OFFSET(GDS_address, Xgt_desc_struct, address); - OFFSET(GDS_pad, Xgt_desc_struct, pad); + OFFSET(GDS_size, desc_ptr, size); + OFFSET(GDS_address, desc_ptr, address); BLANK(); - OFFSET(PT_EBX, pt_regs, ebx); - OFFSET(PT_ECX, pt_regs, ecx); - OFFSET(PT_EDX, pt_regs, edx); - OFFSET(PT_ESI, pt_regs, esi); - OFFSET(PT_EDI, pt_regs, edi); - OFFSET(PT_EBP, pt_regs, ebp); - OFFSET(PT_EAX, pt_regs, eax); - OFFSET(PT_DS, pt_regs, xds); - OFFSET(PT_ES, pt_regs, xes); - OFFSET(PT_FS, pt_regs, xfs); - OFFSET(PT_ORIG_EAX, pt_regs, orig_eax); - OFFSET(PT_EIP, pt_regs, eip); - OFFSET(PT_CS, pt_regs, xcs); - OFFSET(PT_EFLAGS, pt_regs, eflags); - OFFSET(PT_OLDESP, pt_regs, esp); - OFFSET(PT_OLDSS, pt_regs, xss); + OFFSET(PT_EBX, pt_regs, bx); + OFFSET(PT_ECX, pt_regs, cx); + OFFSET(PT_EDX, pt_regs, dx); + OFFSET(PT_ESI, pt_regs, si); + OFFSET(PT_EDI, pt_regs, di); + OFFSET(PT_EBP, pt_regs, bp); + OFFSET(PT_EAX, pt_regs, ax); + OFFSET(PT_DS, pt_regs, ds); + OFFSET(PT_ES, pt_regs, es); + OFFSET(PT_FS, pt_regs, fs); + OFFSET(PT_ORIG_EAX, pt_regs, orig_ax); + OFFSET(PT_EIP, pt_regs, ip); + OFFSET(PT_CS, pt_regs, cs); + OFFSET(PT_EFLAGS, pt_regs, flags); + OFFSET(PT_OLDESP, pt_regs, sp); + OFFSET(PT_OLDSS, pt_regs, ss); BLANK(); OFFSET(EXEC_DOMAIN_handler, exec_domain, handler); - OFFSET(RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext); + OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext); BLANK(); OFFSET(pbe_address, pbe, address); OFFSET(pbe_orig_address, pbe, orig_address); OFFSET(pbe_next, pbe, next); - /* Offset from the sysenter stack to tss.esp0 */ - DEFINE(TSS_sysenter_esp0, offsetof(struct tss_struct, x86_tss.esp0) - + /* Offset from the sysenter stack to tss.sp0 */ + DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) - sizeof(struct tss_struct)); 
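For context on the OFFSET()/DEFINE() entries above: they are not ordinary C definitions but markers that kbuild turns into assembler-visible constants, which is why renaming pt_regs members only requires touching these lines. The sketch below shows roughly how that works; the macro bodies and the PT_EBX value are illustrative of the usual asm-offsets flow, not quoted from this patch.

#include <linux/stddef.h>	/* offsetof() */

/*
 * DEFINE() emits a "->SYM value" marker into the compiler's assembly
 * output; a sed rule in Kbuild rewrites those markers into #defines in
 * the generated asm-offsets header, which entry_32.S and friends include.
 */
#define DEFINE(sym, val) \
	asm volatile("\n->" #sym " %0 " #val : : "i" (val))

#define BLANK() asm volatile("\n->" : :)

#define OFFSET(sym, str, mem) \
	DEFINE(sym, offsetof(struct str, mem))

/*
 * OFFSET(PT_EBX, pt_regs, bx) therefore ends up as something like
 *	#define PT_EBX 0	(offsetof(struct pt_regs, bx))
 * so assembly code can use symbolic offsets, e.g.
 *	movl PT_EBX(%esp), %eax
 * without hard-coding the struct pt_regs layout.
 */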
DEFINE(PAGE_SIZE_asm, PAGE_SIZE); @@ -111,8 +110,6 @@ void foo(void) DEFINE(PTRS_PER_PMD, PTRS_PER_PMD); DEFINE(PTRS_PER_PGD, PTRS_PER_PGD); - DEFINE(VDSO_PRELINK_asm, VDSO_PRELINK); - OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); #ifdef CONFIG_PARAVIRT @@ -123,7 +120,7 @@ void foo(void) OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable); OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable); OFFSET(PV_CPU_iret, pv_cpu_ops, iret); - OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit); + OFFSET(PV_CPU_irq_enable_syscall_ret, pv_cpu_ops, irq_enable_syscall_ret); OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0); #endif diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index d1b6ed98774e..494e1e096ee6 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -38,7 +38,6 @@ int main(void) #define ENTRY(entry) DEFINE(tsk_ ## entry, offsetof(struct task_struct, entry)) ENTRY(state); ENTRY(flags); - ENTRY(thread); ENTRY(pid); BLANK(); #undef ENTRY @@ -47,6 +46,9 @@ int main(void) ENTRY(addr_limit); ENTRY(preempt_count); ENTRY(status); +#ifdef CONFIG_IA32_EMULATION + ENTRY(sysenter_return); +#endif BLANK(); #undef ENTRY #define ENTRY(entry) DEFINE(pda_ ## entry, offsetof(struct x8664_pda, entry)) @@ -59,17 +61,31 @@ int main(void) ENTRY(data_offset); BLANK(); #undef ENTRY +#ifdef CONFIG_PARAVIRT + BLANK(); + OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled); + OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops); + OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops); + OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable); + OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable); + OFFSET(PV_CPU_iret, pv_cpu_ops, iret); + OFFSET(PV_CPU_irq_enable_syscall_ret, pv_cpu_ops, irq_enable_syscall_ret); + OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs); + OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2); +#endif + + #ifdef CONFIG_IA32_EMULATION #define ENTRY(entry) DEFINE(IA32_SIGCONTEXT_ ## entry, offsetof(struct sigcontext_ia32, entry)) - ENTRY(eax); - ENTRY(ebx); - ENTRY(ecx); - ENTRY(edx); - ENTRY(esi); - ENTRY(edi); - ENTRY(ebp); - ENTRY(esp); - ENTRY(eip); + ENTRY(ax); + ENTRY(bx); + ENTRY(cx); + ENTRY(dx); + ENTRY(si); + ENTRY(di); + ENTRY(bp); + ENTRY(sp); + ENTRY(ip); BLANK(); #undef ENTRY DEFINE(IA32_RT_SIGFRAME_sigcontext, @@ -81,14 +97,14 @@ int main(void) DEFINE(pbe_next, offsetof(struct pbe, next)); BLANK(); #define ENTRY(entry) DEFINE(pt_regs_ ## entry, offsetof(struct pt_regs, entry)) - ENTRY(rbx); - ENTRY(rbx); - ENTRY(rcx); - ENTRY(rdx); - ENTRY(rsp); - ENTRY(rbp); - ENTRY(rsi); - ENTRY(rdi); + ENTRY(bx); + ENTRY(bx); + ENTRY(cx); + ENTRY(dx); + ENTRY(sp); + ENTRY(bp); + ENTRY(si); + ENTRY(di); ENTRY(r8); ENTRY(r9); ENTRY(r10); @@ -97,7 +113,7 @@ int main(void) ENTRY(r13); ENTRY(r14); ENTRY(r15); - ENTRY(eflags); + ENTRY(flags); BLANK(); #undef ENTRY #define ENTRY(entry) DEFINE(saved_context_ ## entry, offsetof(struct saved_context, entry)) @@ -108,7 +124,7 @@ int main(void) ENTRY(cr8); BLANK(); #undef ENTRY - DEFINE(TSS_ist, offsetof(struct tss_struct, ist)); + DEFINE(TSS_ist, offsetof(struct tss_struct, x86_tss.ist)); BLANK(); DEFINE(crypto_tfm_ctx_offset, offsetof(struct crypto_tfm, __crt_ctx)); BLANK(); diff --git a/arch/x86/kernel/bootflag.c b/arch/x86/kernel/bootflag.c index 0b9860530a6b..30f25a75fe28 100644 --- a/arch/x86/kernel/bootflag.c +++ b/arch/x86/kernel/bootflag.c @@ -1,8 +1,6 @@ /* * Implement 'Simple Boot Flag Specification 2.0' */ - - #include <linux/types.h> #include 
<linux/kernel.h> #include <linux/init.h> @@ -14,40 +12,38 @@ #include <linux/mc146818rtc.h> - #define SBF_RESERVED (0x78) #define SBF_PNPOS (1<<0) #define SBF_BOOTING (1<<1) #define SBF_DIAG (1<<2) #define SBF_PARITY (1<<7) - int sbf_port __initdata = -1; /* set via acpi_boot_init() */ - static int __init parity(u8 v) { int x = 0; int i; - - for(i=0;i<8;i++) - { - x^=(v&1); - v>>=1; + + for (i = 0; i < 8; i++) { + x ^= (v & 1); + v >>= 1; } + return x; } static void __init sbf_write(u8 v) { unsigned long flags; - if(sbf_port != -1) - { + + if (sbf_port != -1) { v &= ~SBF_PARITY; - if(!parity(v)) - v|=SBF_PARITY; + if (!parity(v)) + v |= SBF_PARITY; - printk(KERN_INFO "Simple Boot Flag at 0x%x set to 0x%x\n", sbf_port, v); + printk(KERN_INFO "Simple Boot Flag at 0x%x set to 0x%x\n", + sbf_port, v); spin_lock_irqsave(&rtc_lock, flags); CMOS_WRITE(v, sbf_port); @@ -57,33 +53,41 @@ static void __init sbf_write(u8 v) static u8 __init sbf_read(void) { - u8 v; unsigned long flags; - if(sbf_port == -1) + u8 v; + + if (sbf_port == -1) return 0; + spin_lock_irqsave(&rtc_lock, flags); v = CMOS_READ(sbf_port); spin_unlock_irqrestore(&rtc_lock, flags); + return v; } static int __init sbf_value_valid(u8 v) { - if(v&SBF_RESERVED) /* Reserved bits */ + if (v & SBF_RESERVED) /* Reserved bits */ return 0; - if(!parity(v)) + if (!parity(v)) return 0; + return 1; } static int __init sbf_init(void) { u8 v; - if(sbf_port == -1) + + if (sbf_port == -1) return 0; + v = sbf_read(); - if(!sbf_value_valid(v)) - printk(KERN_WARNING "Simple Boot Flag value 0x%x read from CMOS RAM was invalid\n",v); + if (!sbf_value_valid(v)) { + printk(KERN_WARNING "Simple Boot Flag value 0x%x read from " + "CMOS RAM was invalid\n", v); + } v &= ~SBF_RESERVED; v &= ~SBF_BOOTING; @@ -92,7 +96,7 @@ static int __init sbf_init(void) v |= SBF_PNPOS; #endif sbf_write(v); + return 0; } - module_init(sbf_init); diff --git a/arch/x86/kernel/bugs_64.c b/arch/x86/kernel/bugs_64.c index 9a189cef6404..8f520f93ffd4 100644 --- a/arch/x86/kernel/bugs_64.c +++ b/arch/x86/kernel/bugs_64.c @@ -13,7 +13,6 @@ void __init check_bugs(void) { identify_cpu(&boot_cpu_data); - mtrr_bp_init(); #if !defined(CONFIG_SMP) printk("CPU: "); print_cpu_info(&boot_cpu_data); diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c index 3e91d3ee26ec..238468ae1993 100644 --- a/arch/x86/kernel/cpu/addon_cpuid_features.c +++ b/arch/x86/kernel/cpu/addon_cpuid_features.c @@ -45,6 +45,6 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) ®s[CR_ECX], ®s[CR_EDX]); if (regs[cb->reg] & (1 << cb->bit)) - set_bit(cb->feature, c->x86_capability); + set_cpu_cap(c, cb->feature); } } diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 1ff88c7f45cf..06fa159232fd 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -63,6 +63,15 @@ static __cpuinit int amd_apic_timer_broken(void) int force_mwait __cpuinitdata; +void __cpuinit early_init_amd(struct cpuinfo_x86 *c) +{ + if (cpuid_eax(0x80000000) >= 0x80000007) { + c->x86_power = cpuid_edx(0x80000007); + if (c->x86_power & (1<<8)) + set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability); + } +} + static void __cpuinit init_amd(struct cpuinfo_x86 *c) { u32 l, h; @@ -85,6 +94,8 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) } #endif + early_init_amd(c); + /* * FIXME: We should handle the K5 here. 
Set up the write * range and also turn on MSR 83 bits 4 and 31 (write alloc, @@ -257,12 +268,6 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) c->x86_max_cores = (cpuid_ecx(0x80000008) & 0xff) + 1; } - if (cpuid_eax(0x80000000) >= 0x80000007) { - c->x86_power = cpuid_edx(0x80000007); - if (c->x86_power & (1<<8)) - set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability); - } - #ifdef CONFIG_X86_HT /* * On a AMD multi core setup the lower bits of the APIC id @@ -295,12 +300,12 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) local_apic_timer_disabled = 1; #endif - if (c->x86 == 0x10 && !force_mwait) - clear_bit(X86_FEATURE_MWAIT, c->x86_capability); - /* K6s reports MCEs but don't actually have all the MSRs */ if (c->x86 < 6) clear_bit(X86_FEATURE_MCE, c->x86_capability); + + if (cpu_has_xmm) + set_bit(X86_FEATURE_MFENCE_RDTSC, c->x86_capability); } static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 * c, unsigned int size) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 205fd5ba57f7..9b95edcfc6ae 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -11,6 +11,7 @@ #include <linux/utsname.h> #include <asm/bugs.h> #include <asm/processor.h> +#include <asm/processor-flags.h> #include <asm/i387.h> #include <asm/msr.h> #include <asm/paravirt.h> @@ -35,7 +36,7 @@ __setup("mca-pentium", mca_pentium); static int __init no_387(char *s) { boot_cpu_data.hard_math = 0; - write_cr0(0xE | read_cr0()); + write_cr0(X86_CR0_TS | X86_CR0_EM | X86_CR0_MP | read_cr0()); return 1; } @@ -153,7 +154,7 @@ static void __init check_config(void) * If we configured ourselves for a TSC, we'd better have one! */ #ifdef CONFIG_X86_TSC - if (!cpu_has_tsc && !tsc_disable) + if (!cpu_has_tsc) panic("Kernel compiled for Pentium+, requires TSC feature!"); #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index e2fcf2051bdb..db28aa9e2f69 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -22,43 +22,48 @@ #include "cpu.h" DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = { - [GDT_ENTRY_KERNEL_CS] = { 0x0000ffff, 0x00cf9a00 }, - [GDT_ENTRY_KERNEL_DS] = { 0x0000ffff, 0x00cf9200 }, - [GDT_ENTRY_DEFAULT_USER_CS] = { 0x0000ffff, 0x00cffa00 }, - [GDT_ENTRY_DEFAULT_USER_DS] = { 0x0000ffff, 0x00cff200 }, + [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } }, + [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } }, + [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } }, + [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff200 } } }, /* * Segments used for calling PnP BIOS have byte granularity. * They code segments and data segments have fixed 64k limits, * the transfer segment sizes are set at run time. 
*/ - [GDT_ENTRY_PNPBIOS_CS32] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */ - [GDT_ENTRY_PNPBIOS_CS16] = { 0x0000ffff, 0x00009a00 },/* 16-bit code */ - [GDT_ENTRY_PNPBIOS_DS] = { 0x0000ffff, 0x00009200 }, /* 16-bit data */ - [GDT_ENTRY_PNPBIOS_TS1] = { 0x00000000, 0x00009200 },/* 16-bit data */ - [GDT_ENTRY_PNPBIOS_TS2] = { 0x00000000, 0x00009200 },/* 16-bit data */ + /* 32-bit code */ + [GDT_ENTRY_PNPBIOS_CS32] = { { { 0x0000ffff, 0x00409a00 } } }, + /* 16-bit code */ + [GDT_ENTRY_PNPBIOS_CS16] = { { { 0x0000ffff, 0x00009a00 } } }, + /* 16-bit data */ + [GDT_ENTRY_PNPBIOS_DS] = { { { 0x0000ffff, 0x00009200 } } }, + /* 16-bit data */ + [GDT_ENTRY_PNPBIOS_TS1] = { { { 0x00000000, 0x00009200 } } }, + /* 16-bit data */ + [GDT_ENTRY_PNPBIOS_TS2] = { { { 0x00000000, 0x00009200 } } }, /* * The APM segments have byte granularity and their bases * are set at run time. All have 64k limits. */ - [GDT_ENTRY_APMBIOS_BASE] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */ + /* 32-bit code */ + [GDT_ENTRY_APMBIOS_BASE] = { { { 0x0000ffff, 0x00409a00 } } }, /* 16-bit code */ - [GDT_ENTRY_APMBIOS_BASE+1] = { 0x0000ffff, 0x00009a00 }, - [GDT_ENTRY_APMBIOS_BASE+2] = { 0x0000ffff, 0x00409200 }, /* data */ + [GDT_ENTRY_APMBIOS_BASE+1] = { { { 0x0000ffff, 0x00009a00 } } }, + /* data */ + [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } }, - [GDT_ENTRY_ESPFIX_SS] = { 0x00000000, 0x00c09200 }, - [GDT_ENTRY_PERCPU] = { 0x00000000, 0x00000000 }, + [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } }, + [GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } }, } }; EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); +__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; + static int cachesize_override __cpuinitdata = -1; -static int disable_x86_fxsr __cpuinitdata; static int disable_x86_serial_nr __cpuinitdata = 1; -static int disable_x86_sep __cpuinitdata; struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {}; -extern int disable_pse; - static void __cpuinit default_init(struct cpuinfo_x86 * c) { /* Not much we can do here... */ @@ -207,16 +212,8 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c, int early) static int __init x86_fxsr_setup(char * s) { - /* Tell all the other CPUs to not use it... */ - disable_x86_fxsr = 1; - - /* - * ... and clear the bits early in the boot_cpu_data - * so that the bootup process doesn't try to do this - * either. 
- */ - clear_bit(X86_FEATURE_FXSR, boot_cpu_data.x86_capability); - clear_bit(X86_FEATURE_XMM, boot_cpu_data.x86_capability); + setup_clear_cpu_cap(X86_FEATURE_FXSR); + setup_clear_cpu_cap(X86_FEATURE_XMM); return 1; } __setup("nofxsr", x86_fxsr_setup); @@ -224,7 +221,7 @@ __setup("nofxsr", x86_fxsr_setup); static int __init x86_sep_setup(char * s) { - disable_x86_sep = 1; + setup_clear_cpu_cap(X86_FEATURE_SEP); return 1; } __setup("nosep", x86_sep_setup); @@ -281,6 +278,33 @@ void __init cpu_detect(struct cpuinfo_x86 *c) c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8; } } +static void __cpuinit early_get_cap(struct cpuinfo_x86 *c) +{ + u32 tfms, xlvl; + int ebx; + + memset(&c->x86_capability, 0, sizeof c->x86_capability); + if (have_cpuid_p()) { + /* Intel-defined flags: level 0x00000001 */ + if (c->cpuid_level >= 0x00000001) { + u32 capability, excap; + cpuid(0x00000001, &tfms, &ebx, &excap, &capability); + c->x86_capability[0] = capability; + c->x86_capability[4] = excap; + } + + /* AMD-defined flags: level 0x80000001 */ + xlvl = cpuid_eax(0x80000000); + if ((xlvl & 0xffff0000) == 0x80000000) { + if (xlvl >= 0x80000001) { + c->x86_capability[1] = cpuid_edx(0x80000001); + c->x86_capability[6] = cpuid_ecx(0x80000001); + } + } + + } + +} /* Do minimum CPU detection early. Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment. @@ -300,6 +324,17 @@ static void __init early_cpu_detect(void) cpu_detect(c); get_cpu_vendor(c, 1); + + switch (c->x86_vendor) { + case X86_VENDOR_AMD: + early_init_amd(c); + break; + case X86_VENDOR_INTEL: + early_init_intel(c); + break; + } + + early_get_cap(c); } static void __cpuinit generic_identify(struct cpuinfo_x86 * c) @@ -357,8 +392,6 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 * c) init_scattered_cpuid_features(c); } - early_intel_workaround(c); - #ifdef CONFIG_X86_HT c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff; #endif @@ -392,7 +425,7 @@ __setup("serialnumber", x86_serial_nr_setup); /* * This does the hard work of actually picking apart the CPU stuff... */ -static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) +void __cpuinit identify_cpu(struct cpuinfo_x86 *c) { int i; @@ -418,20 +451,9 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) generic_identify(c); - printk(KERN_DEBUG "CPU: After generic identify, caps:"); - for (i = 0; i < NCAPINTS; i++) - printk(" %08lx", c->x86_capability[i]); - printk("\n"); - - if (this_cpu->c_identify) { + if (this_cpu->c_identify) this_cpu->c_identify(c); - printk(KERN_DEBUG "CPU: After vendor identify, caps:"); - for (i = 0; i < NCAPINTS; i++) - printk(" %08lx", c->x86_capability[i]); - printk("\n"); - } - /* * Vendor-specific initialization. In this section we * canonicalize the feature flags, meaning if there are @@ -453,23 +475,6 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) * we do "generic changes." */ - /* TSC disabled? */ - if ( tsc_disable ) - clear_bit(X86_FEATURE_TSC, c->x86_capability); - - /* FXSR disabled? */ - if (disable_x86_fxsr) { - clear_bit(X86_FEATURE_FXSR, c->x86_capability); - clear_bit(X86_FEATURE_XMM, c->x86_capability); - } - - /* SEP disabled? */ - if (disable_x86_sep) - clear_bit(X86_FEATURE_SEP, c->x86_capability); - - if (disable_pse) - clear_bit(X86_FEATURE_PSE, c->x86_capability); - /* If the model name is still unset, do table lookup. 
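The nofxsr and nosep handlers above now go through setup_clear_cpu_cap() instead of the removed disable_x86_fxsr/disable_x86_sep flags, and the noclflush and clearcpuid= options added below use the same hook. A sketch of how that mechanism fits together (macro bodies approximated from this kernel era, not quoted from the patch):

/* Early overrides are recorded both in boot_cpu_data and in a global
 * bitmap of cleared capabilities. */
extern __u32 cleared_cpu_caps[NCAPINTS];

#define clear_cpu_cap(c, bit) \
        clear_bit(bit, (unsigned long *)((c)->x86_capability))

#define setup_clear_cpu_cap(bit) do {                                   \
        clear_cpu_cap(&boot_cpu_data, bit);                             \
        set_bit(bit, (unsigned long *)cleared_cpu_caps);                \
} while (0)

/* The identify_cpu() hunk that follows then replays the overrides on
 * every CPU:
 *         c->x86_capability[i] ^= cleared_cpu_caps[i];
 * so a feature disabled on the command line is cleared again on each CPU
 * that redetects it. */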
*/ if ( !c->x86_model_id[0] ) { char *p; @@ -482,13 +487,6 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) c->x86, c->x86_model); } - /* Now the feature flags better reflect actual CPU features! */ - - printk(KERN_DEBUG "CPU: After all inits, caps:"); - for (i = 0; i < NCAPINTS; i++) - printk(" %08lx", c->x86_capability[i]); - printk("\n"); - /* * On SMP, boot_cpu_data holds the common feature set between * all CPUs; so make sure that we indicate which features are @@ -501,8 +499,14 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; } + /* Clear all flags overriden by options */ + for (i = 0; i < NCAPINTS; i++) + c->x86_capability[i] ^= cleared_cpu_caps[i]; + /* Init Machine Check Exception if available. */ mcheck_init(c); + + select_idle_routine(c); } void __init identify_boot_cpu(void) @@ -510,7 +514,6 @@ void __init identify_boot_cpu(void) identify_cpu(&boot_cpu_data); sysenter_setup(); enable_sep_cpu(); - mtrr_bp_init(); } void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) @@ -567,6 +570,13 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c) } #endif +static __init int setup_noclflush(char *arg) +{ + setup_clear_cpu_cap(X86_FEATURE_CLFLSH); + return 1; +} +__setup("noclflush", setup_noclflush); + void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) { char *vendor = NULL; @@ -590,6 +600,17 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) printk("\n"); } +static __init int setup_disablecpuid(char *arg) +{ + int bit; + if (get_option(&arg, &bit) && bit < NCAPINTS*32) + setup_clear_cpu_cap(bit); + else + return 0; + return 1; +} +__setup("clearcpuid=", setup_disablecpuid); + cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; /* This is hacky. :) @@ -620,21 +641,13 @@ void __init early_cpu_init(void) nexgen_init_cpu(); umc_init_cpu(); early_cpu_detect(); - -#ifdef CONFIG_DEBUG_PAGEALLOC - /* pse is not compatible with on-the-fly unmapping, - * disable it even if the cpus claim to support it. - */ - clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability); - disable_pse = 1; -#endif } /* Make sure %fs is initialized properly in idle threads */ struct pt_regs * __devinit idle_regs(struct pt_regs *regs) { memset(regs, 0, sizeof(struct pt_regs)); - regs->xfs = __KERNEL_PERCPU; + regs->fs = __KERNEL_PERCPU; return regs; } @@ -642,7 +655,7 @@ struct pt_regs * __devinit idle_regs(struct pt_regs *regs) * it's on the real one. */ void switch_to_new_gdt(void) { - struct Xgt_desc_struct gdt_descr; + struct desc_ptr gdt_descr; gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id()); gdt_descr.size = GDT_SIZE - 1; @@ -672,12 +685,6 @@ void __cpuinit cpu_init(void) if (cpu_has_vme || cpu_has_tsc || cpu_has_de) clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); - if (tsc_disable && cpu_has_tsc) { - printk(KERN_NOTICE "Disabling TSC...\n"); - /**** FIX-HPA: DOES THIS REALLY BELONG HERE? 
****/ - clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability); - set_in_cr4(X86_CR4_TSD); - } load_idt(&idt_descr); switch_to_new_gdt(); @@ -691,7 +698,7 @@ void __cpuinit cpu_init(void) BUG(); enter_lazy_tlb(&init_mm, curr); - load_esp0(t, thread); + load_sp0(t, thread); set_tss_desc(cpu,t); load_TR_desc(); load_LDT(&init_mm.context); diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 2f6432cef6ff..ad6527a5beb1 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -24,5 +24,6 @@ extern struct cpu_dev * cpu_devs [X86_VENDOR_NUM]; extern int get_model_name(struct cpuinfo_x86 *c); extern void display_cacheinfo(struct cpuinfo_x86 *c); -extern void early_intel_workaround(struct cpuinfo_x86 *c); +extern void early_init_intel(struct cpuinfo_x86 *c); +extern void early_init_amd(struct cpuinfo_x86 *c); diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index fea0af0476b9..a962dcb9c408 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -67,7 +67,8 @@ struct acpi_cpufreq_data { unsigned int cpu_feature; }; -static struct acpi_cpufreq_data *drv_data[NR_CPUS]; +static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data); + /* acpi_perf_data is a pointer to percpu data. */ static struct acpi_processor_performance *acpi_perf_data; @@ -218,14 +219,14 @@ static u32 get_cur_val(cpumask_t mask) if (unlikely(cpus_empty(mask))) return 0; - switch (drv_data[first_cpu(mask)]->cpu_feature) { + switch (per_cpu(drv_data, first_cpu(mask))->cpu_feature) { case SYSTEM_INTEL_MSR_CAPABLE: cmd.type = SYSTEM_INTEL_MSR_CAPABLE; cmd.addr.msr.reg = MSR_IA32_PERF_STATUS; break; case SYSTEM_IO_CAPABLE: cmd.type = SYSTEM_IO_CAPABLE; - perf = drv_data[first_cpu(mask)]->acpi_data; + perf = per_cpu(drv_data, first_cpu(mask))->acpi_data; cmd.addr.io.port = perf->control_register.address; cmd.addr.io.bit_width = perf->control_register.bit_width; break; @@ -325,7 +326,7 @@ static unsigned int get_measured_perf(unsigned int cpu) #endif - retval = drv_data[cpu]->max_freq * perf_percent / 100; + retval = per_cpu(drv_data, cpu)->max_freq * perf_percent / 100; put_cpu(); set_cpus_allowed(current, saved_mask); @@ -336,7 +337,7 @@ static unsigned int get_measured_perf(unsigned int cpu) static unsigned int get_cur_freq_on_cpu(unsigned int cpu) { - struct acpi_cpufreq_data *data = drv_data[cpu]; + struct acpi_cpufreq_data *data = per_cpu(drv_data, cpu); unsigned int freq; dprintk("get_cur_freq_on_cpu (%d)\n", cpu); @@ -370,7 +371,7 @@ static unsigned int check_freqs(cpumask_t mask, unsigned int freq, static int acpi_cpufreq_target(struct cpufreq_policy *policy, unsigned int target_freq, unsigned int relation) { - struct acpi_cpufreq_data *data = drv_data[policy->cpu]; + struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu); struct acpi_processor_performance *perf; struct cpufreq_freqs freqs; cpumask_t online_policy_cpus; @@ -466,7 +467,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, static int acpi_cpufreq_verify(struct cpufreq_policy *policy) { - struct acpi_cpufreq_data *data = drv_data[policy->cpu]; + struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu); dprintk("acpi_cpufreq_verify\n"); @@ -570,7 +571,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) return -ENOMEM; data->acpi_data = percpu_ptr(acpi_perf_data, cpu); - drv_data[cpu] = data; + per_cpu(drv_data, cpu) = data; if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) 
acpi_cpufreq_driver.flags |= CPUFREQ_CONST_LOOPS; @@ -714,20 +715,20 @@ err_unreg: acpi_processor_unregister_performance(perf, cpu); err_free: kfree(data); - drv_data[cpu] = NULL; + per_cpu(drv_data, cpu) = NULL; return result; } static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy) { - struct acpi_cpufreq_data *data = drv_data[policy->cpu]; + struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu); dprintk("acpi_cpufreq_cpu_exit\n"); if (data) { cpufreq_frequency_table_put_attr(policy->cpu); - drv_data[policy->cpu] = NULL; + per_cpu(drv_data, policy->cpu) = NULL; acpi_processor_unregister_performance(data->acpi_data, policy->cpu); kfree(data); @@ -738,7 +739,7 @@ static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy) static int acpi_cpufreq_resume(struct cpufreq_policy *policy) { - struct acpi_cpufreq_data *data = drv_data[policy->cpu]; + struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu); dprintk("acpi_cpufreq_resume\n"); diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c index 749d00cb2ebd..06fcce516d51 100644 --- a/arch/x86/kernel/cpu/cpufreq/longhaul.c +++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c @@ -694,7 +694,7 @@ static acpi_status longhaul_walk_callback(acpi_handle obj_handle, if ( acpi_bus_get_device(obj_handle, &d) ) { return 0; } - *return_value = (void *)acpi_driver_data(d); + *return_value = acpi_driver_data(d); return 1; } diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 99e1ef9939be..a0522735dd9d 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -52,7 +52,7 @@ /* serialize freq changes */ static DEFINE_MUTEX(fidvid_mutex); -static struct powernow_k8_data *powernow_data[NR_CPUS]; +static DEFINE_PER_CPU(struct powernow_k8_data *, powernow_data); static int cpu_family = CPU_OPTERON; @@ -1018,7 +1018,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, unsigned i static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsigned relation) { cpumask_t oldmask = CPU_MASK_ALL; - struct powernow_k8_data *data = powernow_data[pol->cpu]; + struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu); u32 checkfid; u32 checkvid; unsigned int newstate; @@ -1094,7 +1094,7 @@ err_out: /* Driver entry point to verify the policy and range of frequencies */ static int powernowk8_verify(struct cpufreq_policy *pol) { - struct powernow_k8_data *data = powernow_data[pol->cpu]; + struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu); if (!data) return -EINVAL; @@ -1202,7 +1202,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) dprintk("cpu_init done, current fid 0x%x, vid 0x%x\n", data->currfid, data->currvid); - powernow_data[pol->cpu] = data; + per_cpu(powernow_data, pol->cpu) = data; return 0; @@ -1216,7 +1216,7 @@ err_out: static int __devexit powernowk8_cpu_exit (struct cpufreq_policy *pol) { - struct powernow_k8_data *data = powernow_data[pol->cpu]; + struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu); if (!data) return -EINVAL; @@ -1237,7 +1237,7 @@ static unsigned int powernowk8_get (unsigned int cpu) cpumask_t oldmask = current->cpus_allowed; unsigned int khz = 0; - data = powernow_data[first_cpu(per_cpu(cpu_core_map, cpu))]; + data = per_cpu(powernow_data, first_cpu(per_cpu(cpu_core_map, cpu))); if (!data) return -EINVAL; diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index 
88d66fb8411d..404a6a2d4016 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -5,6 +5,7 @@ #include <asm/dma.h> #include <asm/io.h> #include <asm/processor-cyrix.h> +#include <asm/processor-flags.h> #include <asm/timer.h> #include <asm/pci-direct.h> #include <asm/tsc.h> @@ -126,15 +127,12 @@ static void __cpuinit set_cx86_reorder(void) static void __cpuinit set_cx86_memwb(void) { - u32 cr0; - printk(KERN_INFO "Enable Memory-Write-back mode on Cyrix/NSC processor.\n"); /* CCR2 bit 2: unlock NW bit */ setCx86(CX86_CCR2, getCx86(CX86_CCR2) & ~0x04); /* set 'Not Write-through' */ - cr0 = 0x20000000; - write_cr0(read_cr0() | cr0); + write_cr0(read_cr0() | X86_CR0_NW); /* CCR2 bit 2: lock NW bit and set WT1 */ setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x14 ); } diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index cc8c501b9f39..d1c372b018db 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -11,6 +11,8 @@ #include <asm/pgtable.h> #include <asm/msr.h> #include <asm/uaccess.h> +#include <asm/ptrace.h> +#include <asm/ds.h> #include "cpu.h" @@ -27,13 +29,14 @@ struct movsl_mask movsl_mask __read_mostly; #endif -void __cpuinit early_intel_workaround(struct cpuinfo_x86 *c) +void __cpuinit early_init_intel(struct cpuinfo_x86 *c) { - if (c->x86_vendor != X86_VENDOR_INTEL) - return; /* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */ if (c->x86 == 15 && c->x86_cache_alignment == 64) c->x86_cache_alignment = 128; + if ((c->x86 == 0xf && c->x86_model >= 0x03) || + (c->x86 == 0x6 && c->x86_model >= 0x0e)) + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); } /* @@ -113,6 +116,8 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) unsigned int l2 = 0; char *p = NULL; + early_init_intel(c); + #ifdef CONFIG_X86_F00F_BUG /* * All current models of Pentium and Pentium with MMX technology CPUs @@ -132,7 +137,6 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) } #endif - select_idle_routine(c); l2 = init_intel_cacheinfo(c); if (c->cpuid_level > 9 ) { unsigned eax = cpuid_eax(10); @@ -201,16 +205,13 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) } #endif + if (cpu_has_xmm2) + set_bit(X86_FEATURE_LFENCE_RDTSC, c->x86_capability); if (c->x86 == 15) { set_bit(X86_FEATURE_P4, c->x86_capability); - set_bit(X86_FEATURE_SYNC_RDTSC, c->x86_capability); } if (c->x86 == 6) set_bit(X86_FEATURE_P3, c->x86_capability); - if ((c->x86 == 0xf && c->x86_model >= 0x03) || - (c->x86 == 0x6 && c->x86_model >= 0x0e)) - set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability); - if (cpu_has_ds) { unsigned int l1; rdmsr(MSR_IA32_MISC_ENABLE, l1, l2); @@ -219,6 +220,9 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) if (!(l1 & (1<<12))) set_bit(X86_FEATURE_PEBS, c->x86_capability); } + + if (cpu_has_bts) + ds_init_intel(c); } static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 * c, unsigned int size) @@ -342,5 +346,22 @@ unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new) EXPORT_SYMBOL(cmpxchg_386_u32); #endif +#ifndef CONFIG_X86_CMPXCHG64 +unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new) +{ + u64 prev; + unsigned long flags; + + /* Poor man's cmpxchg8b for 386 and 486. 
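The acpi-cpufreq and powernow-k8 hunks above swap NR_CPUS-sized static pointer arrays for per-CPU variables, so the storage lives in each CPU's per-cpu area instead of one array dimensioned for the largest configurable system. A sketch of the idiom with a hypothetical payload type (my_data and the example functions are not from the patch):

#include <linux/percpu.h>

struct my_data;                                 /* driver-private payload */

/* Before: static struct my_data *drv_data[NR_CPUS]; */
static DEFINE_PER_CPU(struct my_data *, drv_data);

static void example_attach(unsigned int cpu, struct my_data *d)
{
        per_cpu(drv_data, cpu) = d;             /* explicit-CPU accessor */
}

static void example_detach(unsigned int cpu)
{
        per_cpu(drv_data, cpu) = NULL;
}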
Unsuitable for SMP */ + local_irq_save(flags); + prev = *(u64 *)ptr; + if (prev == old) + *(u64 *)ptr = new; + local_irq_restore(flags); + return prev; +} +EXPORT_SYMBOL(cmpxchg_486_u64); +#endif + // arch_initcall(intel_cpu_init); diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 9f530ff43c21..8b4507b8469b 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -733,10 +733,8 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) if (unlikely(retval < 0)) return retval; - cache_kobject[cpu]->parent = &sys_dev->kobj; - kobject_set_name(cache_kobject[cpu], "%s", "cache"); - cache_kobject[cpu]->ktype = &ktype_percpu_entry; - retval = kobject_register(cache_kobject[cpu]); + retval = kobject_init_and_add(cache_kobject[cpu], &ktype_percpu_entry, + &sys_dev->kobj, "%s", "cache"); if (retval < 0) { cpuid4_cache_sysfs_exit(cpu); return retval; @@ -746,23 +744,23 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) this_object = INDEX_KOBJECT_PTR(cpu,i); this_object->cpu = cpu; this_object->index = i; - this_object->kobj.parent = cache_kobject[cpu]; - kobject_set_name(&(this_object->kobj), "index%1lu", i); - this_object->kobj.ktype = &ktype_cache; - retval = kobject_register(&(this_object->kobj)); + retval = kobject_init_and_add(&(this_object->kobj), + &ktype_cache, cache_kobject[cpu], + "index%1lu", i); if (unlikely(retval)) { for (j = 0; j < i; j++) { - kobject_unregister( - &(INDEX_KOBJECT_PTR(cpu,j)->kobj)); + kobject_put(&(INDEX_KOBJECT_PTR(cpu,j)->kobj)); } - kobject_unregister(cache_kobject[cpu]); + kobject_put(cache_kobject[cpu]); cpuid4_cache_sysfs_exit(cpu); break; } + kobject_uevent(&(this_object->kobj), KOBJ_ADD); } if (!retval) cpu_set(cpu, cache_dev_map); + kobject_uevent(cache_kobject[cpu], KOBJ_ADD); return retval; } @@ -778,8 +776,8 @@ static void __cpuinit cache_remove_dev(struct sys_device * sys_dev) cpu_clear(cpu, cache_dev_map); for (i = 0; i < num_cache_leaves; i++) - kobject_unregister(&(INDEX_KOBJECT_PTR(cpu,i)->kobj)); - kobject_unregister(cache_kobject[cpu]); + kobject_put(&(INDEX_KOBJECT_PTR(cpu,i)->kobj)); + kobject_put(cache_kobject[cpu]); cpuid4_cache_sysfs_exit(cpu); } diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c index eef63e3630c2..e633c9c2b764 100644 --- a/arch/x86/kernel/cpu/mcheck/k7.c +++ b/arch/x86/kernel/cpu/mcheck/k7.c @@ -16,7 +16,7 @@ #include "mce.h" /* Machine Check Handler For AMD Athlon/Duron */ -static fastcall void k7_machine_check(struct pt_regs * regs, long error_code) +static void k7_machine_check(struct pt_regs * regs, long error_code) { int recover=1; u32 alow, ahigh, high, low; @@ -27,29 +27,32 @@ static fastcall void k7_machine_check(struct pt_regs * regs, long error_code) if (mcgstl & (1<<0)) /* Recoverable ? 
*/ recover=0; - printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", + printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", smp_processor_id(), mcgsth, mcgstl); - for (i=1; i<nr_mce_banks; i++) { - rdmsr (MSR_IA32_MC0_STATUS+i*4,low, high); + for (i = 1; i < nr_mce_banks; i++) { + rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); if (high&(1<<31)) { + char misc[20]; + char addr[24]; + misc[0] = addr[0] = '\0'; if (high & (1<<29)) recover |= 1; if (high & (1<<25)) recover |= 2; - printk (KERN_EMERG "Bank %d: %08x%08x", i, high, low); high &= ~(1<<31); if (high & (1<<27)) { - rdmsr (MSR_IA32_MC0_MISC+i*4, alow, ahigh); - printk ("[%08x%08x]", ahigh, alow); + rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); + snprintf(misc, 20, "[%08x%08x]", ahigh, alow); } if (high & (1<<26)) { - rdmsr (MSR_IA32_MC0_ADDR+i*4, alow, ahigh); - printk (" at %08x%08x", ahigh, alow); + rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); + snprintf(addr, 24, " at %08x%08x", ahigh, alow); } - printk ("\n"); + printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", + smp_processor_id(), i, high, low, misc, addr); /* Clear it */ - wrmsr (MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); + wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); /* Serialize */ wmb(); add_taint(TAINT_MACHINE_CHECK); diff --git a/arch/x86/kernel/cpu/mcheck/mce.h b/arch/x86/kernel/cpu/mcheck/mce.h index 81fb6e2d35f3..ae9f628838f1 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.h +++ b/arch/x86/kernel/cpu/mcheck/mce.h @@ -8,7 +8,7 @@ void intel_p6_mcheck_init(struct cpuinfo_x86 *c); void winchip_mcheck_init(struct cpuinfo_x86 *c); /* Call the installed machine check handler for this CPU setup. */ -extern fastcall void (*machine_check_vector)(struct pt_regs *, long error_code); +extern void (*machine_check_vector)(struct pt_regs *, long error_code); extern int nr_mce_banks; diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c index 34c781eddee4..a5182dcd94ae 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_32.c +++ b/arch/x86/kernel/cpu/mcheck/mce_32.c @@ -22,13 +22,13 @@ int nr_mce_banks; EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ /* Handle unconfigured int18 (should never happen) */ -static fastcall void unexpected_machine_check(struct pt_regs * regs, long error_code) +static void unexpected_machine_check(struct pt_regs * regs, long error_code) { printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", smp_processor_id()); } /* Call the installed machine check handler for this CPU setup. */ -void fastcall (*machine_check_vector)(struct pt_regs *, long error_code) = unexpected_machine_check; +void (*machine_check_vector)(struct pt_regs *, long error_code) = unexpected_machine_check; /* This has to be run for each processor */ void mcheck_init(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 4b21d29fb5aa..9a699ed03598 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -63,7 +63,7 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_wait); * separate MCEs from kernel messages to avoid bogus bug reports. */ -struct mce_log mcelog = { +static struct mce_log mcelog = { MCE_LOG_SIGNATURE, MCE_LOG_LEN, }; @@ -80,7 +80,7 @@ void mce_log(struct mce *mce) /* When the buffer fills up discard new entries. Assume that the earlier errors are the more interesting. */ if (entry >= MCE_LOG_LEN) { - set_bit(MCE_OVERFLOW, &mcelog.flags); + set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags); return; } /* Old left over entry. Skip. 
*/ @@ -110,12 +110,12 @@ static void print_mce(struct mce *m) KERN_EMERG "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", m->cpu, m->mcgstatus, m->bank, m->status); - if (m->rip) { + if (m->ip) { printk(KERN_EMERG "RIP%s %02x:<%016Lx> ", !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", - m->cs, m->rip); + m->cs, m->ip); if (m->cs == __KERNEL_CS) - print_symbol("{%s}", m->rip); + print_symbol("{%s}", m->ip); printk("\n"); } printk(KERN_EMERG "TSC %Lx ", m->tsc); @@ -156,16 +156,16 @@ static int mce_available(struct cpuinfo_x86 *c) static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) { if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) { - m->rip = regs->rip; + m->ip = regs->ip; m->cs = regs->cs; } else { - m->rip = 0; + m->ip = 0; m->cs = 0; } if (rip_msr) { /* Assume the RIP in the MSR is exact. Is this true? */ m->mcgstatus |= MCG_STATUS_EIPV; - rdmsrl(rip_msr, m->rip); + rdmsrl(rip_msr, m->ip); m->cs = 0; } } @@ -192,10 +192,10 @@ void do_machine_check(struct pt_regs * regs, long error_code) atomic_inc(&mce_entry); - if (regs) - notify_die(DIE_NMI, "machine check", regs, error_code, 18, - SIGKILL); - if (!banks) + if ((regs + && notify_die(DIE_NMI, "machine check", regs, error_code, + 18, SIGKILL) == NOTIFY_STOP) + || !banks) goto out2; memset(&m, 0, sizeof(struct mce)); @@ -288,7 +288,7 @@ void do_machine_check(struct pt_regs * regs, long error_code) * instruction which caused the MCE. */ if (m.mcgstatus & MCG_STATUS_EIPV) - user_space = panicm.rip && (panicm.cs & 3); + user_space = panicm.ip && (panicm.cs & 3); /* * If we know that the error was in user space, send a @@ -564,7 +564,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off) { unsigned long *cpu_tsc; - static DECLARE_MUTEX(mce_read_sem); + static DEFINE_MUTEX(mce_read_mutex); unsigned next; char __user *buf = ubuf; int i, err; @@ -573,12 +573,12 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, if (!cpu_tsc) return -ENOMEM; - down(&mce_read_sem); + mutex_lock(&mce_read_mutex); next = rcu_dereference(mcelog.next); /* Only supports full reads right now */ if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { - up(&mce_read_sem); + mutex_unlock(&mce_read_mutex); kfree(cpu_tsc); return -EINVAL; } @@ -621,7 +621,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, memset(&mcelog.entry[i], 0, sizeof(struct mce)); } } - up(&mce_read_sem); + mutex_unlock(&mce_read_mutex); kfree(cpu_tsc); return err ? -EFAULT : buf - ubuf; } @@ -634,8 +634,7 @@ static unsigned int mce_poll(struct file *file, poll_table *wait) return 0; } -static int mce_ioctl(struct inode *i, struct file *f,unsigned int cmd, - unsigned long arg) +static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg) { int __user *p = (int __user *)arg; @@ -664,7 +663,7 @@ static const struct file_operations mce_chrdev_ops = { .release = mce_release, .read = mce_read, .poll = mce_poll, - .ioctl = mce_ioctl, + .unlocked_ioctl = mce_ioctl, }; static struct miscdevice mce_log_device = { @@ -745,7 +744,7 @@ static void mce_restart(void) static struct sysdev_class mce_sysclass = { .resume = mce_resume, - set_kset_name("machinecheck"), + .name = "machinecheck", }; DEFINE_PER_CPU(struct sys_device, device_mce); @@ -855,8 +854,8 @@ static void mce_remove_device(unsigned int cpu) } /* Get notified when a cpu comes on/off. Be hotplug friendly. 
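The intel_cacheinfo hunk above, and the mce_amd_64 hunks further down, follow the kobject API rework of this period: kobject_register()/kobject_unregister() are replaced by explicit initialise-and-add, uevent and put calls. A sketch of the old versus new pattern (obj, parent_kobj and my_ktype are placeholders, error handling trimmed):

/* Old style: poke name, parent and ktype into the kobject, then one call
 * that also fired the KOBJ_ADD uevent. */
kobject_set_name(&obj->kobj, "index%d", i);
obj->kobj.parent = parent_kobj;
obj->kobj.ktype  = &my_ktype;
err = kobject_register(&obj->kobj);

/* New style: one call initialises, names and adds the object; the uevent
 * is no longer implicit, so it is sent once setup is complete, and
 * teardown drops the reference instead of "unregistering". */
err = kobject_init_and_add(&obj->kobj, &my_ktype, parent_kobj, "index%d", i);
if (!err)
        kobject_uevent(&obj->kobj, KOBJ_ADD);
/* ... */
kobject_put(&obj->kobj);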
*/ -static int -mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) +static int __cpuinit mce_cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; @@ -873,7 +872,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) return NOTIFY_OK; } -static struct notifier_block mce_cpu_notifier = { +static struct notifier_block mce_cpu_notifier __cpuinitdata = { .notifier_call = mce_cpu_callback, }; diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index 752fb16a817d..32671da8184e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c @@ -65,7 +65,7 @@ static struct threshold_block threshold_defaults = { }; struct threshold_bank { - struct kobject kobj; + struct kobject *kobj; struct threshold_block *blocks; cpumask_t cpus; }; @@ -118,6 +118,7 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c) { unsigned int bank, block; unsigned int cpu = smp_processor_id(); + u8 lvt_off; u32 low = 0, high = 0, address = 0; for (bank = 0; bank < NR_BANKS; ++bank) { @@ -153,14 +154,13 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c) if (shared_bank[bank] && c->cpu_core_id) break; #endif + lvt_off = setup_APIC_eilvt_mce(THRESHOLD_APIC_VECTOR, + APIC_EILVT_MSG_FIX, 0); + high &= ~MASK_LVTOFF_HI; - high |= K8_APIC_EXT_LVT_ENTRY_THRESHOLD << 20; + high |= lvt_off << 20; wrmsr(address, low, high); - setup_APIC_extended_lvt(K8_APIC_EXT_LVT_ENTRY_THRESHOLD, - THRESHOLD_APIC_VECTOR, - K8_APIC_EXT_INT_MSG_FIX, 0); - threshold_defaults.address = address; threshold_restart_bank(&threshold_defaults, 0, 0); } @@ -432,10 +432,9 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu, else per_cpu(threshold_banks, cpu)[bank]->blocks = b; - kobject_set_name(&b->kobj, "misc%i", block); - b->kobj.parent = &per_cpu(threshold_banks, cpu)[bank]->kobj; - b->kobj.ktype = &threshold_ktype; - err = kobject_register(&b->kobj); + err = kobject_init_and_add(&b->kobj, &threshold_ktype, + per_cpu(threshold_banks, cpu)[bank]->kobj, + "misc%i", block); if (err) goto out_free; recurse: @@ -451,11 +450,14 @@ recurse: if (err) goto out_free; + if (b) + kobject_uevent(&b->kobj, KOBJ_ADD); + return err; out_free: if (b) { - kobject_unregister(&b->kobj); + kobject_put(&b->kobj); kfree(b); } return err; @@ -489,7 +491,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) goto out; err = sysfs_create_link(&per_cpu(device_mce, cpu).kobj, - &b->kobj, name); + b->kobj, name); if (err) goto out; @@ -505,16 +507,15 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) goto out; } - kobject_set_name(&b->kobj, "threshold_bank%i", bank); - b->kobj.parent = &per_cpu(device_mce, cpu).kobj; + b->kobj = kobject_create_and_add(name, &per_cpu(device_mce, cpu).kobj); + if (!b->kobj) + goto out_free; + #ifndef CONFIG_SMP b->cpus = CPU_MASK_ALL; #else b->cpus = per_cpu(cpu_core_map, cpu); #endif - err = kobject_register(&b->kobj); - if (err) - goto out_free; per_cpu(threshold_banks, cpu)[bank] = b; @@ -531,7 +532,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) continue; err = sysfs_create_link(&per_cpu(device_mce, i).kobj, - &b->kobj, name); + b->kobj, name); if (err) goto out; @@ -554,7 +555,7 @@ static __cpuinit int threshold_create_device(unsigned int cpu) int err = 0; for (bank = 0; bank < NR_BANKS; ++bank) { - if 
(!(per_cpu(bank_map, cpu) & 1 << bank)) + if (!(per_cpu(bank_map, cpu) & (1 << bank))) continue; err = threshold_create_bank(cpu, bank); if (err) @@ -581,7 +582,7 @@ static void deallocate_threshold_block(unsigned int cpu, return; list_for_each_entry_safe(pos, tmp, &head->blocks->miscj, miscj) { - kobject_unregister(&pos->kobj); + kobject_put(&pos->kobj); list_del(&pos->miscj); kfree(pos); } @@ -627,7 +628,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank) deallocate_threshold_block(cpu, bank); free_out: - kobject_unregister(&b->kobj); + kobject_put(b->kobj); kfree(b); per_cpu(threshold_banks, cpu)[bank] = NULL; } @@ -637,14 +638,14 @@ static void threshold_remove_device(unsigned int cpu) unsigned int bank; for (bank = 0; bank < NR_BANKS; ++bank) { - if (!(per_cpu(bank_map, cpu) & 1 << bank)) + if (!(per_cpu(bank_map, cpu) & (1 << bank))) continue; threshold_remove_bank(cpu, bank); } } /* get notified when a cpu comes on/off */ -static int threshold_cpu_callback(struct notifier_block *nfb, +static int __cpuinit threshold_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { /* cpu was unsigned int to begin with */ @@ -669,7 +670,7 @@ static int threshold_cpu_callback(struct notifier_block *nfb, return NOTIFY_OK; } -static struct notifier_block threshold_cpu_notifier = { +static struct notifier_block threshold_cpu_notifier __cpuinitdata = { .notifier_call = threshold_cpu_callback, }; diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c index be4dabfee1f5..cb03345554a5 100644 --- a/arch/x86/kernel/cpu/mcheck/p4.c +++ b/arch/x86/kernel/cpu/mcheck/p4.c @@ -57,7 +57,7 @@ static void intel_thermal_interrupt(struct pt_regs *regs) /* Thermal interrupt handler for this CPU setup */ static void (*vendor_thermal_interrupt)(struct pt_regs *regs) = unexpected_thermal_interrupt; -fastcall void smp_thermal_interrupt(struct pt_regs *regs) +void smp_thermal_interrupt(struct pt_regs *regs) { irq_enter(); vendor_thermal_interrupt(regs); @@ -141,7 +141,7 @@ static inline void intel_get_extended_msrs(struct intel_mce_extended_msrs *r) rdmsr (MSR_IA32_MCG_EIP, r->eip, h); } -static fastcall void intel_machine_check(struct pt_regs * regs, long error_code) +static void intel_machine_check(struct pt_regs * regs, long error_code) { int recover=1; u32 alow, ahigh, high, low; @@ -152,38 +152,41 @@ static fastcall void intel_machine_check(struct pt_regs * regs, long error_code) if (mcgstl & (1<<0)) /* Recoverable ? 
*/ recover=0; - printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", + printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", smp_processor_id(), mcgsth, mcgstl); if (mce_num_extended_msrs > 0) { struct intel_mce_extended_msrs dbg; intel_get_extended_msrs(&dbg); - printk (KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n", - smp_processor_id(), dbg.eip, dbg.eflags); - printk (KERN_DEBUG "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n", - dbg.eax, dbg.ebx, dbg.ecx, dbg.edx); - printk (KERN_DEBUG "\tesi: %08x edi: %08x ebp: %08x esp: %08x\n", + printk(KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n" + "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n" + "\tesi: %08x edi: %08x ebp: %08x esp: %08x\n", + smp_processor_id(), dbg.eip, dbg.eflags, + dbg.eax, dbg.ebx, dbg.ecx, dbg.edx, dbg.esi, dbg.edi, dbg.ebp, dbg.esp); } - for (i=0; i<nr_mce_banks; i++) { - rdmsr (MSR_IA32_MC0_STATUS+i*4,low, high); + for (i = 0; i < nr_mce_banks; i++) { + rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); if (high & (1<<31)) { + char misc[20]; + char addr[24]; + misc[0] = addr[0] = '\0'; if (high & (1<<29)) recover |= 1; if (high & (1<<25)) recover |= 2; - printk (KERN_EMERG "Bank %d: %08x%08x", i, high, low); high &= ~(1<<31); if (high & (1<<27)) { - rdmsr (MSR_IA32_MC0_MISC+i*4, alow, ahigh); - printk ("[%08x%08x]", ahigh, alow); + rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); + snprintf(misc, 20, "[%08x%08x]", ahigh, alow); } if (high & (1<<26)) { - rdmsr (MSR_IA32_MC0_ADDR+i*4, alow, ahigh); - printk (" at %08x%08x", ahigh, alow); + rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); + snprintf(addr, 24, " at %08x%08x", ahigh, alow); } - printk ("\n"); + printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", + smp_processor_id(), i, high, low, misc, addr); } } diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c index 94bc43d950cf..a18310aaae0c 100644 --- a/arch/x86/kernel/cpu/mcheck/p5.c +++ b/arch/x86/kernel/cpu/mcheck/p5.c @@ -16,7 +16,7 @@ #include "mce.h" /* Machine check handler for Pentium class Intel */ -static fastcall void pentium_machine_check(struct pt_regs * regs, long error_code) +static void pentium_machine_check(struct pt_regs * regs, long error_code) { u32 loaddr, hi, lotype; rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi); diff --git a/arch/x86/kernel/cpu/mcheck/p6.c b/arch/x86/kernel/cpu/mcheck/p6.c index deeae42ce199..74342604d30e 100644 --- a/arch/x86/kernel/cpu/mcheck/p6.c +++ b/arch/x86/kernel/cpu/mcheck/p6.c @@ -16,7 +16,7 @@ #include "mce.h" /* Machine Check Handler For PII/PIII */ -static fastcall void intel_machine_check(struct pt_regs * regs, long error_code) +static void intel_machine_check(struct pt_regs * regs, long error_code) { int recover=1; u32 alow, ahigh, high, low; @@ -27,27 +27,30 @@ static fastcall void intel_machine_check(struct pt_regs * regs, long error_code) if (mcgstl & (1<<0)) /* Recoverable ? 
*/ recover=0; - printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", + printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", smp_processor_id(), mcgsth, mcgstl); - for (i=0; i<nr_mce_banks; i++) { - rdmsr (MSR_IA32_MC0_STATUS+i*4,low, high); + for (i = 0; i < nr_mce_banks; i++) { + rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); if (high & (1<<31)) { + char misc[20]; + char addr[24]; + misc[0] = addr[0] = '\0'; if (high & (1<<29)) recover |= 1; if (high & (1<<25)) recover |= 2; - printk (KERN_EMERG "Bank %d: %08x%08x", i, high, low); high &= ~(1<<31); if (high & (1<<27)) { - rdmsr (MSR_IA32_MC0_MISC+i*4, alow, ahigh); - printk ("[%08x%08x]", ahigh, alow); + rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); + snprintf(misc, 20, "[%08x%08x]", ahigh, alow); } if (high & (1<<26)) { - rdmsr (MSR_IA32_MC0_ADDR+i*4, alow, ahigh); - printk (" at %08x%08x", ahigh, alow); + rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); + snprintf(addr, 24, " at %08x%08x", ahigh, alow); } - printk ("\n"); + printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", + smp_processor_id(), i, high, low, misc, addr); } } diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c index 9e424b6c293d..3d428d5afc52 100644 --- a/arch/x86/kernel/cpu/mcheck/winchip.c +++ b/arch/x86/kernel/cpu/mcheck/winchip.c @@ -15,7 +15,7 @@ #include "mce.h" /* Machine check handler for WinChip C6 */ -static fastcall void winchip_machine_check(struct pt_regs * regs, long error_code) +static void winchip_machine_check(struct pt_regs * regs, long error_code) { printk(KERN_EMERG "CPU0: Machine Check Exception.\n"); add_taint(TAINT_MACHINE_CHECK); diff --git a/arch/x86/kernel/cpu/mtrr/amd.c b/arch/x86/kernel/cpu/mtrr/amd.c index 0949cdbf848a..ee2331b0e58f 100644 --- a/arch/x86/kernel/cpu/mtrr/amd.c +++ b/arch/x86/kernel/cpu/mtrr/amd.c @@ -53,8 +53,6 @@ static void amd_set_mtrr(unsigned int reg, unsigned long base, <base> The base address of the region. <size> The size of the region. If this is 0 the region is disabled. <type> The type of the region. - <do_safe> If TRUE, do the change safely. If FALSE, safety measures should - be done externally. [RETURNS] Nothing. */ { diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c index 9964be3de2b7..8e139c70f888 100644 --- a/arch/x86/kernel/cpu/mtrr/cyrix.c +++ b/arch/x86/kernel/cpu/mtrr/cyrix.c @@ -4,6 +4,7 @@ #include <asm/msr.h> #include <asm/io.h> #include <asm/processor-cyrix.h> +#include <asm/processor-flags.h> #include "mtrr.h" int arr3_protected; @@ -142,7 +143,7 @@ static void prepare_set(void) /* Disable and flush caches. 
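The k7, p4 and p6 machine-check rewrites above collect the optional MISC and ADDR words with snprintf() and then emit one printk() per bank, so every line carries its own KERN_EMERG level and cannot be interleaved with output from another CPU. The buffer sizes check out as follows:

/* misc[20]: "[" + 16 hex digits + "]" + NUL  = 19 bytes
 * addr[24]: " at " + 16 hex digits + NUL     = 21 bytes
 * Both snprintf() calls therefore fit, and a bank without MISC/ADDR data
 * leaves misc[0] == addr[0] == '\0', so nothing extra is printed. */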
Note that wbinvd flushes the TLBs as a side-effect */ - cr0 = read_cr0() | 0x40000000; + cr0 = read_cr0() | X86_CR0_CD; wbinvd(); write_cr0(cr0); wbinvd(); diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 992f08dfbb6c..103d61a59b19 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -9,11 +9,12 @@ #include <asm/msr.h> #include <asm/system.h> #include <asm/cpufeature.h> +#include <asm/processor-flags.h> #include <asm/tlbflush.h> #include "mtrr.h" struct mtrr_state { - struct mtrr_var_range *var_ranges; + struct mtrr_var_range var_ranges[MAX_VAR_RANGES]; mtrr_type fixed_ranges[NUM_FIXED_RANGES]; unsigned char enabled; unsigned char have_fixed; @@ -85,12 +86,6 @@ void __init get_mtrr_state(void) struct mtrr_var_range *vrs; unsigned lo, dummy; - if (!mtrr_state.var_ranges) { - mtrr_state.var_ranges = kmalloc(num_var_ranges * sizeof (struct mtrr_var_range), - GFP_KERNEL); - if (!mtrr_state.var_ranges) - return; - } vrs = mtrr_state.var_ranges; rdmsr(MTRRcap_MSR, lo, dummy); @@ -188,7 +183,7 @@ static inline void k8_enable_fixed_iorrs(void) * \param changed pointer which indicates whether the MTRR needed to be changed * \param msrwords pointer to the MSR values which the MSR should have */ -static void set_fixed_range(int msr, int * changed, unsigned int * msrwords) +static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords) { unsigned lo, hi; @@ -200,7 +195,7 @@ static void set_fixed_range(int msr, int * changed, unsigned int * msrwords) ((msrwords[0] | msrwords[1]) & K8_MTRR_RDMEM_WRMEM_MASK)) k8_enable_fixed_iorrs(); mtrr_wrmsr(msr, msrwords[0], msrwords[1]); - *changed = TRUE; + *changed = true; } } @@ -260,7 +255,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, static int set_fixed_ranges(mtrr_type * frs) { unsigned long long *saved = (unsigned long long *) frs; - int changed = FALSE; + bool changed = false; int block=-1, range; while (fixed_range_blocks[++block].ranges) @@ -273,17 +268,17 @@ static int set_fixed_ranges(mtrr_type * frs) /* Set the MSR pair relating to a var range. Returns TRUE if changes are made */ -static int set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr) +static bool set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr) { unsigned int lo, hi; - int changed = FALSE; + bool changed = false; rdmsr(MTRRphysBase_MSR(index), lo, hi); if ((vr->base_lo & 0xfffff0ffUL) != (lo & 0xfffff0ffUL) || (vr->base_hi & (size_and_mask >> (32 - PAGE_SHIFT))) != (hi & (size_and_mask >> (32 - PAGE_SHIFT)))) { mtrr_wrmsr(MTRRphysBase_MSR(index), vr->base_lo, vr->base_hi); - changed = TRUE; + changed = true; } rdmsr(MTRRphysMask_MSR(index), lo, hi); @@ -292,7 +287,7 @@ static int set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr) || (vr->mask_hi & (size_and_mask >> (32 - PAGE_SHIFT))) != (hi & (size_and_mask >> (32 - PAGE_SHIFT)))) { mtrr_wrmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi); - changed = TRUE; + changed = true; } return changed; } @@ -350,7 +345,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock) spin_lock(&set_atomicity_lock); /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */ - cr0 = read_cr0() | 0x40000000; /* set CD flag */ + cr0 = read_cr0() | X86_CR0_CD; write_cr0(cr0); wbinvd(); @@ -417,8 +412,6 @@ static void generic_set_mtrr(unsigned int reg, unsigned long base, <base> The base address of the region. <size> The size of the region. If this is 0 the region is disabled. 
<type> The type of the region. - <do_safe> If TRUE, do the change safely. If FALSE, safety measures should - be done externally. [RETURNS] Nothing. */ { diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c index c7d8f1756745..91e150acb46c 100644 --- a/arch/x86/kernel/cpu/mtrr/if.c +++ b/arch/x86/kernel/cpu/mtrr/if.c @@ -11,10 +11,6 @@ #include <asm/mtrr.h> #include "mtrr.h" -/* RED-PEN: this is accessed without any locking */ -extern unsigned int *usage_table; - - #define FILE_FCOUNT(f) (((struct seq_file *)((f)->private_data))->private) static const char *const mtrr_strings[MTRR_NUM_TYPES] = @@ -37,7 +33,7 @@ const char *mtrr_attrib_to_str(int x) static int mtrr_file_add(unsigned long base, unsigned long size, - unsigned int type, char increment, struct file *file, int page) + unsigned int type, bool increment, struct file *file, int page) { int reg, max; unsigned int *fcount = FILE_FCOUNT(file); @@ -55,7 +51,7 @@ mtrr_file_add(unsigned long base, unsigned long size, base >>= PAGE_SHIFT; size >>= PAGE_SHIFT; } - reg = mtrr_add_page(base, size, type, 1); + reg = mtrr_add_page(base, size, type, true); if (reg >= 0) ++fcount[reg]; return reg; @@ -141,7 +137,7 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos) size >>= PAGE_SHIFT; err = mtrr_add_page((unsigned long) base, (unsigned long) size, i, - 1); + true); if (err < 0) return err; return len; @@ -217,7 +213,7 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; err = - mtrr_file_add(sentry.base, sentry.size, sentry.type, 1, + mtrr_file_add(sentry.base, sentry.size, sentry.type, true, file, 0); break; case MTRRIOC_SET_ENTRY: @@ -226,7 +222,7 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) #endif if (!capable(CAP_SYS_ADMIN)) return -EPERM; - err = mtrr_add(sentry.base, sentry.size, sentry.type, 0); + err = mtrr_add(sentry.base, sentry.size, sentry.type, false); break; case MTRRIOC_DEL_ENTRY: #ifdef CONFIG_COMPAT @@ -270,7 +266,7 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; err = - mtrr_file_add(sentry.base, sentry.size, sentry.type, 1, + mtrr_file_add(sentry.base, sentry.size, sentry.type, true, file, 1); break; case MTRRIOC_SET_PAGE_ENTRY: @@ -279,7 +275,8 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) #endif if (!capable(CAP_SYS_ADMIN)) return -EPERM; - err = mtrr_add_page(sentry.base, sentry.size, sentry.type, 0); + err = + mtrr_add_page(sentry.base, sentry.size, sentry.type, false); break; case MTRRIOC_DEL_PAGE_ENTRY: #ifdef CONFIG_COMPAT @@ -396,7 +393,7 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset) for (i = 0; i < max; i++) { mtrr_if->get(i, &base, &size, &type); if (size == 0) - usage_table[i] = 0; + mtrr_usage_table[i] = 0; else { if (size < (0x100000 >> PAGE_SHIFT)) { /* less than 1MB */ @@ -410,7 +407,7 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset) len += seq_printf(seq, "reg%02i: base=0x%05lx000 (%4luMB), size=%4lu%cB: %s, count=%d\n", i, base, base >> (20 - PAGE_SHIFT), size, factor, - mtrr_attrib_to_str(type), usage_table[i]); + mtrr_attrib_to_str(type), mtrr_usage_table[i]); } } return 0; diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 3b20613325dc..715919582657 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -38,8 +38,8 @@ #include <linux/cpu.h> #include <linux/mutex.h> +#include 
<asm/e820.h> #include <asm/mtrr.h> - #include <asm/uaccess.h> #include <asm/processor.h> #include <asm/msr.h> @@ -47,7 +47,7 @@ u32 num_var_ranges = 0; -unsigned int *usage_table; +unsigned int mtrr_usage_table[MAX_VAR_RANGES]; static DEFINE_MUTEX(mtrr_mutex); u64 size_or_mask, size_and_mask; @@ -121,13 +121,8 @@ static void __init init_table(void) int i, max; max = num_var_ranges; - if ((usage_table = kmalloc(max * sizeof *usage_table, GFP_KERNEL)) - == NULL) { - printk(KERN_ERR "mtrr: could not allocate\n"); - return; - } for (i = 0; i < max; i++) - usage_table[i] = 1; + mtrr_usage_table[i] = 1; } struct set_mtrr_data { @@ -311,7 +306,7 @@ static void set_mtrr(unsigned int reg, unsigned long base, */ int mtrr_add_page(unsigned long base, unsigned long size, - unsigned int type, char increment) + unsigned int type, bool increment) { int i, replace, error; mtrr_type ltype; @@ -349,7 +344,7 @@ int mtrr_add_page(unsigned long base, unsigned long size, replace = -1; /* No CPU hotplug when we change MTRR entries */ - lock_cpu_hotplug(); + get_online_cpus(); /* Search for existing MTRR */ mutex_lock(&mtrr_mutex); for (i = 0; i < num_var_ranges; ++i) { @@ -383,7 +378,7 @@ int mtrr_add_page(unsigned long base, unsigned long size, goto out; } if (increment) - ++usage_table[i]; + ++mtrr_usage_table[i]; error = i; goto out; } @@ -391,13 +386,15 @@ int mtrr_add_page(unsigned long base, unsigned long size, i = mtrr_if->get_free_region(base, size, replace); if (i >= 0) { set_mtrr(i, base, size, type); - if (likely(replace < 0)) - usage_table[i] = 1; - else { - usage_table[i] = usage_table[replace] + !!increment; + if (likely(replace < 0)) { + mtrr_usage_table[i] = 1; + } else { + mtrr_usage_table[i] = mtrr_usage_table[replace]; + if (increment) + mtrr_usage_table[i]++; if (unlikely(replace != i)) { set_mtrr(replace, 0, 0, 0); - usage_table[replace] = 0; + mtrr_usage_table[replace] = 0; } } } else @@ -405,7 +402,7 @@ int mtrr_add_page(unsigned long base, unsigned long size, error = i; out: mutex_unlock(&mtrr_mutex); - unlock_cpu_hotplug(); + put_online_cpus(); return error; } @@ -460,7 +457,7 @@ static int mtrr_check(unsigned long base, unsigned long size) int mtrr_add(unsigned long base, unsigned long size, unsigned int type, - char increment) + bool increment) { if (mtrr_check(base, size)) return -EINVAL; @@ -495,7 +492,7 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size) max = num_var_ranges; /* No CPU hotplug when we change MTRR entries */ - lock_cpu_hotplug(); + get_online_cpus(); mutex_lock(&mtrr_mutex); if (reg < 0) { /* Search for existing MTRR */ @@ -527,16 +524,16 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size) printk(KERN_WARNING "mtrr: MTRR %d not used\n", reg); goto out; } - if (usage_table[reg] < 1) { + if (mtrr_usage_table[reg] < 1) { printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg); goto out; } - if (--usage_table[reg] < 1) + if (--mtrr_usage_table[reg] < 1) set_mtrr(reg, 0, 0, 0); error = reg; out: mutex_unlock(&mtrr_mutex); - unlock_cpu_hotplug(); + put_online_cpus(); return error; } /** @@ -591,16 +588,11 @@ struct mtrr_value { unsigned long lsize; }; -static struct mtrr_value * mtrr_state; +static struct mtrr_value mtrr_state[MAX_VAR_RANGES]; static int mtrr_save(struct sys_device * sysdev, pm_message_t state) { int i; - int size = num_var_ranges * sizeof(struct mtrr_value); - - mtrr_state = kzalloc(size,GFP_ATOMIC); - if (!mtrr_state) - return -ENOMEM; for (i = 0; i < num_var_ranges; i++) { mtrr_if->get(i, @@ -622,7 +614,6 @@ static 
int mtrr_restore(struct sys_device * sysdev) mtrr_state[i].lsize, mtrr_state[i].ltype); } - kfree(mtrr_state); return 0; } @@ -633,6 +624,112 @@ static struct sysdev_driver mtrr_sysdev_driver = { .resume = mtrr_restore, }; +static int disable_mtrr_trim; + +static int __init disable_mtrr_trim_setup(char *str) +{ + disable_mtrr_trim = 1; + return 0; +} +early_param("disable_mtrr_trim", disable_mtrr_trim_setup); + +/* + * Newer AMD K8s and later CPUs have a special magic MSR way to force WB + * for memory >4GB. Check for that here. + * Note this won't check whether the MTRRs below 4GB, where the magic bit + * doesn't apply, are wrong; so far we don't know of any such case in the wild. + */ +#define Tom2Enabled (1U << 21) +#define Tom2ForceMemTypeWB (1U << 22) + +static __init int amd_special_default_mtrr(void) +{ + u32 l, h; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) + return 0; + if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11) + return 0; + /* In case some hypervisor doesn't pass SYSCFG through */ + if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0) + return 0; + /* + * Memory between 4GB and top of mem is forced WB by this magic bit. + * Reserved before K8RevF, but should be zero there. + */ + if ((l & (Tom2Enabled | Tom2ForceMemTypeWB)) == + (Tom2Enabled | Tom2ForceMemTypeWB)) + return 1; + return 0; +} + +/** + * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs + * + * Some buggy BIOSes don't set up the MTRRs properly for systems with certain + * memory configurations. This routine checks that the highest MTRR matches + * the end of memory, to make sure that the MTRRs with a write-back type cover + * all of the memory the kernel intends to use. If not, it'll trim any + * memory off the end by adjusting end_pfn, removing it from the kernel's + * allocation pools, and warning the user with an obnoxious message. 
+ */ +int __init mtrr_trim_uncached_memory(unsigned long end_pfn) +{ + unsigned long i, base, size, highest_addr = 0, def, dummy; + mtrr_type type; + u64 trim_start, trim_size; + + /* + * Make sure we only trim uncachable memory on machines that + * support the Intel MTRR architecture: + */ + if (!is_cpu(INTEL) || disable_mtrr_trim) + return 0; + rdmsr(MTRRdefType_MSR, def, dummy); + def &= 0xff; + if (def != MTRR_TYPE_UNCACHABLE) + return 0; + + if (amd_special_default_mtrr()) + return 0; + + /* Find highest cached pfn */ + for (i = 0; i < num_var_ranges; i++) { + mtrr_if->get(i, &base, &size, &type); + if (type != MTRR_TYPE_WRBACK) + continue; + base <<= PAGE_SHIFT; + size <<= PAGE_SHIFT; + if (highest_addr < base + size) + highest_addr = base + size; + } + + /* kvm/qemu doesn't have mtrr set right, don't trim them all */ + if (!highest_addr) { + printk(KERN_WARNING "WARNING: strange, CPU MTRRs all blank?\n"); + WARN_ON(1); + return 0; + } + + if ((highest_addr >> PAGE_SHIFT) < end_pfn) { + printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover" + " all of memory, losing %LdMB of RAM.\n", + (((u64)end_pfn << PAGE_SHIFT) - highest_addr) >> 20); + + WARN_ON(1); + + printk(KERN_INFO "update e820 for mtrr\n"); + trim_start = highest_addr; + trim_size = end_pfn; + trim_size <<= PAGE_SHIFT; + trim_size -= trim_start; + add_memory_region(trim_start, trim_size, E820_RESERVED); + update_e820(); + return 1; + } + + return 0; +} /** * mtrr_bp_init - initialize mtrrs on the boot CPU diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index 289dfe6030e3..fb74a2c20814 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -2,10 +2,8 @@ * local mtrr defines. */ -#ifndef TRUE -#define TRUE 1 -#define FALSE 0 -#endif +#include <linux/types.h> +#include <linux/stddef.h> #define MTRRcap_MSR 0x0fe #define MTRRdefType_MSR 0x2ff @@ -14,6 +12,7 @@ #define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1) #define NUM_FIXED_RANGES 88 +#define MAX_VAR_RANGES 256 #define MTRRfix64K_00000_MSR 0x250 #define MTRRfix16K_80000_MSR 0x258 #define MTRRfix16K_A0000_MSR 0x259 @@ -34,6 +33,8 @@ an 8 bit field: */ typedef u8 mtrr_type; +extern unsigned int mtrr_usage_table[MAX_VAR_RANGES]; + struct mtrr_ops { u32 vendor; u32 use_intel_if; diff --git a/arch/x86/kernel/cpu/mtrr/state.c b/arch/x86/kernel/cpu/mtrr/state.c index 49e20c2afcdf..9f8ba923d1c9 100644 --- a/arch/x86/kernel/cpu/mtrr/state.c +++ b/arch/x86/kernel/cpu/mtrr/state.c @@ -4,6 +4,7 @@ #include <asm/mtrr.h> #include <asm/msr.h> #include <asm/processor-cyrix.h> +#include <asm/processor-flags.h> #include "mtrr.h" @@ -25,7 +26,7 @@ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt) /* Disable and flush caches. 
Note that wbinvd flushes the TLBs as a side-effect */ - cr0 = read_cr0() | 0x40000000; + cr0 = read_cr0() | X86_CR0_CD; wbinvd(); write_cr0(cr0); wbinvd(); diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index c02541e6e653..9b838324b818 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -167,7 +167,6 @@ void release_evntsel_nmi(unsigned int msr) clear_bit(counter, evntsel_nmi_owner); } -EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi); EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit); EXPORT_SYMBOL(reserve_perfctr_nmi); EXPORT_SYMBOL(release_perfctr_nmi); diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 3900e46d66db..028213260148 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -188,7 +188,7 @@ static void *c_next(struct seq_file *m, void *v, loff_t *pos) static void c_stop(struct seq_file *m, void *v) { } -struct seq_operations cpuinfo_op = { +const struct seq_operations cpuinfo_op = { .start = c_start, .next = c_next, .stop = c_stop, diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 05c9936a16cc..dec66e452810 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -50,7 +50,7 @@ struct cpuid_command { static void cpuid_smp_cpuid(void *cmd_block) { - struct cpuid_command *cmd = (struct cpuid_command *)cmd_block; + struct cpuid_command *cmd = cmd_block; cpuid(cmd->reg, &cmd->data[0], &cmd->data[1], &cmd->data[2], &cmd->data[3]); @@ -157,15 +157,15 @@ static int __cpuinit cpuid_class_cpu_callback(struct notifier_block *nfb, switch (action) { case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: err = cpuid_device_create(cpu); break; case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: case CPU_DEAD: - case CPU_DEAD_FROZEN: cpuid_device_destroy(cpu); break; + case CPU_UP_CANCELED_FROZEN: + destroy_suspended_device(cpuid_class, MKDEV(CPUID_MAJOR, cpu)); + break; } return err ? 
NOTIFY_BAD : NOTIFY_OK; } diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c index 40978af630e7..a47798b59f07 100644 --- a/arch/x86/kernel/doublefault_32.c +++ b/arch/x86/kernel/doublefault_32.c @@ -17,7 +17,7 @@ static unsigned long doublefault_stack[DOUBLEFAULT_STACKSIZE]; static void doublefault_fn(void) { - struct Xgt_desc_struct gdt_desc = {0, 0}; + struct desc_ptr gdt_desc = {0, 0}; unsigned long gdt, tss; store_gdt(&gdt_desc); @@ -33,14 +33,15 @@ static void doublefault_fn(void) printk(KERN_EMERG "double fault, tss at %08lx\n", tss); if (ptr_ok(tss)) { - struct i386_hw_tss *t = (struct i386_hw_tss *)tss; + struct x86_hw_tss *t = (struct x86_hw_tss *)tss; - printk(KERN_EMERG "eip = %08lx, esp = %08lx\n", t->eip, t->esp); + printk(KERN_EMERG "eip = %08lx, esp = %08lx\n", + t->ip, t->sp); printk(KERN_EMERG "eax = %08lx, ebx = %08lx, ecx = %08lx, edx = %08lx\n", - t->eax, t->ebx, t->ecx, t->edx); + t->ax, t->bx, t->cx, t->dx); printk(KERN_EMERG "esi = %08lx, edi = %08lx\n", - t->esi, t->edi); + t->si, t->di); } } @@ -50,15 +51,15 @@ static void doublefault_fn(void) struct tss_struct doublefault_tss __cacheline_aligned = { .x86_tss = { - .esp0 = STACK_START, + .sp0 = STACK_START, .ss0 = __KERNEL_DS, .ldt = 0, .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, - .eip = (unsigned long) doublefault_fn, + .ip = (unsigned long) doublefault_fn, /* 0x2 bit is always set */ - .eflags = X86_EFLAGS_SF | 0x2, - .esp = STACK_START, + .flags = X86_EFLAGS_SF | 0x2, + .sp = STACK_START, .es = __USER_DS, .cs = __KERNEL_CS, .ss = __KERNEL_DS, diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c new file mode 100644 index 000000000000..1c5ca4d18787 --- /dev/null +++ b/arch/x86/kernel/ds.c @@ -0,0 +1,464 @@ +/* + * Debug Store support + * + * This provides a low-level interface to the hardware's Debug Store + * feature that is used for last branch recording (LBR) and + * precise-event based sampling (PEBS). + * + * Different architectures use a different DS layout/pointer size. + * The below functions therefore work on a void*. + * + * + * Since there is no user for PEBS, yet, only LBR (or branch + * trace store, BTS) is supported. + * + * + * Copyright (C) 2007 Intel Corporation. + * Markus Metzger <markus.t.metzger@intel.com>, Dec 2007 + */ + +#include <asm/ds.h> + +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/slab.h> + + +/* + * Debug Store (DS) save area configuration (see Intel64 and IA32 + * Architectures Software Developer's Manual, section 18.5) + * + * The DS configuration consists of the following fields; different + * architetures vary in the size of those fields. + * - double-word aligned base linear address of the BTS buffer + * - write pointer into the BTS buffer + * - end linear address of the BTS buffer (one byte beyond the end of + * the buffer) + * - interrupt pointer into BTS buffer + * (interrupt occurs when write pointer passes interrupt pointer) + * - double-word aligned base linear address of the PEBS buffer + * - write pointer into the PEBS buffer + * - end linear address of the PEBS buffer (one byte beyond the end of + * the buffer) + * - interrupt pointer into PEBS buffer + * (interrupt occurs when write pointer passes interrupt pointer) + * - value to which counter is reset following counter overflow + * + * On later architectures, the last branch recording hardware uses + * 64bit pointers even in 32bit mode. + * + * + * Branch Trace Store (BTS) records store information about control + * flow changes. 
They at least provide the following information: + * - source linear address + * - destination linear address + * + * Netburst supported a predicated bit that had been dropped in later + * architectures. We do not support it. + * + * + * In order to abstract from the actual DS and BTS layout, we describe + * the access to the relevant fields. + * Thanks to Andi Kleen for proposing this design. + * + * The implementation, however, is not as general as it might seem. In + * order to stay somewhat simple and efficient, we assume an + * underlying unsigned type (mostly a pointer type) and we expect the + * field to be at least as big as that type. + */ + +/* + * A special from_ip address to indicate that the BTS record is an + * info record that needs to be interpreted or skipped. + */ +#define BTS_ESCAPE_ADDRESS (-1) + +/* + * A field access descriptor + */ +struct access_desc { + unsigned char offset; + unsigned char size; +}; + +/* + * The configuration for a particular DS/BTS hardware implementation. + */ +struct ds_configuration { + /* the DS configuration */ + unsigned char sizeof_ds; + struct access_desc bts_buffer_base; + struct access_desc bts_index; + struct access_desc bts_absolute_maximum; + struct access_desc bts_interrupt_threshold; + /* the BTS configuration */ + unsigned char sizeof_bts; + struct access_desc from_ip; + struct access_desc to_ip; + /* BTS variants used to store additional information like + timestamps */ + struct access_desc info_type; + struct access_desc info_data; + unsigned long debugctl_mask; +}; + +/* + * The global configuration used by the below accessor functions + */ +static struct ds_configuration ds_cfg; + +/* + * Accessor functions for some DS and BTS fields using the above + * global ds_cfg. + */ +static inline unsigned long get_bts_buffer_base(char *base) +{ + return *(unsigned long *)(base + ds_cfg.bts_buffer_base.offset); +} +static inline void set_bts_buffer_base(char *base, unsigned long value) +{ + (*(unsigned long *)(base + ds_cfg.bts_buffer_base.offset)) = value; +} +static inline unsigned long get_bts_index(char *base) +{ + return *(unsigned long *)(base + ds_cfg.bts_index.offset); +} +static inline void set_bts_index(char *base, unsigned long value) +{ + (*(unsigned long *)(base + ds_cfg.bts_index.offset)) = value; +} +static inline unsigned long get_bts_absolute_maximum(char *base) +{ + return *(unsigned long *)(base + ds_cfg.bts_absolute_maximum.offset); +} +static inline void set_bts_absolute_maximum(char *base, unsigned long value) +{ + (*(unsigned long *)(base + ds_cfg.bts_absolute_maximum.offset)) = value; +} +static inline unsigned long get_bts_interrupt_threshold(char *base) +{ + return *(unsigned long *)(base + ds_cfg.bts_interrupt_threshold.offset); +} +static inline void set_bts_interrupt_threshold(char *base, unsigned long value) +{ + (*(unsigned long *)(base + ds_cfg.bts_interrupt_threshold.offset)) = value; +} +static inline unsigned long get_from_ip(char *base) +{ + return *(unsigned long *)(base + ds_cfg.from_ip.offset); +} +static inline void set_from_ip(char *base, unsigned long value) +{ + (*(unsigned long *)(base + ds_cfg.from_ip.offset)) = value; +} +static inline unsigned long get_to_ip(char *base) +{ + return *(unsigned long *)(base + ds_cfg.to_ip.offset); +} +static inline void set_to_ip(char *base, unsigned long value) +{ + (*(unsigned long *)(base + ds_cfg.to_ip.offset)) = value; +} +static inline unsigned char get_info_type(char *base) +{ + return *(unsigned char *)(base + ds_cfg.info_type.offset); +}
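The ds.c accessors above all follow one pattern: each DS/BTS field is reached through an (offset, size) descriptor held in ds_cfg, so the same code drives both the 32-bit (Netburst, Pentium M) and 64-bit (Core2) record layouts selected later in ds_init_intel(). The stand-alone sketch below models that idea outside the kernel; the names used here (field_desc, layout, get_field, set_field) are invented for illustration and are not part of the patch.

#include <stdio.h>
#include <string.h>

/*
 * A field is described by where it lives inside a raw record
 * (this mirrors struct access_desc above; all names are illustrative).
 */
struct field_desc {
	unsigned char offset;
	unsigned char size;
};

/* Per-layout configuration: same logical fields, different offsets/widths. */
struct layout {
	size_t record_size;		/* sizeof_bts analogue */
	struct field_desc from_ip;
	struct field_desc to_ip;
};

static const struct layout layout32 = {  8, { 0, 4 }, { 4, 4 } };
static const struct layout layout64 = { 16, { 0, 8 }, { 8, 8 } };

/*
 * Generic accessors: read/write a field at its configured offset.
 * memcpy() with the configured size assumes a little-endian host,
 * which is fine for a model of x86 code.
 */
static unsigned long long get_field(const struct field_desc *f, const char *rec)
{
	unsigned long long v = 0;

	memcpy(&v, rec + f->offset, f->size);
	return v;
}

static void set_field(const struct field_desc *f, char *rec, unsigned long long v)
{
	memcpy(rec + f->offset, &v, f->size);
}

int main(void)
{
	const struct layout *layouts[] = { &layout32, &layout64 };
	unsigned int i;

	for (i = 0; i < 2; i++) {
		const struct layout *l = layouts[i];
		char rec[16] = { 0 };	/* big enough for either layout */

		set_field(&l->from_ip, rec, 0x1000);
		set_field(&l->to_ip, rec, 0x2000);
		printf("record size %zu: from=%#llx to=%#llx\n",
		       l->record_size,
		       get_field(&l->from_ip, rec),
		       get_field(&l->to_ip, rec));
	}
	return 0;
}

On little-endian x86 the kernel accessors can use the plain unsigned long casts seen above because, as the comment notes, each layout guarantees those fields are at least as wide as an unsigned long; the memcpy() in this sketch is simply the portable way to express the same offset-based lookup.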
+static inline void set_info_type(char *base, unsigned char value) +{ + (*(unsigned char *)(base + ds_cfg.info_type.offset)) = value; +} +static inline unsigned long get_info_data(char *base) +{ + return *(unsigned long *)(base + ds_cfg.info_data.offset); +} +static inline void set_info_data(char *base, unsigned long value) +{ + (*(unsigned long *)(base + ds_cfg.info_data.offset)) = value; +} + + +int ds_allocate(void **dsp, size_t bts_size_in_bytes) +{ + size_t bts_size_in_records; + unsigned long bts; + void *ds; + + if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) + return -EOPNOTSUPP; + + if (bts_size_in_bytes < 0) + return -EINVAL; + + bts_size_in_records = + bts_size_in_bytes / ds_cfg.sizeof_bts; + bts_size_in_bytes = + bts_size_in_records * ds_cfg.sizeof_bts; + + if (bts_size_in_bytes <= 0) + return -EINVAL; + + bts = (unsigned long)kzalloc(bts_size_in_bytes, GFP_KERNEL); + + if (!bts) + return -ENOMEM; + + ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL); + + if (!ds) { + kfree((void *)bts); + return -ENOMEM; + } + + set_bts_buffer_base(ds, bts); + set_bts_index(ds, bts); + set_bts_absolute_maximum(ds, bts + bts_size_in_bytes); + set_bts_interrupt_threshold(ds, bts + bts_size_in_bytes + 1); + + *dsp = ds; + return 0; +} + +int ds_free(void **dsp) +{ + if (*dsp) + kfree((void *)get_bts_buffer_base(*dsp)); + kfree(*dsp); + *dsp = 0; + + return 0; +} + +int ds_get_bts_size(void *ds) +{ + int size_in_bytes; + + if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) + return -EOPNOTSUPP; + + if (!ds) + return 0; + + size_in_bytes = + get_bts_absolute_maximum(ds) - + get_bts_buffer_base(ds); + return size_in_bytes; +} + +int ds_get_bts_end(void *ds) +{ + int size_in_bytes = ds_get_bts_size(ds); + + if (size_in_bytes <= 0) + return size_in_bytes; + + return size_in_bytes / ds_cfg.sizeof_bts; +} + +int ds_get_bts_index(void *ds) +{ + int index_offset_in_bytes; + + if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) + return -EOPNOTSUPP; + + index_offset_in_bytes = + get_bts_index(ds) - + get_bts_buffer_base(ds); + + return index_offset_in_bytes / ds_cfg.sizeof_bts; +} + +int ds_set_overflow(void *ds, int method) +{ + switch (method) { + case DS_O_SIGNAL: + return -EOPNOTSUPP; + case DS_O_WRAP: + return 0; + default: + return -EINVAL; + } +} + +int ds_get_overflow(void *ds) +{ + return DS_O_WRAP; +} + +int ds_clear(void *ds) +{ + int bts_size = ds_get_bts_size(ds); + unsigned long bts_base; + + if (bts_size <= 0) + return bts_size; + + bts_base = get_bts_buffer_base(ds); + memset((void *)bts_base, 0, bts_size); + + set_bts_index(ds, bts_base); + return 0; +} + +int ds_read_bts(void *ds, int index, struct bts_struct *out) +{ + void *bts; + + if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) + return -EOPNOTSUPP; + + if (index < 0) + return -EINVAL; + + if (index >= ds_get_bts_size(ds)) + return -EINVAL; + + bts = (void *)(get_bts_buffer_base(ds) + (index * ds_cfg.sizeof_bts)); + + memset(out, 0, sizeof(*out)); + if (get_from_ip(bts) == BTS_ESCAPE_ADDRESS) { + out->qualifier = get_info_type(bts); + out->variant.jiffies = get_info_data(bts); + } else { + out->qualifier = BTS_BRANCH; + out->variant.lbr.from_ip = get_from_ip(bts); + out->variant.lbr.to_ip = get_to_ip(bts); + } + + return sizeof(*out);; +} + +int ds_write_bts(void *ds, const struct bts_struct *in) +{ + unsigned long bts; + + if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) + return -EOPNOTSUPP; + + if (ds_get_bts_size(ds) <= 0) + return -ENXIO; + + bts = get_bts_index(ds); + + memset((void *)bts, 0, ds_cfg.sizeof_bts); + switch (in->qualifier) { + case BTS_INVALID: 
+ break; + + case BTS_BRANCH: + set_from_ip((void *)bts, in->variant.lbr.from_ip); + set_to_ip((void *)bts, in->variant.lbr.to_ip); + break; + + case BTS_TASK_ARRIVES: + case BTS_TASK_DEPARTS: + set_from_ip((void *)bts, BTS_ESCAPE_ADDRESS); + set_info_type((void *)bts, in->qualifier); + set_info_data((void *)bts, in->variant.jiffies); + break; + + default: + return -EINVAL; + } + + bts = bts + ds_cfg.sizeof_bts; + if (bts >= get_bts_absolute_maximum(ds)) + bts = get_bts_buffer_base(ds); + set_bts_index(ds, bts); + + return ds_cfg.sizeof_bts; +} + +unsigned long ds_debugctl_mask(void) +{ + return ds_cfg.debugctl_mask; +} + +#ifdef __i386__ +static const struct ds_configuration ds_cfg_netburst = { + .sizeof_ds = 9 * 4, + .bts_buffer_base = { 0, 4 }, + .bts_index = { 4, 4 }, + .bts_absolute_maximum = { 8, 4 }, + .bts_interrupt_threshold = { 12, 4 }, + .sizeof_bts = 3 * 4, + .from_ip = { 0, 4 }, + .to_ip = { 4, 4 }, + .info_type = { 4, 1 }, + .info_data = { 8, 4 }, + .debugctl_mask = (1<<2)|(1<<3) +}; + +static const struct ds_configuration ds_cfg_pentium_m = { + .sizeof_ds = 9 * 4, + .bts_buffer_base = { 0, 4 }, + .bts_index = { 4, 4 }, + .bts_absolute_maximum = { 8, 4 }, + .bts_interrupt_threshold = { 12, 4 }, + .sizeof_bts = 3 * 4, + .from_ip = { 0, 4 }, + .to_ip = { 4, 4 }, + .info_type = { 4, 1 }, + .info_data = { 8, 4 }, + .debugctl_mask = (1<<6)|(1<<7) +}; +#endif /* _i386_ */ + +static const struct ds_configuration ds_cfg_core2 = { + .sizeof_ds = 9 * 8, + .bts_buffer_base = { 0, 8 }, + .bts_index = { 8, 8 }, + .bts_absolute_maximum = { 16, 8 }, + .bts_interrupt_threshold = { 24, 8 }, + .sizeof_bts = 3 * 8, + .from_ip = { 0, 8 }, + .to_ip = { 8, 8 }, + .info_type = { 8, 1 }, + .info_data = { 16, 8 }, + .debugctl_mask = (1<<6)|(1<<7)|(1<<9) +}; + +static inline void +ds_configure(const struct ds_configuration *cfg) +{ + ds_cfg = *cfg; +} + +void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) +{ + switch (c->x86) { + case 0x6: + switch (c->x86_model) { +#ifdef __i386__ + case 0xD: + case 0xE: /* Pentium M */ + ds_configure(&ds_cfg_pentium_m); + break; +#endif /* _i386_ */ + case 0xF: /* Core2 */ + ds_configure(&ds_cfg_core2); + break; + default: + /* sorry, don't know about them */ + break; + } + break; + case 0xF: + switch (c->x86_model) { +#ifdef __i386__ + case 0x0: + case 0x1: + case 0x2: /* Netburst */ + ds_configure(&ds_cfg_netburst); + break; +#endif /* _i386_ */ + default: + /* sorry, don't know about them */ + break; + } + break; + default: + /* sorry, don't know about them */ + break; + } +} diff --git a/arch/x86/kernel/e820_32.c b/arch/x86/kernel/e820_32.c index 18f500d185a2..4e16ef4a2659 100644 --- a/arch/x86/kernel/e820_32.c +++ b/arch/x86/kernel/e820_32.c @@ -7,7 +7,6 @@ #include <linux/kexec.h> #include <linux/module.h> #include <linux/mm.h> -#include <linux/efi.h> #include <linux/pfn.h> #include <linux/uaccess.h> #include <linux/suspend.h> @@ -17,11 +16,6 @@ #include <asm/e820.h> #include <asm/setup.h> -#ifdef CONFIG_EFI -int efi_enabled = 0; -EXPORT_SYMBOL(efi_enabled); -#endif - struct e820map e820; struct change_member { struct e820entry *pbios; /* pointer to original bios entry */ @@ -37,26 +31,6 @@ unsigned long pci_mem_start = 0x10000000; EXPORT_SYMBOL(pci_mem_start); #endif extern int user_defined_memmap; -struct resource data_resource = { - .name = "Kernel data", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM -}; - -struct resource code_resource = { - .name = "Kernel code", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM 
-}; - -struct resource bss_resource = { - .name = "Kernel bss", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM -}; static struct resource system_rom_resource = { .name = "System ROM", @@ -111,60 +85,6 @@ static struct resource video_rom_resource = { .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM }; -static struct resource video_ram_resource = { - .name = "Video RAM area", - .start = 0xa0000, - .end = 0xbffff, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM -}; - -static struct resource standard_io_resources[] = { { - .name = "dma1", - .start = 0x0000, - .end = 0x001f, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "pic1", - .start = 0x0020, - .end = 0x0021, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "timer0", - .start = 0x0040, - .end = 0x0043, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "timer1", - .start = 0x0050, - .end = 0x0053, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "keyboard", - .start = 0x0060, - .end = 0x006f, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "dma page reg", - .start = 0x0080, - .end = 0x008f, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "pic2", - .start = 0x00a0, - .end = 0x00a1, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "dma2", - .start = 0x00c0, - .end = 0x00df, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "fpu", - .start = 0x00f0, - .end = 0x00ff, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -} }; - #define ROMSIGNATURE 0xaa55 static int __init romsignature(const unsigned char *rom) @@ -260,10 +180,9 @@ static void __init probe_roms(void) * Request address space for all standard RAM and ROM resources * and also for regions reported as reserved by the e820. */ -static void __init -legacy_init_iomem_resources(struct resource *code_resource, - struct resource *data_resource, - struct resource *bss_resource) +void __init init_iomem_resources(struct resource *code_resource, + struct resource *data_resource, + struct resource *bss_resource) { int i; @@ -305,35 +224,6 @@ legacy_init_iomem_resources(struct resource *code_resource, } } -/* - * Request address space for all standard resources - * - * This is called just before pcibios_init(), which is also a - * subsys_initcall, but is linked in later (in arch/i386/pci/common.c). - */ -static int __init request_standard_resources(void) -{ - int i; - - printk("Setting up standard PCI resources\n"); - if (efi_enabled) - efi_initialize_iomem_resources(&code_resource, - &data_resource, &bss_resource); - else - legacy_init_iomem_resources(&code_resource, - &data_resource, &bss_resource); - - /* EFI systems may still have VGA */ - request_resource(&iomem_resource, &video_ram_resource); - - /* request I/O space for devices used on all i[345]86 PCs */ - for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++) - request_resource(&ioport_resource, &standard_io_resources[i]); - return 0; -} - -subsys_initcall(request_standard_resources); - #if defined(CONFIG_PM) && defined(CONFIG_HIBERNATION) /** * e820_mark_nosave_regions - Find the ranges of physical addresses that do not @@ -370,19 +260,17 @@ void __init add_memory_region(unsigned long long start, { int x; - if (!efi_enabled) { - x = e820.nr_map; - - if (x == E820MAX) { - printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); - return; - } + x = e820.nr_map; - e820.map[x].addr = start; - e820.map[x].size = size; - e820.map[x].type = type; - e820.nr_map++; + if (x == E820MAX) { + printk(KERN_ERR "Ooops! 
Too many entries in the memory map!\n"); + return; } + + e820.map[x].addr = start; + e820.map[x].size = size; + e820.map[x].type = type; + e820.nr_map++; } /* add_memory_region */ /* @@ -598,29 +486,6 @@ int __init copy_e820_map(struct e820entry * biosmap, int nr_map) } /* - * Callback for efi_memory_walk. - */ -static int __init -efi_find_max_pfn(unsigned long start, unsigned long end, void *arg) -{ - unsigned long *max_pfn = arg, pfn; - - if (start < end) { - pfn = PFN_UP(end -1); - if (pfn > *max_pfn) - *max_pfn = pfn; - } - return 0; -} - -static int __init -efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg) -{ - memory_present(0, PFN_UP(start), PFN_DOWN(end)); - return 0; -} - -/* * Find the highest page frame number we have available */ void __init find_max_pfn(void) @@ -628,11 +493,6 @@ void __init find_max_pfn(void) int i; max_pfn = 0; - if (efi_enabled) { - efi_memmap_walk(efi_find_max_pfn, &max_pfn); - efi_memmap_walk(efi_memory_present_wrapper, NULL); - return; - } for (i = 0; i < e820.nr_map; i++) { unsigned long start, end; @@ -650,34 +510,12 @@ void __init find_max_pfn(void) } /* - * Free all available memory for boot time allocation. Used - * as a callback function by efi_memory_walk() - */ - -static int __init -free_available_memory(unsigned long start, unsigned long end, void *arg) -{ - /* check max_low_pfn */ - if (start >= (max_low_pfn << PAGE_SHIFT)) - return 0; - if (end >= (max_low_pfn << PAGE_SHIFT)) - end = max_low_pfn << PAGE_SHIFT; - if (start < end) - free_bootmem(start, end - start); - - return 0; -} -/* * Register fully available low RAM pages with the bootmem allocator. */ void __init register_bootmem_low_pages(unsigned long max_low_pfn) { int i; - if (efi_enabled) { - efi_memmap_walk(free_available_memory, NULL); - return; - } for (i = 0; i < e820.nr_map; i++) { unsigned long curr_pfn, last_pfn, size; /* @@ -785,56 +623,12 @@ void __init print_memory_map(char *who) } } -static __init __always_inline void efi_limit_regions(unsigned long long size) -{ - unsigned long long current_addr = 0; - efi_memory_desc_t *md, *next_md; - void *p, *p1; - int i, j; - - j = 0; - p1 = memmap.map; - for (p = p1, i = 0; p < memmap.map_end; p += memmap.desc_size, i++) { - md = p; - next_md = p1; - current_addr = md->phys_addr + - PFN_PHYS(md->num_pages); - if (is_available_memory(md)) { - if (md->phys_addr >= size) continue; - memcpy(next_md, md, memmap.desc_size); - if (current_addr >= size) { - next_md->num_pages -= - PFN_UP(current_addr-size); - } - p1 += memmap.desc_size; - next_md = p1; - j++; - } else if ((md->attribute & EFI_MEMORY_RUNTIME) == - EFI_MEMORY_RUNTIME) { - /* In order to make runtime services - * available we have to include runtime - * memory regions in memory map */ - memcpy(next_md, md, memmap.desc_size); - p1 += memmap.desc_size; - next_md = p1; - j++; - } - } - memmap.nr_map = j; - memmap.map_end = memmap.map + - (memmap.nr_map * memmap.desc_size); -} - void __init limit_regions(unsigned long long size) { unsigned long long current_addr; int i; print_memory_map("limit_regions start"); - if (efi_enabled) { - efi_limit_regions(size); - return; - } for (i = 0; i < e820.nr_map; i++) { current_addr = e820.map[i].addr + e820.map[i].size; if (current_addr < size) @@ -955,3 +749,14 @@ static int __init parse_memmap(char *arg) return 0; } early_param("memmap", parse_memmap); +void __init update_e820(void) +{ + u8 nr_map; + + nr_map = e820.nr_map; + if (sanitize_e820_map(e820.map, &nr_map)) + return; + e820.nr_map = nr_map; + 
printk(KERN_INFO "modified physical RAM map:\n"); + print_memory_map("modified"); +} diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c index 04698e0b056c..c617174e8963 100644 --- a/arch/x86/kernel/e820_64.c +++ b/arch/x86/kernel/e820_64.c @@ -1,4 +1,4 @@ -/* +/* * Handle the memory map. * The functions here do the job until bootmem takes over. * @@ -26,80 +26,87 @@ #include <asm/proto.h> #include <asm/setup.h> #include <asm/sections.h> +#include <asm/kdebug.h> struct e820map e820; -/* +/* * PFN of last memory page. */ -unsigned long end_pfn; -EXPORT_SYMBOL(end_pfn); +unsigned long end_pfn; -/* +/* * end_pfn only includes RAM, while end_pfn_map includes all e820 entries. * The direct mapping extends to end_pfn_map, so that we can directly access * apertures, ACPI and other tables without having to play with fixmaps. - */ -unsigned long end_pfn_map; + */ +unsigned long end_pfn_map; -/* +/* * Last pfn which the user wants to use. */ static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT; -extern struct resource code_resource, data_resource, bss_resource; - -/* Check for some hardcoded bad areas that early boot is not allowed to touch */ -static inline int bad_addr(unsigned long *addrp, unsigned long size) -{ - unsigned long addr = *addrp, last = addr + size; - - /* various gunk below that needed for SMP startup */ - if (addr < 0x8000) { - *addrp = PAGE_ALIGN(0x8000); - return 1; - } - - /* direct mapping tables of the kernel */ - if (last >= table_start<<PAGE_SHIFT && addr < table_end<<PAGE_SHIFT) { - *addrp = PAGE_ALIGN(table_end << PAGE_SHIFT); - return 1; - } - - /* initrd */ -#ifdef CONFIG_BLK_DEV_INITRD - if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) { - unsigned long ramdisk_image = boot_params.hdr.ramdisk_image; - unsigned long ramdisk_size = boot_params.hdr.ramdisk_size; - unsigned long ramdisk_end = ramdisk_image+ramdisk_size; - - if (last >= ramdisk_image && addr < ramdisk_end) { - *addrp = PAGE_ALIGN(ramdisk_end); - return 1; - } - } +/* + * Early reserved memory areas. 
+ */ +#define MAX_EARLY_RES 20 + +struct early_res { + unsigned long start, end; +}; +static struct early_res early_res[MAX_EARLY_RES] __initdata = { + { 0, PAGE_SIZE }, /* BIOS data page */ +#ifdef CONFIG_SMP + { SMP_TRAMPOLINE_BASE, SMP_TRAMPOLINE_BASE + 2*PAGE_SIZE }, #endif - /* kernel code */ - if (last >= __pa_symbol(&_text) && addr < __pa_symbol(&_end)) { - *addrp = PAGE_ALIGN(__pa_symbol(&_end)); - return 1; + {} +}; + +void __init reserve_early(unsigned long start, unsigned long end) +{ + int i; + struct early_res *r; + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { + r = &early_res[i]; + if (end > r->start && start < r->end) + panic("Overlapping early reservations %lx-%lx to %lx-%lx\n", + start, end, r->start, r->end); } + if (i >= MAX_EARLY_RES) + panic("Too many early reservations"); + r = &early_res[i]; + r->start = start; + r->end = end; +} - if (last >= ebda_addr && addr < ebda_addr + ebda_size) { - *addrp = PAGE_ALIGN(ebda_addr + ebda_size); - return 1; +void __init early_res_to_bootmem(void) +{ + int i; + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { + struct early_res *r = &early_res[i]; + reserve_bootmem_generic(r->start, r->end - r->start); } +} -#ifdef CONFIG_NUMA - /* NUMA memory to node map */ - if (last >= nodemap_addr && addr < nodemap_addr + nodemap_size) { - *addrp = nodemap_addr + nodemap_size; - return 1; +/* Check for already reserved areas */ +static inline int bad_addr(unsigned long *addrp, unsigned long size) +{ + int i; + unsigned long addr = *addrp, last; + int changed = 0; +again: + last = addr + size; + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { + struct early_res *r = &early_res[i]; + if (last >= r->start && addr < r->end) { + *addrp = addr = r->end; + changed = 1; + goto again; + } } -#endif - /* XXX ramdisk image here? */ - return 0; -} + return changed; +} /* * This function checks if any part of the range <start,end> is mapped @@ -107,16 +114,18 @@ static inline int bad_addr(unsigned long *addrp, unsigned long size) */ int e820_any_mapped(unsigned long start, unsigned long end, unsigned type) -{ +{ int i; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - if (type && ei->type != type) + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + + if (type && ei->type != type) continue; if (ei->addr >= end || ei->addr + ei->size <= start) - continue; - return 1; - } + continue; + return 1; + } return 0; } EXPORT_SYMBOL_GPL(e820_any_mapped); @@ -127,11 +136,14 @@ EXPORT_SYMBOL_GPL(e820_any_mapped); * Note: this function only works correct if the e820 table is sorted and * not-overlapping, which is the case */ -int __init e820_all_mapped(unsigned long start, unsigned long end, unsigned type) +int __init e820_all_mapped(unsigned long start, unsigned long end, + unsigned type) { int i; + for (i = 0; i < e820.nr_map; i++) { struct e820entry *ei = &e820.map[i]; + if (type && ei->type != type) continue; /* is the region (part) in overlap with the current region ?*/ @@ -143,65 +155,73 @@ int __init e820_all_mapped(unsigned long start, unsigned long end, unsigned type */ if (ei->addr <= start) start = ei->addr + ei->size; - /* if start is now at or beyond end, we're done, full coverage */ + /* + * if start is now at or beyond end, we're done, full + * coverage + */ if (start >= end) - return 1; /* we're done */ + return 1; } return 0; } -/* - * Find a free area in a specific range. 
- */ -unsigned long __init find_e820_area(unsigned long start, unsigned long end, unsigned size) -{ - int i; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - unsigned long addr = ei->addr, last; - if (ei->type != E820_RAM) - continue; - if (addr < start) +/* + * Find a free area in a specific range. + */ +unsigned long __init find_e820_area(unsigned long start, unsigned long end, + unsigned size) +{ + int i; + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + unsigned long addr = ei->addr, last; + + if (ei->type != E820_RAM) + continue; + if (addr < start) addr = start; - if (addr > ei->addr + ei->size) - continue; + if (addr > ei->addr + ei->size) + continue; while (bad_addr(&addr, size) && addr+size <= ei->addr+ei->size) ; last = PAGE_ALIGN(addr) + size; if (last > ei->addr + ei->size) continue; - if (last > end) + if (last > end) continue; - return addr; - } - return -1UL; -} + return addr; + } + return -1UL; +} /* * Find the highest page frame number we have available */ unsigned long __init e820_end_of_ram(void) { - unsigned long end_pfn = 0; + unsigned long end_pfn; + end_pfn = find_max_pfn_with_active_regions(); - - if (end_pfn > end_pfn_map) + + if (end_pfn > end_pfn_map) end_pfn_map = end_pfn; if (end_pfn_map > MAXMEM>>PAGE_SHIFT) end_pfn_map = MAXMEM>>PAGE_SHIFT; if (end_pfn > end_user_pfn) end_pfn = end_user_pfn; - if (end_pfn > end_pfn_map) - end_pfn = end_pfn_map; + if (end_pfn > end_pfn_map) + end_pfn = end_pfn_map; - printk("end_pfn_map = %lu\n", end_pfn_map); - return end_pfn; + printk(KERN_INFO "end_pfn_map = %lu\n", end_pfn_map); + return end_pfn; } /* * Mark e820 reserved areas as busy for the resource manager. */ -void __init e820_reserve_resources(void) +void __init e820_reserve_resources(struct resource *code_resource, + struct resource *data_resource, struct resource *bss_resource) { int i; for (i = 0; i < e820.nr_map; i++) { @@ -219,13 +239,13 @@ void __init e820_reserve_resources(void) request_resource(&iomem_resource, res); if (e820.map[i].type == E820_RAM) { /* - * We don't know which RAM region contains kernel data, - * so we try it repeatedly and let the resource manager - * test it. + * We don't know which RAM region contains kernel data, + * so we try it repeatedly and let the resource manager + * test it. */ - request_resource(res, &code_resource); - request_resource(res, &data_resource); - request_resource(res, &bss_resource); + request_resource(res, code_resource); + request_resource(res, data_resource); + request_resource(res, bss_resource); #ifdef CONFIG_KEXEC if (crashk_res.start != crashk_res.end) request_resource(res, &crashk_res); @@ -322,9 +342,9 @@ e820_register_active_regions(int nid, unsigned long start_pfn, add_active_range(nid, ei_startpfn, ei_endpfn); } -/* +/* * Add a memory region to the kernel e820 map. 
- */ + */ void __init add_memory_region(unsigned long start, unsigned long size, int type) { int x = e820.nr_map; @@ -349,9 +369,7 @@ unsigned long __init e820_hole_size(unsigned long start, unsigned long end) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long end_pfn = end >> PAGE_SHIFT; - unsigned long ei_startpfn; - unsigned long ei_endpfn; - unsigned long ram = 0; + unsigned long ei_startpfn, ei_endpfn, ram = 0; int i; for (i = 0; i < e820.nr_map; i++) { @@ -363,28 +381,31 @@ unsigned long __init e820_hole_size(unsigned long start, unsigned long end) return end - start - (ram << PAGE_SHIFT); } -void __init e820_print_map(char *who) +static void __init e820_print_map(char *who) { int i; for (i = 0; i < e820.nr_map; i++) { printk(KERN_INFO " %s: %016Lx - %016Lx ", who, - (unsigned long long) e820.map[i].addr, - (unsigned long long) (e820.map[i].addr + e820.map[i].size)); + (unsigned long long) e820.map[i].addr, + (unsigned long long) + (e820.map[i].addr + e820.map[i].size)); switch (e820.map[i].type) { - case E820_RAM: printk("(usable)\n"); - break; + case E820_RAM: + printk(KERN_CONT "(usable)\n"); + break; case E820_RESERVED: - printk("(reserved)\n"); - break; + printk(KERN_CONT "(reserved)\n"); + break; case E820_ACPI: - printk("(ACPI data)\n"); - break; + printk(KERN_CONT "(ACPI data)\n"); + break; case E820_NVS: - printk("(ACPI NVS)\n"); - break; - default: printk("type %u\n", e820.map[i].type); - break; + printk(KERN_CONT "(ACPI NVS)\n"); + break; + default: + printk(KERN_CONT "type %u\n", e820.map[i].type); + break; } } } @@ -392,11 +413,11 @@ void __init e820_print_map(char *who) /* * Sanitize the BIOS e820 map. * - * Some e820 responses include overlapping entries. The following + * Some e820 responses include overlapping entries. The following * replaces the original e820 map with a new one, removing overlaps. * */ -static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) +static int __init sanitize_e820_map(struct e820entry *biosmap, char *pnr_map) { struct change_member { struct e820entry *pbios; /* pointer to original bios entry */ @@ -416,7 +437,8 @@ static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) int i; /* - Visually we're performing the following (1,2,3,4 = memory types)... + Visually we're performing the following + (1,2,3,4 = memory types)... 
Sample memory map (w/overlaps): ____22__________________ @@ -458,22 +480,23 @@ static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) old_nr = *pnr_map; /* bail out if we find any unreasonable addresses in bios map */ - for (i=0; i<old_nr; i++) + for (i = 0; i < old_nr; i++) if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) return -1; /* create pointers for initial change-point information (for sorting) */ - for (i=0; i < 2*old_nr; i++) + for (i = 0; i < 2 * old_nr; i++) change_point[i] = &change_point_list[i]; /* record all known change-points (starting and ending addresses), omitting those that are for empty memory regions */ chgidx = 0; - for (i=0; i < old_nr; i++) { + for (i = 0; i < old_nr; i++) { if (biosmap[i].size != 0) { change_point[chgidx]->addr = biosmap[i].addr; change_point[chgidx++]->pbios = &biosmap[i]; - change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size; + change_point[chgidx]->addr = biosmap[i].addr + + biosmap[i].size; change_point[chgidx++]->pbios = &biosmap[i]; } } @@ -483,75 +506,106 @@ static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) still_changing = 1; while (still_changing) { still_changing = 0; - for (i=1; i < chg_nr; i++) { - /* if <current_addr> > <last_addr>, swap */ - /* or, if current=<start_addr> & last=<end_addr>, swap */ - if ((change_point[i]->addr < change_point[i-1]->addr) || - ((change_point[i]->addr == change_point[i-1]->addr) && - (change_point[i]->addr == change_point[i]->pbios->addr) && - (change_point[i-1]->addr != change_point[i-1]->pbios->addr)) - ) - { + for (i = 1; i < chg_nr; i++) { + unsigned long long curaddr, lastaddr; + unsigned long long curpbaddr, lastpbaddr; + + curaddr = change_point[i]->addr; + lastaddr = change_point[i - 1]->addr; + curpbaddr = change_point[i]->pbios->addr; + lastpbaddr = change_point[i - 1]->pbios->addr; + + /* + * swap entries, when: + * + * curaddr > lastaddr or + * curaddr == lastaddr and curaddr == curpbaddr and + * lastaddr != lastpbaddr + */ + if (curaddr < lastaddr || + (curaddr == lastaddr && curaddr == curpbaddr && + lastaddr != lastpbaddr)) { change_tmp = change_point[i]; change_point[i] = change_point[i-1]; change_point[i-1] = change_tmp; - still_changing=1; + still_changing = 1; } } } /* create a new bios memory map, removing overlaps */ - overlap_entries=0; /* number of entries in the overlap table */ - new_bios_entry=0; /* index for creating new bios map entries */ + overlap_entries = 0; /* number of entries in the overlap table */ + new_bios_entry = 0; /* index for creating new bios map entries */ last_type = 0; /* start with undefined memory type */ last_addr = 0; /* start with 0 as last starting address */ + /* loop through change-points, determining affect on the new bios map */ - for (chgidx=0; chgidx < chg_nr; chgidx++) - { + for (chgidx = 0; chgidx < chg_nr; chgidx++) { /* keep track of all overlapping bios entries */ - if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr) - { - /* add map entry to overlap list (> 1 entry implies an overlap) */ - overlap_list[overlap_entries++]=change_point[chgidx]->pbios; - } - else - { - /* remove entry from list (order independent, so swap with last) */ - for (i=0; i<overlap_entries; i++) - { - if (overlap_list[i] == change_point[chgidx]->pbios) - overlap_list[i] = overlap_list[overlap_entries-1]; + if (change_point[chgidx]->addr == + change_point[chgidx]->pbios->addr) { + /* + * add map entry to overlap list (> 1 entry + * implies an overlap) + */ + 
overlap_list[overlap_entries++] = + change_point[chgidx]->pbios; + } else { + /* + * remove entry from list (order independent, + * so swap with last) + */ + for (i = 0; i < overlap_entries; i++) { + if (overlap_list[i] == + change_point[chgidx]->pbios) + overlap_list[i] = + overlap_list[overlap_entries-1]; } overlap_entries--; } - /* if there are overlapping entries, decide which "type" to use */ - /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */ + /* + * if there are overlapping entries, decide which + * "type" to use (larger value takes precedence -- + * 1=usable, 2,3,4,4+=unusable) + */ current_type = 0; - for (i=0; i<overlap_entries; i++) + for (i = 0; i < overlap_entries; i++) if (overlap_list[i]->type > current_type) current_type = overlap_list[i]->type; - /* continue building up new bios map based on this information */ + /* + * continue building up new bios map based on this + * information + */ if (current_type != last_type) { if (last_type != 0) { new_bios[new_bios_entry].size = change_point[chgidx]->addr - last_addr; - /* move forward only if the new size was non-zero */ + /* + * move forward only if the new size + * was non-zero + */ if (new_bios[new_bios_entry].size != 0) + /* + * no more space left for new + * bios entries ? + */ if (++new_bios_entry >= E820MAX) - break; /* no more space left for new bios entries */ + break; } if (current_type != 0) { - new_bios[new_bios_entry].addr = change_point[chgidx]->addr; + new_bios[new_bios_entry].addr = + change_point[chgidx]->addr; new_bios[new_bios_entry].type = current_type; - last_addr=change_point[chgidx]->addr; + last_addr = change_point[chgidx]->addr; } last_type = current_type; } } - new_nr = new_bios_entry; /* retain count for new bios entries */ + /* retain count for new bios entries */ + new_nr = new_bios_entry; /* copy new bios mapping into original location */ - memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry)); + memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry)); *pnr_map = new_nr; return 0; @@ -566,7 +620,7 @@ static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) * will have given us a memory map that we can use to properly * set up memory. If we aren't, we'll fake a memory map. */ -static int __init copy_e820_map(struct e820entry * biosmap, int nr_map) +static int __init copy_e820_map(struct e820entry *biosmap, int nr_map) { /* Only one memory region (or negative)? Ignore it */ if (nr_map < 2) @@ -583,18 +637,20 @@ static int __init copy_e820_map(struct e820entry * biosmap, int nr_map) return -1; add_memory_region(start, size, type); - } while (biosmap++,--nr_map); + } while (biosmap++, --nr_map); return 0; } -void early_panic(char *msg) +static void early_panic(char *msg) { early_printk(msg); panic(msg); } -void __init setup_memory_region(void) +/* We're not void only for x86 32-bit compat */ +char * __init machine_specific_memory_setup(void) { + char *who = "BIOS-e820"; /* * Try to copy the BIOS-supplied E820-map. * @@ -605,7 +661,10 @@ void __init setup_memory_region(void) if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0) early_panic("Cannot find a valid memory map"); printk(KERN_INFO "BIOS-provided physical RAM map:\n"); - e820_print_map("BIOS-e820"); + e820_print_map(who); + + /* In case someone cares... 
*/ + return who; } static int __init parse_memopt(char *p) @@ -613,9 +672,9 @@ static int __init parse_memopt(char *p) if (!p) return -EINVAL; end_user_pfn = memparse(p, &p); - end_user_pfn >>= PAGE_SHIFT; + end_user_pfn >>= PAGE_SHIFT; return 0; -} +} early_param("mem", parse_memopt); static int userdef __initdata; @@ -627,9 +686,9 @@ static int __init parse_memmap_opt(char *p) if (!strcmp(p, "exactmap")) { #ifdef CONFIG_CRASH_DUMP - /* If we are doing a crash dump, we - * still need to know the real mem - * size before original memory map is + /* + * If we are doing a crash dump, we still need to know + * the real mem size before original memory map is * reset. */ e820_register_active_regions(0, 0, -1UL); @@ -646,6 +705,8 @@ static int __init parse_memmap_opt(char *p) mem_size = memparse(p, &p); if (p == oldp) return -EINVAL; + + userdef = 1; if (*p == '@') { start_at = memparse(p+1, &p); add_memory_region(start_at, mem_size, E820_RAM); @@ -665,11 +726,29 @@ early_param("memmap", parse_memmap_opt); void __init finish_e820_parsing(void) { if (userdef) { + char nr = e820.nr_map; + + if (sanitize_e820_map(e820.map, &nr) < 0) + early_panic("Invalid user supplied memory map"); + e820.nr_map = nr; + printk(KERN_INFO "user-defined physical RAM map:\n"); e820_print_map("user"); } } +void __init update_e820(void) +{ + u8 nr_map; + + nr_map = e820.nr_map; + if (sanitize_e820_map(e820.map, &nr_map)) + return; + e820.nr_map = nr_map; + printk(KERN_INFO "modified physical RAM map:\n"); + e820_print_map("modified"); +} + unsigned long pci_mem_start = 0xaeedbabe; EXPORT_SYMBOL(pci_mem_start); @@ -713,8 +792,10 @@ __init void e820_setup_gap(void) if (!found) { gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024; - printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit address range\n" - KERN_ERR "PCI: Unassigned devices with 32bit resource registers may break!\n"); + printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit " + "address range\n" + KERN_ERR "PCI: Unassigned devices with 32bit resource " + "registers may break!\n"); } /* @@ -727,8 +808,9 @@ __init void e820_setup_gap(void) /* Fun with two's complement */ pci_mem_start = (gapstart + round) & -round; - printk(KERN_INFO "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n", - pci_mem_start, gapstart, gapsize); + printk(KERN_INFO + "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n", + pci_mem_start, gapstart, gapsize); } int __init arch_get_ram_range(int slot, u64 *addr, u64 *size) diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 88bb83ec895f..9f51e1ea9e82 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -21,7 +21,33 @@ #include <asm/gart.h> #endif -static void __init via_bugs(void) +static void __init fix_hypertransport_config(int num, int slot, int func) +{ + u32 htcfg; + /* + * we found a hypertransport bus + * make sure that we are broadcasting + * interrupts to all cpus on the ht bus + * if we're using extended apic ids + */ + htcfg = read_pci_config(num, slot, func, 0x68); + if (htcfg & (1 << 18)) { + printk(KERN_INFO "Detected use of extended apic ids " + "on hypertransport bus\n"); + if ((htcfg & (1 << 17)) == 0) { + printk(KERN_INFO "Enabling hypertransport extended " + "apic interrupt broadcast\n"); + printk(KERN_INFO "Note this is a bios bug, " + "please contact your hw vendor\n"); + htcfg |= (1 << 17); + write_pci_config(num, slot, func, 0x68, htcfg); + } + } + + +} + +static void __init via_bugs(int num, int slot, int func) { #ifdef 
CONFIG_GART_IOMMU if ((end_pfn > MAX_DMA32_PFN || force_iommu) && @@ -44,7 +70,7 @@ static int __init nvidia_hpet_check(struct acpi_table_header *header) #endif /* CONFIG_X86_IO_APIC */ #endif /* CONFIG_ACPI */ -static void __init nvidia_bugs(void) +static void __init nvidia_bugs(int num, int slot, int func) { #ifdef CONFIG_ACPI #ifdef CONFIG_X86_IO_APIC @@ -72,7 +98,7 @@ static void __init nvidia_bugs(void) } -static void __init ati_bugs(void) +static void __init ati_bugs(int num, int slot, int func) { #ifdef CONFIG_X86_IO_APIC if (timer_over_8254 == 1) { @@ -83,18 +109,67 @@ static void __init ati_bugs(void) #endif } +#define QFLAG_APPLY_ONCE 0x1 +#define QFLAG_APPLIED 0x2 +#define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED) struct chipset { - u16 vendor; - void (*f)(void); + u32 vendor; + u32 device; + u32 class; + u32 class_mask; + u32 flags; + void (*f)(int num, int slot, int func); }; static struct chipset early_qrk[] __initdata = { - { PCI_VENDOR_ID_NVIDIA, nvidia_bugs }, - { PCI_VENDOR_ID_VIA, via_bugs }, - { PCI_VENDOR_ID_ATI, ati_bugs }, + { PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID, + PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, nvidia_bugs }, + { PCI_VENDOR_ID_VIA, PCI_ANY_ID, + PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, via_bugs }, + { PCI_VENDOR_ID_ATI, PCI_ANY_ID, + PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, ati_bugs }, + { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB, + PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, fix_hypertransport_config }, {} }; +static void __init check_dev_quirk(int num, int slot, int func) +{ + u16 class; + u16 vendor; + u16 device; + u8 type; + int i; + + class = read_pci_config_16(num, slot, func, PCI_CLASS_DEVICE); + + if (class == 0xffff) + return; + + vendor = read_pci_config_16(num, slot, func, PCI_VENDOR_ID); + + device = read_pci_config_16(num, slot, func, PCI_DEVICE_ID); + + for (i = 0; early_qrk[i].f != NULL; i++) { + if (((early_qrk[i].vendor == PCI_ANY_ID) || + (early_qrk[i].vendor == vendor)) && + ((early_qrk[i].device == PCI_ANY_ID) || + (early_qrk[i].device == device)) && + (!((early_qrk[i].class ^ class) & + early_qrk[i].class_mask))) { + if ((early_qrk[i].flags & + QFLAG_DONE) != QFLAG_DONE) + early_qrk[i].f(num, slot, func); + early_qrk[i].flags |= QFLAG_APPLIED; + } + } + + type = read_pci_config_byte(num, slot, func, + PCI_HEADER_TYPE); + if (!(type & 0x80)) + return; +} + void __init early_quirks(void) { int num, slot, func; @@ -103,36 +178,8 @@ void __init early_quirks(void) return; /* Poor man's PCI discovery */ - for (num = 0; num < 32; num++) { - for (slot = 0; slot < 32; slot++) { - for (func = 0; func < 8; func++) { - u32 class; - u32 vendor; - u8 type; - int i; - class = read_pci_config(num,slot,func, - PCI_CLASS_REVISION); - if (class == 0xffffffff) - break; - - if ((class >> 16) != PCI_CLASS_BRIDGE_PCI) - continue; - - vendor = read_pci_config(num, slot, func, - PCI_VENDOR_ID); - vendor &= 0xffff; - - for (i = 0; early_qrk[i].f; i++) - if (early_qrk[i].vendor == vendor) { - early_qrk[i].f(); - return; - } - - type = read_pci_config_byte(num, slot, func, - PCI_HEADER_TYPE); - if (!(type & 0x80)) - break; - } - } - } + for (num = 0; num < 32; num++) + for (slot = 0; slot < 32; slot++) + for (func = 0; func < 8; func++) + check_dev_quirk(num, slot, func); } diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c new file mode 100644 index 000000000000..1411324a625c --- /dev/null +++ b/arch/x86/kernel/efi.c @@ -0,0 +1,512 @@ +/* + * Common EFI (Extensible Firmware Interface) support functions + * Based on Extensible 
Firmware Interface Specification version 1.0 + * + * Copyright (C) 1999 VA Linux Systems + * Copyright (C) 1999 Walt Drummond <drummond@valinux.com> + * Copyright (C) 1999-2002 Hewlett-Packard Co. + * David Mosberger-Tang <davidm@hpl.hp.com> + * Stephane Eranian <eranian@hpl.hp.com> + * Copyright (C) 2005-2008 Intel Co. + * Fenghua Yu <fenghua.yu@intel.com> + * Bibo Mao <bibo.mao@intel.com> + * Chandramouli Narayanan <mouli@linux.intel.com> + * Huang Ying <ying.huang@intel.com> + * + * Copied from efi_32.c to eliminate the duplicated code between EFI + * 32/64 support code. --ying 2007-10-26 + * + * All EFI Runtime Services are not implemented yet as EFI only + * supports physical mode addressing on SoftSDV. This is to be fixed + * in a future version. --drummond 1999-07-20 + * + * Implemented EFI runtime services and virtual mode calls. --davidm + * + * Goutham Rao: <goutham.rao@intel.com> + * Skip non-WB memory and ignore empty memory ranges. + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/efi.h> +#include <linux/bootmem.h> +#include <linux/spinlock.h> +#include <linux/uaccess.h> +#include <linux/time.h> +#include <linux/io.h> +#include <linux/reboot.h> +#include <linux/bcd.h> + +#include <asm/setup.h> +#include <asm/efi.h> +#include <asm/time.h> +#include <asm/cacheflush.h> +#include <asm/tlbflush.h> + +#define EFI_DEBUG 1 +#define PFX "EFI: " + +int efi_enabled; +EXPORT_SYMBOL(efi_enabled); + +struct efi efi; +EXPORT_SYMBOL(efi); + +struct efi_memory_map memmap; + +struct efi efi_phys __initdata; +static efi_system_table_t efi_systab __initdata; + +static int __init setup_noefi(char *arg) +{ + efi_enabled = 0; + return 0; +} +early_param("noefi", setup_noefi); + +static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc) +{ + return efi_call_virt2(get_time, tm, tc); +} + +static efi_status_t virt_efi_set_time(efi_time_t *tm) +{ + return efi_call_virt1(set_time, tm); +} + +static efi_status_t virt_efi_get_wakeup_time(efi_bool_t *enabled, + efi_bool_t *pending, + efi_time_t *tm) +{ + return efi_call_virt3(get_wakeup_time, + enabled, pending, tm); +} + +static efi_status_t virt_efi_set_wakeup_time(efi_bool_t enabled, efi_time_t *tm) +{ + return efi_call_virt2(set_wakeup_time, + enabled, tm); +} + +static efi_status_t virt_efi_get_variable(efi_char16_t *name, + efi_guid_t *vendor, + u32 *attr, + unsigned long *data_size, + void *data) +{ + return efi_call_virt5(get_variable, + name, vendor, attr, + data_size, data); +} + +static efi_status_t virt_efi_get_next_variable(unsigned long *name_size, + efi_char16_t *name, + efi_guid_t *vendor) +{ + return efi_call_virt3(get_next_variable, + name_size, name, vendor); +} + +static efi_status_t virt_efi_set_variable(efi_char16_t *name, + efi_guid_t *vendor, + unsigned long attr, + unsigned long data_size, + void *data) +{ + return efi_call_virt5(set_variable, + name, vendor, attr, + data_size, data); +} + +static efi_status_t virt_efi_get_next_high_mono_count(u32 *count) +{ + return efi_call_virt1(get_next_high_mono_count, count); +} + +static void virt_efi_reset_system(int reset_type, + efi_status_t status, + unsigned long data_size, + efi_char16_t *data) +{ + efi_call_virt4(reset_system, reset_type, status, + data_size, data); +} + +static efi_status_t virt_efi_set_virtual_address_map( + unsigned long memory_map_size, + unsigned long descriptor_size, + u32 descriptor_version, + efi_memory_desc_t *virtual_map) +{ + return efi_call_virt4(set_virtual_address_map, + memory_map_size, descriptor_size, + 
descriptor_version, virtual_map); +} + +static efi_status_t __init phys_efi_set_virtual_address_map( + unsigned long memory_map_size, + unsigned long descriptor_size, + u32 descriptor_version, + efi_memory_desc_t *virtual_map) +{ + efi_status_t status; + + efi_call_phys_prelog(); + status = efi_call_phys4(efi_phys.set_virtual_address_map, + memory_map_size, descriptor_size, + descriptor_version, virtual_map); + efi_call_phys_epilog(); + return status; +} + +static efi_status_t __init phys_efi_get_time(efi_time_t *tm, + efi_time_cap_t *tc) +{ + efi_status_t status; + + efi_call_phys_prelog(); + status = efi_call_phys2(efi_phys.get_time, tm, tc); + efi_call_phys_epilog(); + return status; +} + +int efi_set_rtc_mmss(unsigned long nowtime) +{ + int real_seconds, real_minutes; + efi_status_t status; + efi_time_t eft; + efi_time_cap_t cap; + + status = efi.get_time(&eft, &cap); + if (status != EFI_SUCCESS) { + printk(KERN_ERR "Oops: efitime: can't read time!\n"); + return -1; + } + + real_seconds = nowtime % 60; + real_minutes = nowtime / 60; + if (((abs(real_minutes - eft.minute) + 15)/30) & 1) + real_minutes += 30; + real_minutes %= 60; + eft.minute = real_minutes; + eft.second = real_seconds; + + status = efi.set_time(&eft); + if (status != EFI_SUCCESS) { + printk(KERN_ERR "Oops: efitime: can't write time!\n"); + return -1; + } + return 0; +} + +unsigned long efi_get_time(void) +{ + efi_status_t status; + efi_time_t eft; + efi_time_cap_t cap; + + status = efi.get_time(&eft, &cap); + if (status != EFI_SUCCESS) + printk(KERN_ERR "Oops: efitime: can't read time!\n"); + + return mktime(eft.year, eft.month, eft.day, eft.hour, + eft.minute, eft.second); +} + +#if EFI_DEBUG +static void __init print_efi_memmap(void) +{ + efi_memory_desc_t *md; + void *p; + int i; + + for (p = memmap.map, i = 0; + p < memmap.map_end; + p += memmap.desc_size, i++) { + md = p; + printk(KERN_INFO PFX "mem%02u: type=%u, attr=0x%llx, " + "range=[0x%016llx-0x%016llx) (%lluMB)\n", + i, md->type, md->attribute, md->phys_addr, + md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT), + (md->num_pages >> (20 - EFI_PAGE_SHIFT))); + } +} +#endif /* EFI_DEBUG */ + +void __init efi_init(void) +{ + efi_config_table_t *config_tables; + efi_runtime_services_t *runtime; + efi_char16_t *c16; + char vendor[100] = "unknown"; + int i = 0; + void *tmp; + +#ifdef CONFIG_X86_32 + efi_phys.systab = (efi_system_table_t *)boot_params.efi_info.efi_systab; + memmap.phys_map = (void *)boot_params.efi_info.efi_memmap; +#else + efi_phys.systab = (efi_system_table_t *) + (boot_params.efi_info.efi_systab | + ((__u64)boot_params.efi_info.efi_systab_hi<<32)); + memmap.phys_map = (void *) + (boot_params.efi_info.efi_memmap | + ((__u64)boot_params.efi_info.efi_memmap_hi<<32)); +#endif + memmap.nr_map = boot_params.efi_info.efi_memmap_size / + boot_params.efi_info.efi_memdesc_size; + memmap.desc_version = boot_params.efi_info.efi_memdesc_version; + memmap.desc_size = boot_params.efi_info.efi_memdesc_size; + + efi.systab = early_ioremap((unsigned long)efi_phys.systab, + sizeof(efi_system_table_t)); + if (efi.systab == NULL) + printk(KERN_ERR "Couldn't map the EFI system table!\n"); + memcpy(&efi_systab, efi.systab, sizeof(efi_system_table_t)); + early_iounmap(efi.systab, sizeof(efi_system_table_t)); + efi.systab = &efi_systab; + + /* + * Verify the EFI Table + */ + if (efi.systab->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) + printk(KERN_ERR "EFI system table signature incorrect!\n"); + if ((efi.systab->hdr.revision >> 16) == 0) + printk(KERN_ERR "Warning: EFI 
system table version " + "%d.%02d, expected 1.00 or greater!\n", + efi.systab->hdr.revision >> 16, + efi.systab->hdr.revision & 0xffff); + + /* + * Show what we know for posterity + */ + c16 = tmp = early_ioremap(efi.systab->fw_vendor, 2); + if (c16) { + for (i = 0; i < sizeof(vendor) && *c16; ++i) + vendor[i] = *c16++; + vendor[i] = '\0'; + } else + printk(KERN_ERR PFX "Could not map the firmware vendor!\n"); + early_iounmap(tmp, 2); + + printk(KERN_INFO "EFI v%u.%.02u by %s \n", + efi.systab->hdr.revision >> 16, + efi.systab->hdr.revision & 0xffff, vendor); + + /* + * Let's see what config tables the firmware passed to us. + */ + config_tables = early_ioremap( + efi.systab->tables, + efi.systab->nr_tables * sizeof(efi_config_table_t)); + if (config_tables == NULL) + printk(KERN_ERR "Could not map EFI Configuration Table!\n"); + + printk(KERN_INFO); + for (i = 0; i < efi.systab->nr_tables; i++) { + if (!efi_guidcmp(config_tables[i].guid, MPS_TABLE_GUID)) { + efi.mps = config_tables[i].table; + printk(" MPS=0x%lx ", config_tables[i].table); + } else if (!efi_guidcmp(config_tables[i].guid, + ACPI_20_TABLE_GUID)) { + efi.acpi20 = config_tables[i].table; + printk(" ACPI 2.0=0x%lx ", config_tables[i].table); + } else if (!efi_guidcmp(config_tables[i].guid, + ACPI_TABLE_GUID)) { + efi.acpi = config_tables[i].table; + printk(" ACPI=0x%lx ", config_tables[i].table); + } else if (!efi_guidcmp(config_tables[i].guid, + SMBIOS_TABLE_GUID)) { + efi.smbios = config_tables[i].table; + printk(" SMBIOS=0x%lx ", config_tables[i].table); + } else if (!efi_guidcmp(config_tables[i].guid, + HCDP_TABLE_GUID)) { + efi.hcdp = config_tables[i].table; + printk(" HCDP=0x%lx ", config_tables[i].table); + } else if (!efi_guidcmp(config_tables[i].guid, + UGA_IO_PROTOCOL_GUID)) { + efi.uga = config_tables[i].table; + printk(" UGA=0x%lx ", config_tables[i].table); + } + } + printk("\n"); + early_iounmap(config_tables, + efi.systab->nr_tables * sizeof(efi_config_table_t)); + + /* + * Check out the runtime services table. We need to map + * the runtime services table so that we can grab the physical + * address of several of the EFI runtime functions, needed to + * set the firmware into virtual mode. + */ + runtime = early_ioremap((unsigned long)efi.systab->runtime, + sizeof(efi_runtime_services_t)); + if (runtime != NULL) { + /* + * We will only need *early* access to the following + * two EFI runtime services before set_virtual_address_map + * is invoked. + */ + efi_phys.get_time = (efi_get_time_t *)runtime->get_time; + efi_phys.set_virtual_address_map = + (efi_set_virtual_address_map_t *) + runtime->set_virtual_address_map; + /* + * Make efi_get_time can be called before entering + * virtual mode. 
+ */ + efi.get_time = phys_efi_get_time; + } else + printk(KERN_ERR "Could not map the EFI runtime service " + "table!\n"); + early_iounmap(runtime, sizeof(efi_runtime_services_t)); + + /* Map the EFI memory map */ + memmap.map = early_ioremap((unsigned long)memmap.phys_map, + memmap.nr_map * memmap.desc_size); + if (memmap.map == NULL) + printk(KERN_ERR "Could not map the EFI memory map!\n"); + memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size); + if (memmap.desc_size != sizeof(efi_memory_desc_t)) + printk(KERN_WARNING "Kernel-defined memdesc" + "doesn't match the one from EFI!\n"); + + /* Setup for EFI runtime service */ + reboot_type = BOOT_EFI; + +#if EFI_DEBUG + print_efi_memmap(); +#endif +} + +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) +static void __init runtime_code_page_mkexec(void) +{ + efi_memory_desc_t *md; + unsigned long end; + void *p; + + if (!(__supported_pte_mask & _PAGE_NX)) + return; + + /* Make EFI runtime service code area executable */ + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + md = p; + end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT); + if (md->type == EFI_RUNTIME_SERVICES_CODE && + (end >> PAGE_SHIFT) <= max_pfn_mapped) { + set_memory_x(md->virt_addr, md->num_pages); + set_memory_uc(md->virt_addr, md->num_pages); + } + } + __flush_tlb_all(); +} +#else +static inline void __init runtime_code_page_mkexec(void) { } +#endif + +/* + * This function will switch the EFI runtime services to virtual mode. + * Essentially, look through the EFI memmap and map every region that + * has the runtime attribute bit set in its memory descriptor and update + * that memory descriptor with the virtual address obtained from ioremap(). + * This enables the runtime services to be called without having to + * thunk back into physical mode for every invocation. + */ +void __init efi_enter_virtual_mode(void) +{ + efi_memory_desc_t *md; + efi_status_t status; + unsigned long end; + void *p; + + efi.systab = NULL; + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + md = p; + if (!(md->attribute & EFI_MEMORY_RUNTIME)) + continue; + end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT); + if ((md->attribute & EFI_MEMORY_WB) && + ((end >> PAGE_SHIFT) <= max_pfn_mapped)) + md->virt_addr = (unsigned long)__va(md->phys_addr); + else + md->virt_addr = (unsigned long) + efi_ioremap(md->phys_addr, + md->num_pages << EFI_PAGE_SHIFT); + if (!md->virt_addr) + printk(KERN_ERR PFX "ioremap of 0x%llX failed!\n", + (unsigned long long)md->phys_addr); + if ((md->phys_addr <= (unsigned long)efi_phys.systab) && + ((unsigned long)efi_phys.systab < end)) + efi.systab = (efi_system_table_t *)(unsigned long) + (md->virt_addr - md->phys_addr + + (unsigned long)efi_phys.systab); + } + + BUG_ON(!efi.systab); + + status = phys_efi_set_virtual_address_map( + memmap.desc_size * memmap.nr_map, + memmap.desc_size, + memmap.desc_version, + memmap.phys_map); + + if (status != EFI_SUCCESS) { + printk(KERN_ALERT "Unable to switch EFI into virtual mode " + "(status=%lx)!\n", status); + panic("EFI call to SetVirtualAddressMap() failed!"); + } + + /* + * Now that EFI is in virtual mode, update the function + * pointers in the runtime service table to the new virtual addresses. + * + * Call EFI services through wrapper functions. 
+ */ + efi.get_time = virt_efi_get_time; + efi.set_time = virt_efi_set_time; + efi.get_wakeup_time = virt_efi_get_wakeup_time; + efi.set_wakeup_time = virt_efi_set_wakeup_time; + efi.get_variable = virt_efi_get_variable; + efi.get_next_variable = virt_efi_get_next_variable; + efi.set_variable = virt_efi_set_variable; + efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count; + efi.reset_system = virt_efi_reset_system; + efi.set_virtual_address_map = virt_efi_set_virtual_address_map; + runtime_code_page_mkexec(); + early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size); + memmap.map = NULL; +} + +/* + * Convenience functions to obtain memory types and attributes + */ +u32 efi_mem_type(unsigned long phys_addr) +{ + efi_memory_desc_t *md; + void *p; + + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + md = p; + if ((md->phys_addr <= phys_addr) && + (phys_addr < (md->phys_addr + + (md->num_pages << EFI_PAGE_SHIFT)))) + return md->type; + } + return 0; +} + +u64 efi_mem_attributes(unsigned long phys_addr) +{ + efi_memory_desc_t *md; + void *p; + + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + md = p; + if ((md->phys_addr <= phys_addr) && + (phys_addr < (md->phys_addr + + (md->num_pages << EFI_PAGE_SHIFT)))) + return md->attribute; + } + return 0; +} diff --git a/arch/x86/kernel/efi_32.c b/arch/x86/kernel/efi_32.c index e2be78f49399..cb91f985b4a1 100644 --- a/arch/x86/kernel/efi_32.c +++ b/arch/x86/kernel/efi_32.c @@ -20,40 +20,15 @@ */ #include <linux/kernel.h> -#include <linux/init.h> -#include <linux/mm.h> #include <linux/types.h> -#include <linux/time.h> -#include <linux/spinlock.h> -#include <linux/bootmem.h> #include <linux/ioport.h> -#include <linux/module.h> #include <linux/efi.h> -#include <linux/kexec.h> -#include <asm/setup.h> #include <asm/io.h> #include <asm/page.h> #include <asm/pgtable.h> -#include <asm/processor.h> -#include <asm/desc.h> #include <asm/tlbflush.h> -#define EFI_DEBUG 0 -#define PFX "EFI: " - -extern efi_status_t asmlinkage efi_call_phys(void *, ...); - -struct efi efi; -EXPORT_SYMBOL(efi); -static struct efi efi_phys; -struct efi_memory_map memmap; - -/* - * We require an early boot_ioremap mapping mechanism initially - */ -extern void * boot_ioremap(unsigned long, unsigned long); - /* * To make EFI call EFI runtime service in physical addressing mode we need * prelog/epilog before/after the invocation to disable interrupt, to @@ -62,16 +37,14 @@ extern void * boot_ioremap(unsigned long, unsigned long); */ static unsigned long efi_rt_eflags; -static DEFINE_SPINLOCK(efi_rt_lock); static pgd_t efi_bak_pg_dir_pointer[2]; -static void efi_call_phys_prelog(void) __acquires(efi_rt_lock) +void efi_call_phys_prelog(void) { unsigned long cr4; unsigned long temp; - struct Xgt_desc_struct gdt_descr; + struct desc_ptr gdt_descr; - spin_lock(&efi_rt_lock); local_irq_save(efi_rt_eflags); /* @@ -101,17 +74,17 @@ static void efi_call_phys_prelog(void) __acquires(efi_rt_lock) /* * After the lock is released, the original page table is restored. 
*/ - local_flush_tlb(); + __flush_tlb_all(); gdt_descr.address = __pa(get_cpu_gdt_table(0)); gdt_descr.size = GDT_SIZE - 1; load_gdt(&gdt_descr); } -static void efi_call_phys_epilog(void) __releases(efi_rt_lock) +void efi_call_phys_epilog(void) { unsigned long cr4; - struct Xgt_desc_struct gdt_descr; + struct desc_ptr gdt_descr; gdt_descr.address = (unsigned long)get_cpu_gdt_table(0); gdt_descr.size = GDT_SIZE - 1; @@ -132,586 +105,7 @@ static void efi_call_phys_epilog(void) __releases(efi_rt_lock) /* * After the lock is released, the original page table is restored. */ - local_flush_tlb(); + __flush_tlb_all(); local_irq_restore(efi_rt_eflags); - spin_unlock(&efi_rt_lock); -} - -static efi_status_t -phys_efi_set_virtual_address_map(unsigned long memory_map_size, - unsigned long descriptor_size, - u32 descriptor_version, - efi_memory_desc_t *virtual_map) -{ - efi_status_t status; - - efi_call_phys_prelog(); - status = efi_call_phys(efi_phys.set_virtual_address_map, - memory_map_size, descriptor_size, - descriptor_version, virtual_map); - efi_call_phys_epilog(); - return status; -} - -static efi_status_t -phys_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc) -{ - efi_status_t status; - - efi_call_phys_prelog(); - status = efi_call_phys(efi_phys.get_time, tm, tc); - efi_call_phys_epilog(); - return status; -} - -inline int efi_set_rtc_mmss(unsigned long nowtime) -{ - int real_seconds, real_minutes; - efi_status_t status; - efi_time_t eft; - efi_time_cap_t cap; - - spin_lock(&efi_rt_lock); - status = efi.get_time(&eft, &cap); - spin_unlock(&efi_rt_lock); - if (status != EFI_SUCCESS) - panic("Ooops, efitime: can't read time!\n"); - real_seconds = nowtime % 60; - real_minutes = nowtime / 60; - - if (((abs(real_minutes - eft.minute) + 15)/30) & 1) - real_minutes += 30; - real_minutes %= 60; - - eft.minute = real_minutes; - eft.second = real_seconds; - - if (status != EFI_SUCCESS) { - printk("Ooops: efitime: can't read time!\n"); - return -1; - } - return 0; -} -/* - * This is used during kernel init before runtime - * services have been remapped and also during suspend, therefore, - * we'll need to call both in physical and virtual modes. - */ -inline unsigned long efi_get_time(void) -{ - efi_status_t status; - efi_time_t eft; - efi_time_cap_t cap; - - if (efi.get_time) { - /* if we are in virtual mode use remapped function */ - status = efi.get_time(&eft, &cap); - } else { - /* we are in physical mode */ - status = phys_efi_get_time(&eft, &cap); - } - - if (status != EFI_SUCCESS) - printk("Oops: efitime: can't read time status: 0x%lx\n",status); - - return mktime(eft.year, eft.month, eft.day, eft.hour, - eft.minute, eft.second); -} - -int is_available_memory(efi_memory_desc_t * md) -{ - if (!(md->attribute & EFI_MEMORY_WB)) - return 0; - - switch (md->type) { - case EFI_LOADER_CODE: - case EFI_LOADER_DATA: - case EFI_BOOT_SERVICES_CODE: - case EFI_BOOT_SERVICES_DATA: - case EFI_CONVENTIONAL_MEMORY: - return 1; - } - return 0; -} - -/* - * We need to map the EFI memory map again after paging_init(). 
- */ -void __init efi_map_memmap(void) -{ - memmap.map = NULL; - - memmap.map = bt_ioremap((unsigned long) memmap.phys_map, - (memmap.nr_map * memmap.desc_size)); - if (memmap.map == NULL) - printk(KERN_ERR PFX "Could not remap the EFI memmap!\n"); - - memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size); -} - -#if EFI_DEBUG -static void __init print_efi_memmap(void) -{ - efi_memory_desc_t *md; - void *p; - int i; - - for (p = memmap.map, i = 0; p < memmap.map_end; p += memmap.desc_size, i++) { - md = p; - printk(KERN_INFO "mem%02u: type=%u, attr=0x%llx, " - "range=[0x%016llx-0x%016llx) (%lluMB)\n", - i, md->type, md->attribute, md->phys_addr, - md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT), - (md->num_pages >> (20 - EFI_PAGE_SHIFT))); - } -} -#endif /* EFI_DEBUG */ - -/* - * Walks the EFI memory map and calls CALLBACK once for each EFI - * memory descriptor that has memory that is available for kernel use. - */ -void efi_memmap_walk(efi_freemem_callback_t callback, void *arg) -{ - int prev_valid = 0; - struct range { - unsigned long start; - unsigned long end; - } uninitialized_var(prev), curr; - efi_memory_desc_t *md; - unsigned long start, end; - void *p; - - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; - - if ((md->num_pages == 0) || (!is_available_memory(md))) - continue; - - curr.start = md->phys_addr; - curr.end = curr.start + (md->num_pages << EFI_PAGE_SHIFT); - - if (!prev_valid) { - prev = curr; - prev_valid = 1; - } else { - if (curr.start < prev.start) - printk(KERN_INFO PFX "Unordered memory map\n"); - if (prev.end == curr.start) - prev.end = curr.end; - else { - start = - (unsigned long) (PAGE_ALIGN(prev.start)); - end = (unsigned long) (prev.end & PAGE_MASK); - if ((end > start) - && (*callback) (start, end, arg) < 0) - return; - prev = curr; - } - } - } - if (prev_valid) { - start = (unsigned long) PAGE_ALIGN(prev.start); - end = (unsigned long) (prev.end & PAGE_MASK); - if (end > start) - (*callback) (start, end, arg); - } -} - -void __init efi_init(void) -{ - efi_config_table_t *config_tables; - efi_runtime_services_t *runtime; - efi_char16_t *c16; - char vendor[100] = "unknown"; - unsigned long num_config_tables; - int i = 0; - - memset(&efi, 0, sizeof(efi) ); - memset(&efi_phys, 0, sizeof(efi_phys)); - - efi_phys.systab = - (efi_system_table_t *)boot_params.efi_info.efi_systab; - memmap.phys_map = (void *)boot_params.efi_info.efi_memmap; - memmap.nr_map = boot_params.efi_info.efi_memmap_size/ - boot_params.efi_info.efi_memdesc_size; - memmap.desc_version = boot_params.efi_info.efi_memdesc_version; - memmap.desc_size = boot_params.efi_info.efi_memdesc_size; - - efi.systab = (efi_system_table_t *) - boot_ioremap((unsigned long) efi_phys.systab, - sizeof(efi_system_table_t)); - /* - * Verify the EFI Table - */ - if (efi.systab == NULL) - printk(KERN_ERR PFX "Woah! Couldn't map the EFI system table.\n"); - if (efi.systab->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) - printk(KERN_ERR PFX "Woah! 
EFI system table signature incorrect\n"); - if ((efi.systab->hdr.revision >> 16) == 0) - printk(KERN_ERR PFX "Warning: EFI system table version " - "%d.%02d, expected 1.00 or greater\n", - efi.systab->hdr.revision >> 16, - efi.systab->hdr.revision & 0xffff); - - /* - * Grab some details from the system table - */ - num_config_tables = efi.systab->nr_tables; - config_tables = (efi_config_table_t *)efi.systab->tables; - runtime = efi.systab->runtime; - - /* - * Show what we know for posterity - */ - c16 = (efi_char16_t *) boot_ioremap(efi.systab->fw_vendor, 2); - if (c16) { - for (i = 0; i < (sizeof(vendor) - 1) && *c16; ++i) - vendor[i] = *c16++; - vendor[i] = '\0'; - } else - printk(KERN_ERR PFX "Could not map the firmware vendor!\n"); - - printk(KERN_INFO PFX "EFI v%u.%.02u by %s \n", - efi.systab->hdr.revision >> 16, - efi.systab->hdr.revision & 0xffff, vendor); - - /* - * Let's see what config tables the firmware passed to us. - */ - config_tables = (efi_config_table_t *) - boot_ioremap((unsigned long) config_tables, - num_config_tables * sizeof(efi_config_table_t)); - - if (config_tables == NULL) - printk(KERN_ERR PFX "Could not map EFI Configuration Table!\n"); - - efi.mps = EFI_INVALID_TABLE_ADDR; - efi.acpi = EFI_INVALID_TABLE_ADDR; - efi.acpi20 = EFI_INVALID_TABLE_ADDR; - efi.smbios = EFI_INVALID_TABLE_ADDR; - efi.sal_systab = EFI_INVALID_TABLE_ADDR; - efi.boot_info = EFI_INVALID_TABLE_ADDR; - efi.hcdp = EFI_INVALID_TABLE_ADDR; - efi.uga = EFI_INVALID_TABLE_ADDR; - - for (i = 0; i < num_config_tables; i++) { - if (efi_guidcmp(config_tables[i].guid, MPS_TABLE_GUID) == 0) { - efi.mps = config_tables[i].table; - printk(KERN_INFO " MPS=0x%lx ", config_tables[i].table); - } else - if (efi_guidcmp(config_tables[i].guid, ACPI_20_TABLE_GUID) == 0) { - efi.acpi20 = config_tables[i].table; - printk(KERN_INFO " ACPI 2.0=0x%lx ", config_tables[i].table); - } else - if (efi_guidcmp(config_tables[i].guid, ACPI_TABLE_GUID) == 0) { - efi.acpi = config_tables[i].table; - printk(KERN_INFO " ACPI=0x%lx ", config_tables[i].table); - } else - if (efi_guidcmp(config_tables[i].guid, SMBIOS_TABLE_GUID) == 0) { - efi.smbios = config_tables[i].table; - printk(KERN_INFO " SMBIOS=0x%lx ", config_tables[i].table); - } else - if (efi_guidcmp(config_tables[i].guid, HCDP_TABLE_GUID) == 0) { - efi.hcdp = config_tables[i].table; - printk(KERN_INFO " HCDP=0x%lx ", config_tables[i].table); - } else - if (efi_guidcmp(config_tables[i].guid, UGA_IO_PROTOCOL_GUID) == 0) { - efi.uga = config_tables[i].table; - printk(KERN_INFO " UGA=0x%lx ", config_tables[i].table); - } - } - printk("\n"); - - /* - * Check out the runtime services table. We need to map - * the runtime services table so that we can grab the physical - * address of several of the EFI runtime functions, needed to - * set the firmware into virtual mode. - */ - - runtime = (efi_runtime_services_t *) boot_ioremap((unsigned long) - runtime, - sizeof(efi_runtime_services_t)); - if (runtime != NULL) { - /* - * We will only need *early* access to the following - * two EFI runtime services before set_virtual_address_map - * is invoked. 
- */ - efi_phys.get_time = (efi_get_time_t *) runtime->get_time; - efi_phys.set_virtual_address_map = - (efi_set_virtual_address_map_t *) - runtime->set_virtual_address_map; - } else - printk(KERN_ERR PFX "Could not map the runtime service table!\n"); - - /* Map the EFI memory map for use until paging_init() */ - memmap.map = boot_ioremap(boot_params.efi_info.efi_memmap, - boot_params.efi_info.efi_memmap_size); - if (memmap.map == NULL) - printk(KERN_ERR PFX "Could not map the EFI memory map!\n"); - - memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size); - -#if EFI_DEBUG - print_efi_memmap(); -#endif -} - -static inline void __init check_range_for_systab(efi_memory_desc_t *md) -{ - if (((unsigned long)md->phys_addr <= (unsigned long)efi_phys.systab) && - ((unsigned long)efi_phys.systab < md->phys_addr + - ((unsigned long)md->num_pages << EFI_PAGE_SHIFT))) { - unsigned long addr; - - addr = md->virt_addr - md->phys_addr + - (unsigned long)efi_phys.systab; - efi.systab = (efi_system_table_t *)addr; - } -} - -/* - * Wrap all the virtual calls in a way that forces the parameters on the stack. - */ - -#define efi_call_virt(f, args...) \ - ((efi_##f##_t __attribute__((regparm(0)))*)efi.systab->runtime->f)(args) - -static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc) -{ - return efi_call_virt(get_time, tm, tc); -} - -static efi_status_t virt_efi_set_time (efi_time_t *tm) -{ - return efi_call_virt(set_time, tm); -} - -static efi_status_t virt_efi_get_wakeup_time (efi_bool_t *enabled, - efi_bool_t *pending, - efi_time_t *tm) -{ - return efi_call_virt(get_wakeup_time, enabled, pending, tm); -} - -static efi_status_t virt_efi_set_wakeup_time (efi_bool_t enabled, - efi_time_t *tm) -{ - return efi_call_virt(set_wakeup_time, enabled, tm); -} - -static efi_status_t virt_efi_get_variable (efi_char16_t *name, - efi_guid_t *vendor, u32 *attr, - unsigned long *data_size, void *data) -{ - return efi_call_virt(get_variable, name, vendor, attr, data_size, data); -} - -static efi_status_t virt_efi_get_next_variable (unsigned long *name_size, - efi_char16_t *name, - efi_guid_t *vendor) -{ - return efi_call_virt(get_next_variable, name_size, name, vendor); -} - -static efi_status_t virt_efi_set_variable (efi_char16_t *name, - efi_guid_t *vendor, - unsigned long attr, - unsigned long data_size, void *data) -{ - return efi_call_virt(set_variable, name, vendor, attr, data_size, data); -} - -static efi_status_t virt_efi_get_next_high_mono_count (u32 *count) -{ - return efi_call_virt(get_next_high_mono_count, count); -} - -static void virt_efi_reset_system (int reset_type, efi_status_t status, - unsigned long data_size, - efi_char16_t *data) -{ - efi_call_virt(reset_system, reset_type, status, data_size, data); -} - -/* - * This function will switch the EFI runtime services to virtual mode. - * Essentially, look through the EFI memmap and map every region that - * has the runtime attribute bit set in its memory descriptor and update - * that memory descriptor with the virtual address obtained from ioremap(). - * This enables the runtime services to be called without having to - * thunk back into physical mode for every invocation. 
- */ - -void __init efi_enter_virtual_mode(void) -{ - efi_memory_desc_t *md; - efi_status_t status; - void *p; - - efi.systab = NULL; - - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; - - if (!(md->attribute & EFI_MEMORY_RUNTIME)) - continue; - - md->virt_addr = (unsigned long)ioremap(md->phys_addr, - md->num_pages << EFI_PAGE_SHIFT); - if (!(unsigned long)md->virt_addr) { - printk(KERN_ERR PFX "ioremap of 0x%lX failed\n", - (unsigned long)md->phys_addr); - } - /* update the virtual address of the EFI system table */ - check_range_for_systab(md); - } - - BUG_ON(!efi.systab); - - status = phys_efi_set_virtual_address_map( - memmap.desc_size * memmap.nr_map, - memmap.desc_size, - memmap.desc_version, - memmap.phys_map); - - if (status != EFI_SUCCESS) { - printk (KERN_ALERT "You are screwed! " - "Unable to switch EFI into virtual mode " - "(status=%lx)\n", status); - panic("EFI call to SetVirtualAddressMap() failed!"); - } - - /* - * Now that EFI is in virtual mode, update the function - * pointers in the runtime service table to the new virtual addresses. - */ - - efi.get_time = virt_efi_get_time; - efi.set_time = virt_efi_set_time; - efi.get_wakeup_time = virt_efi_get_wakeup_time; - efi.set_wakeup_time = virt_efi_set_wakeup_time; - efi.get_variable = virt_efi_get_variable; - efi.get_next_variable = virt_efi_get_next_variable; - efi.set_variable = virt_efi_set_variable; - efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count; - efi.reset_system = virt_efi_reset_system; -} - -void __init -efi_initialize_iomem_resources(struct resource *code_resource, - struct resource *data_resource, - struct resource *bss_resource) -{ - struct resource *res; - efi_memory_desc_t *md; - void *p; - - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; - - if ((md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) > - 0x100000000ULL) - continue; - res = kzalloc(sizeof(struct resource), GFP_ATOMIC); - switch (md->type) { - case EFI_RESERVED_TYPE: - res->name = "Reserved Memory"; - break; - case EFI_LOADER_CODE: - res->name = "Loader Code"; - break; - case EFI_LOADER_DATA: - res->name = "Loader Data"; - break; - case EFI_BOOT_SERVICES_DATA: - res->name = "BootServices Data"; - break; - case EFI_BOOT_SERVICES_CODE: - res->name = "BootServices Code"; - break; - case EFI_RUNTIME_SERVICES_CODE: - res->name = "Runtime Service Code"; - break; - case EFI_RUNTIME_SERVICES_DATA: - res->name = "Runtime Service Data"; - break; - case EFI_CONVENTIONAL_MEMORY: - res->name = "Conventional Memory"; - break; - case EFI_UNUSABLE_MEMORY: - res->name = "Unusable Memory"; - break; - case EFI_ACPI_RECLAIM_MEMORY: - res->name = "ACPI Reclaim"; - break; - case EFI_ACPI_MEMORY_NVS: - res->name = "ACPI NVS"; - break; - case EFI_MEMORY_MAPPED_IO: - res->name = "Memory Mapped IO"; - break; - case EFI_MEMORY_MAPPED_IO_PORT_SPACE: - res->name = "Memory Mapped IO Port Space"; - break; - default: - res->name = "Reserved"; - break; - } - res->start = md->phys_addr; - res->end = res->start + ((md->num_pages << EFI_PAGE_SHIFT) - 1); - res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; - if (request_resource(&iomem_resource, res) < 0) - printk(KERN_ERR PFX "Failed to allocate res %s : " - "0x%llx-0x%llx\n", res->name, - (unsigned long long)res->start, - (unsigned long long)res->end); - /* - * We don't know which region contains kernel data so we try - * it repeatedly and let the resource manager test it. 
- */ - if (md->type == EFI_CONVENTIONAL_MEMORY) { - request_resource(res, code_resource); - request_resource(res, data_resource); - request_resource(res, bss_resource); -#ifdef CONFIG_KEXEC - request_resource(res, &crashk_res); -#endif - } - } -} - -/* - * Convenience functions to obtain memory types and attributes - */ - -u32 efi_mem_type(unsigned long phys_addr) -{ - efi_memory_desc_t *md; - void *p; - - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; - if ((md->phys_addr <= phys_addr) && (phys_addr < - (md->phys_addr + (md-> num_pages << EFI_PAGE_SHIFT)) )) - return md->type; - } - return 0; -} - -u64 efi_mem_attributes(unsigned long phys_addr) -{ - efi_memory_desc_t *md; - void *p; - - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; - if ((md->phys_addr <= phys_addr) && (phys_addr < - (md->phys_addr + (md-> num_pages << EFI_PAGE_SHIFT)) )) - return md->attribute; - } - return 0; } diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c new file mode 100644 index 000000000000..4b73992c1e11 --- /dev/null +++ b/arch/x86/kernel/efi_64.c @@ -0,0 +1,134 @@ +/* + * x86_64 specific EFI support functions + * Based on Extensible Firmware Interface Specification version 1.0 + * + * Copyright (C) 2005-2008 Intel Co. + * Fenghua Yu <fenghua.yu@intel.com> + * Bibo Mao <bibo.mao@intel.com> + * Chandramouli Narayanan <mouli@linux.intel.com> + * Huang Ying <ying.huang@intel.com> + * + * Code to convert EFI to E820 map has been implemented in elilo bootloader + * based on a EFI patch by Edgar Hucek. Based on the E820 map, the page table + * is setup appropriately for EFI runtime code. + * - mouli 06/14/2007. + * + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/mm.h> +#include <linux/types.h> +#include <linux/spinlock.h> +#include <linux/bootmem.h> +#include <linux/ioport.h> +#include <linux/module.h> +#include <linux/efi.h> +#include <linux/uaccess.h> +#include <linux/io.h> +#include <linux/reboot.h> + +#include <asm/setup.h> +#include <asm/page.h> +#include <asm/e820.h> +#include <asm/pgtable.h> +#include <asm/tlbflush.h> +#include <asm/proto.h> +#include <asm/efi.h> + +static pgd_t save_pgd __initdata; +static unsigned long efi_flags __initdata; + +static void __init early_mapping_set_exec(unsigned long start, + unsigned long end, + int executable) +{ + pte_t *kpte; + int level; + + while (start < end) { + kpte = lookup_address((unsigned long)__va(start), &level); + BUG_ON(!kpte); + if (executable) + set_pte(kpte, pte_mkexec(*kpte)); + else + set_pte(kpte, __pte((pte_val(*kpte) | _PAGE_NX) & \ + __supported_pte_mask)); + if (level == 4) + start = (start + PMD_SIZE) & PMD_MASK; + else + start = (start + PAGE_SIZE) & PAGE_MASK; + } +} + +static void __init early_runtime_code_mapping_set_exec(int executable) +{ + efi_memory_desc_t *md; + void *p; + + if (!(__supported_pte_mask & _PAGE_NX)) + return; + + /* Make EFI runtime service code area executable */ + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + md = p; + if (md->type == EFI_RUNTIME_SERVICES_CODE) { + unsigned long end; + end = md->phys_addr + (md->num_pages << PAGE_SHIFT); + early_mapping_set_exec(md->phys_addr, end, executable); + } + } +} + +void __init efi_call_phys_prelog(void) +{ + unsigned long vaddress; + + local_irq_save(efi_flags); + early_runtime_code_mapping_set_exec(1); + vaddress = (unsigned long)__va(0x0UL); + save_pgd = *pgd_offset_k(0x0UL); + set_pgd(pgd_offset_k(0x0UL), *pgd_offset_k(vaddress)); + __flush_tlb_all(); +} 
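For reference, a minimal sketch (not part of the patch) of how the 64-bit efi_call_phys_prelog()/efi_call_phys_epilog() pair above is meant to bracket a physical-mode firmware call. It mirrors the phys_efi_get_time() wrapper in the generic efi.c hunk earlier and assumes only the efi_call_phys2() helper and the efi_phys fields introduced there; the function name itself is illustrative:

static efi_status_t __init phys_mode_call_sketch(efi_time_t *tm,
						 efi_time_cap_t *tc)
{
	efi_status_t status;

	/* install the temporary 1:1 mapping of low memory, IRQs off */
	efi_call_phys_prelog();
	/* the firmware entry point is still a physical address here */
	status = efi_call_phys2(efi_phys.get_time, tm, tc);
	/* restore pgd[0], flush the TLB and re-enable interrupts */
	efi_call_phys_epilog();
	return status;
}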
+ +void __init efi_call_phys_epilog(void) +{ + /* + * After the lock is released, the original page table is restored. + */ + set_pgd(pgd_offset_k(0x0UL), save_pgd); + early_runtime_code_mapping_set_exec(0); + __flush_tlb_all(); + local_irq_restore(efi_flags); +} + +void __init efi_reserve_bootmem(void) +{ + reserve_bootmem_generic((unsigned long)memmap.phys_map, + memmap.nr_map * memmap.desc_size); +} + +void __iomem * __init efi_ioremap(unsigned long offset, + unsigned long size) +{ + static unsigned pages_mapped; + unsigned long last_addr; + unsigned i, pages; + + last_addr = offset + size - 1; + offset &= PAGE_MASK; + pages = (PAGE_ALIGN(last_addr) - offset) >> PAGE_SHIFT; + if (pages_mapped + pages > MAX_EFI_IO_PAGES) + return NULL; + + for (i = 0; i < pages; i++) { + __set_fixmap(FIX_EFI_IO_MAP_FIRST_PAGE - pages_mapped, + offset, PAGE_KERNEL_EXEC_NOCACHE); + offset += PAGE_SIZE; + pages_mapped++; + } + + return (void __iomem *)__fix_to_virt(FIX_EFI_IO_MAP_FIRST_PAGE - \ + (pages_mapped - pages)); +} diff --git a/arch/x86/kernel/efi_stub_64.S b/arch/x86/kernel/efi_stub_64.S new file mode 100644 index 000000000000..99b47d48c9f4 --- /dev/null +++ b/arch/x86/kernel/efi_stub_64.S @@ -0,0 +1,109 @@ +/* + * Function calling ABI conversion from Linux to EFI for x86_64 + * + * Copyright (C) 2007 Intel Corp + * Bibo Mao <bibo.mao@intel.com> + * Huang Ying <ying.huang@intel.com> + */ + +#include <linux/linkage.h> + +#define SAVE_XMM \ + mov %rsp, %rax; \ + subq $0x70, %rsp; \ + and $~0xf, %rsp; \ + mov %rax, (%rsp); \ + mov %cr0, %rax; \ + clts; \ + mov %rax, 0x8(%rsp); \ + movaps %xmm0, 0x60(%rsp); \ + movaps %xmm1, 0x50(%rsp); \ + movaps %xmm2, 0x40(%rsp); \ + movaps %xmm3, 0x30(%rsp); \ + movaps %xmm4, 0x20(%rsp); \ + movaps %xmm5, 0x10(%rsp) + +#define RESTORE_XMM \ + movaps 0x60(%rsp), %xmm0; \ + movaps 0x50(%rsp), %xmm1; \ + movaps 0x40(%rsp), %xmm2; \ + movaps 0x30(%rsp), %xmm3; \ + movaps 0x20(%rsp), %xmm4; \ + movaps 0x10(%rsp), %xmm5; \ + mov 0x8(%rsp), %rsi; \ + mov %rsi, %cr0; \ + mov (%rsp), %rsp + +ENTRY(efi_call0) + SAVE_XMM + subq $32, %rsp + call *%rdi + addq $32, %rsp + RESTORE_XMM + ret + +ENTRY(efi_call1) + SAVE_XMM + subq $32, %rsp + mov %rsi, %rcx + call *%rdi + addq $32, %rsp + RESTORE_XMM + ret + +ENTRY(efi_call2) + SAVE_XMM + subq $32, %rsp + mov %rsi, %rcx + call *%rdi + addq $32, %rsp + RESTORE_XMM + ret + +ENTRY(efi_call3) + SAVE_XMM + subq $32, %rsp + mov %rcx, %r8 + mov %rsi, %rcx + call *%rdi + addq $32, %rsp + RESTORE_XMM + ret + +ENTRY(efi_call4) + SAVE_XMM + subq $32, %rsp + mov %r8, %r9 + mov %rcx, %r8 + mov %rsi, %rcx + call *%rdi + addq $32, %rsp + RESTORE_XMM + ret + +ENTRY(efi_call5) + SAVE_XMM + subq $48, %rsp + mov %r9, 32(%rsp) + mov %r8, %r9 + mov %rcx, %r8 + mov %rsi, %rcx + call *%rdi + addq $48, %rsp + RESTORE_XMM + ret + +ENTRY(efi_call6) + SAVE_XMM + mov (%rsp), %rax + mov 8(%rax), %rax + subq $48, %rsp + mov %r9, 32(%rsp) + mov %rax, 40(%rsp) + mov %r8, %r9 + mov %rcx, %r8 + mov %rsi, %rcx + call *%rdi + addq $48, %rsp + RESTORE_XMM + ret diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index dc7f938e5015..be5c31d04884 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -58,7 +58,7 @@ * for paravirtualization. The following will never clobber any registers: * INTERRUPT_RETURN (aka. "iret") * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax") - * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit"). + * ENABLE_INTERRUPTS_SYSCALL_RET (aka "sti; sysexit"). 
* * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY). @@ -283,12 +283,12 @@ END(resume_kernel) the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */ # sysenter call handler stub -ENTRY(sysenter_entry) +ENTRY(ia32_sysenter_target) CFI_STARTPROC simple CFI_SIGNAL_FRAME CFI_DEF_CFA esp, 0 CFI_REGISTER esp, ebp - movl TSS_sysenter_esp0(%esp),%esp + movl TSS_sysenter_sp0(%esp),%esp sysenter_past_esp: /* * No need to follow this irqs on/off section: the syscall @@ -351,7 +351,7 @@ sysenter_past_esp: xorl %ebp,%ebp TRACE_IRQS_ON 1: mov PT_FS(%esp), %fs - ENABLE_INTERRUPTS_SYSEXIT + ENABLE_INTERRUPTS_SYSCALL_RET CFI_ENDPROC .pushsection .fixup,"ax" 2: movl $0,PT_FS(%esp) @@ -360,7 +360,7 @@ sysenter_past_esp: .align 4 .long 1b,2b .popsection -ENDPROC(sysenter_entry) +ENDPROC(ia32_sysenter_target) # system call handler stub ENTRY(system_call) @@ -583,7 +583,7 @@ END(syscall_badsys) * Build the entry stubs and pointer table with * some assembler magic. */ -.data +.section .rodata,"a" ENTRY(interrupt) .text @@ -743,7 +743,7 @@ END(device_not_available) * that sets up the real kernel stack. Check here, since we can't * allow the wrong stack to be used. * - * "TSS_sysenter_esp0+12" is because the NMI/debug handler will have + * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have * already pushed 3 words if it hits on the sysenter instruction: * eflags, cs and eip. * @@ -755,7 +755,7 @@ END(device_not_available) cmpw $__KERNEL_CS,4(%esp); \ jne ok; \ label: \ - movl TSS_sysenter_esp0+offset(%esp),%esp; \ + movl TSS_sysenter_sp0+offset(%esp),%esp; \ CFI_DEF_CFA esp, 0; \ CFI_UNDEFINED eip; \ pushfl; \ @@ -768,7 +768,7 @@ label: \ KPROBE_ENTRY(debug) RING0_INT_FRAME - cmpl $sysenter_entry,(%esp) + cmpl $ia32_sysenter_target,(%esp) jne debug_stack_correct FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn) debug_stack_correct: @@ -799,7 +799,7 @@ KPROBE_ENTRY(nmi) popl %eax CFI_ADJUST_CFA_OFFSET -4 je nmi_espfix_stack - cmpl $sysenter_entry,(%esp) + cmpl $ia32_sysenter_target,(%esp) je nmi_stack_fixup pushl %eax CFI_ADJUST_CFA_OFFSET 4 @@ -812,7 +812,7 @@ KPROBE_ENTRY(nmi) popl %eax CFI_ADJUST_CFA_OFFSET -4 jae nmi_stack_correct - cmpl $sysenter_entry,12(%esp) + cmpl $ia32_sysenter_target,12(%esp) je nmi_debug_stack_check nmi_stack_correct: /* We have a RING0_INT_FRAME here */ @@ -882,10 +882,10 @@ ENTRY(native_iret) .previous END(native_iret) -ENTRY(native_irq_enable_sysexit) +ENTRY(native_irq_enable_syscall_ret) sti sysexit -END(native_irq_enable_sysexit) +END(native_irq_enable_syscall_ret) #endif KPROBE_ENTRY(int3) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 3a058bb16409..bea8474744ff 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -50,6 +50,7 @@ #include <asm/hw_irq.h> #include <asm/page.h> #include <asm/irqflags.h> +#include <asm/paravirt.h> .code64 @@ -57,6 +58,13 @@ #define retint_kernel retint_restore_args #endif +#ifdef CONFIG_PARAVIRT +ENTRY(native_irq_enable_syscall_ret) + movq %gs:pda_oldrsp,%rsp + swapgs + sysretq +#endif /* CONFIG_PARAVIRT */ + .macro TRACE_IRQS_IRETQ offset=ARGOFFSET #ifdef CONFIG_TRACE_IRQFLAGS @@ -216,14 +224,21 @@ ENTRY(system_call) CFI_DEF_CFA rsp,PDA_STACKOFFSET CFI_REGISTER rip,rcx /*CFI_REGISTER rflags,r11*/ - swapgs + SWAPGS_UNSAFE_STACK + /* + * A hypervisor implementation might want to use a label + * after the swapgs, so that it can do the swapgs + * for the guest and jump 
here on syscall. + */ +ENTRY(system_call_after_swapgs) + movq %rsp,%gs:pda_oldrsp movq %gs:pda_kernelstack,%rsp /* * No need to follow this irqs off/on section - it's straight * and short: */ - sti + ENABLE_INTERRUPTS(CLBR_NONE) SAVE_ARGS 8,1 movq %rax,ORIG_RAX-ARGOFFSET(%rsp) movq %rcx,RIP-ARGOFFSET(%rsp) @@ -246,7 +261,7 @@ ret_from_sys_call: sysret_check: LOCKDEP_SYS_EXIT GET_THREAD_INFO(%rcx) - cli + DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF movl threadinfo_flags(%rcx),%edx andl %edi,%edx @@ -260,9 +275,7 @@ sysret_check: CFI_REGISTER rip,rcx RESTORE_ARGS 0,-ARG_SKIP,1 /*CFI_REGISTER rflags,r11*/ - movq %gs:pda_oldrsp,%rsp - swapgs - sysretq + ENABLE_INTERRUPTS_SYSCALL_RET CFI_RESTORE_STATE /* Handle reschedules */ @@ -271,7 +284,7 @@ sysret_careful: bt $TIF_NEED_RESCHED,%edx jnc sysret_signal TRACE_IRQS_ON - sti + ENABLE_INTERRUPTS(CLBR_NONE) pushq %rdi CFI_ADJUST_CFA_OFFSET 8 call schedule @@ -282,8 +295,8 @@ sysret_careful: /* Handle a signal */ sysret_signal: TRACE_IRQS_ON - sti - testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx + ENABLE_INTERRUPTS(CLBR_NONE) + testl $_TIF_DO_NOTIFY_MASK,%edx jz 1f /* Really a signal */ @@ -295,7 +308,7 @@ sysret_signal: 1: movl $_TIF_NEED_RESCHED,%edi /* Use IRET because user could have changed frame. This works because ptregscall_common has called FIXUP_TOP_OF_STACK. */ - cli + DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF jmp int_with_check @@ -327,7 +340,7 @@ tracesys: */ .globl int_ret_from_sys_call int_ret_from_sys_call: - cli + DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF testl $3,CS-ARGOFFSET(%rsp) je retint_restore_args @@ -349,20 +362,20 @@ int_careful: bt $TIF_NEED_RESCHED,%edx jnc int_very_careful TRACE_IRQS_ON - sti + ENABLE_INTERRUPTS(CLBR_NONE) pushq %rdi CFI_ADJUST_CFA_OFFSET 8 call schedule popq %rdi CFI_ADJUST_CFA_OFFSET -8 - cli + DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF jmp int_with_check /* handle signals and tracing -- both require a full stack frame */ int_very_careful: TRACE_IRQS_ON - sti + ENABLE_INTERRUPTS(CLBR_NONE) SAVE_REST /* Check for syscall exit trace */ testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx @@ -377,7 +390,7 @@ int_very_careful: jmp int_restore_rest int_signal: - testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx + testl $_TIF_DO_NOTIFY_MASK,%edx jz 1f movq %rsp,%rdi # &ptregs -> arg1 xorl %esi,%esi # oldset -> arg2 @@ -385,7 +398,7 @@ int_signal: 1: movl $_TIF_NEED_RESCHED,%edi int_restore_rest: RESTORE_REST - cli + DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF jmp int_with_check CFI_ENDPROC @@ -506,7 +519,7 @@ END(stub_rt_sigreturn) CFI_DEF_CFA_REGISTER rbp testl $3,CS(%rdi) je 1f - swapgs + SWAPGS /* irqcount is used to check if a CPU is already on an interrupt stack or not. 
While this is essentially redundant with preempt_count it is a little cheaper to use a separate counter in the PDA @@ -527,7 +540,7 @@ ENTRY(common_interrupt) interrupt do_IRQ /* 0(%rsp): oldrsp-ARGOFFSET */ ret_from_intr: - cli + DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF decl %gs:pda_irqcount leaveq @@ -556,13 +569,13 @@ retint_swapgs: /* return to user-space */ /* * The iretq could re-enable interrupts: */ - cli + DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_IRETQ - swapgs + SWAPGS jmp restore_args retint_restore_args: /* return to kernel space */ - cli + DISABLE_INTERRUPTS(CLBR_ANY) /* * The iretq could re-enable interrupts: */ @@ -570,10 +583,14 @@ retint_restore_args: /* return to kernel space */ restore_args: RESTORE_ARGS 0,8,0 iret_label: +#ifdef CONFIG_PARAVIRT + INTERRUPT_RETURN +#endif +ENTRY(native_iret) iretq .section __ex_table,"a" - .quad iret_label,bad_iret + .quad native_iret, bad_iret .previous .section .fixup,"ax" /* force a signal here? this matches i386 behaviour */ @@ -581,39 +598,39 @@ iret_label: bad_iret: movq $11,%rdi /* SIGSEGV */ TRACE_IRQS_ON - sti - jmp do_exit - .previous - + ENABLE_INTERRUPTS(CLBR_ANY | ~(CLBR_RDI)) + jmp do_exit + .previous + /* edi: workmask, edx: work */ retint_careful: CFI_RESTORE_STATE bt $TIF_NEED_RESCHED,%edx jnc retint_signal TRACE_IRQS_ON - sti + ENABLE_INTERRUPTS(CLBR_NONE) pushq %rdi CFI_ADJUST_CFA_OFFSET 8 call schedule popq %rdi CFI_ADJUST_CFA_OFFSET -8 GET_THREAD_INFO(%rcx) - cli + DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF jmp retint_check retint_signal: - testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx + testl $_TIF_DO_NOTIFY_MASK,%edx jz retint_swapgs TRACE_IRQS_ON - sti + ENABLE_INTERRUPTS(CLBR_NONE) SAVE_REST movq $-1,ORIG_RAX(%rsp) xorl %esi,%esi # oldset movq %rsp,%rdi # &pt_regs call do_notify_resume RESTORE_REST - cli + DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF movl $_TIF_NEED_RESCHED,%edi GET_THREAD_INFO(%rcx) @@ -731,7 +748,7 @@ END(spurious_interrupt) rdmsr testl %edx,%edx js 1f - swapgs + SWAPGS xorl %ebx,%ebx 1: .if \ist @@ -747,7 +764,7 @@ END(spurious_interrupt) .if \ist addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) .endif - cli + DISABLE_INTERRUPTS(CLBR_NONE) .if \irqtrace TRACE_IRQS_OFF .endif @@ -776,10 +793,10 @@ paranoid_swapgs\trace: .if \trace TRACE_IRQS_IRETQ 0 .endif - swapgs + SWAPGS_UNSAFE_STACK paranoid_restore\trace: RESTORE_ALL 8 - iretq + INTERRUPT_RETURN paranoid_userspace\trace: GET_THREAD_INFO(%rcx) movl threadinfo_flags(%rcx),%ebx @@ -794,11 +811,11 @@ paranoid_userspace\trace: .if \trace TRACE_IRQS_ON .endif - sti + ENABLE_INTERRUPTS(CLBR_NONE) xorl %esi,%esi /* arg2: oldset */ movq %rsp,%rdi /* arg1: &pt_regs */ call do_notify_resume - cli + DISABLE_INTERRUPTS(CLBR_NONE) .if \trace TRACE_IRQS_OFF .endif @@ -807,9 +824,9 @@ paranoid_schedule\trace: .if \trace TRACE_IRQS_ON .endif - sti + ENABLE_INTERRUPTS(CLBR_ANY) call schedule - cli + DISABLE_INTERRUPTS(CLBR_ANY) .if \trace TRACE_IRQS_OFF .endif @@ -862,7 +879,7 @@ KPROBE_ENTRY(error_entry) testl $3,CS(%rsp) je error_kernelspace error_swapgs: - swapgs + SWAPGS error_sti: movq %rdi,RDI(%rsp) CFI_REL_OFFSET rdi,RDI @@ -874,7 +891,7 @@ error_sti: error_exit: movl %ebx,%eax RESTORE_REST - cli + DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF GET_THREAD_INFO(%rcx) testl %eax,%eax @@ -911,12 +928,12 @@ ENTRY(load_gs_index) CFI_STARTPROC pushf CFI_ADJUST_CFA_OFFSET 8 - cli - swapgs + DISABLE_INTERRUPTS(CLBR_ANY | ~(CLBR_RDI)) + SWAPGS gs_change: movl %edi,%gs 2: mfence /* workaround */ - swapgs + SWAPGS 
popf CFI_ADJUST_CFA_OFFSET -8 ret @@ -930,7 +947,7 @@ ENDPROC(load_gs_index) .section .fixup,"ax" /* running with kernelgs */ bad_gs: - swapgs /* switch back to user gs */ + SWAPGS /* switch back to user gs */ xorl %eax,%eax movl %eax,%gs jmp 2b diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c index ce703e21c912..4ae7b6440260 100644 --- a/arch/x86/kernel/genapic_64.c +++ b/arch/x86/kernel/genapic_64.c @@ -24,18 +24,11 @@ #include <acpi/acpi_bus.h> #endif -/* - * which logical CPU number maps to which CPU (physical APIC ID) - * - * The following static array is used during kernel startup - * and the x86_cpu_to_apicid_ptr contains the address of the - * array during this time. Is it zeroed when the per_cpu - * data area is removed. - */ -u8 x86_cpu_to_apicid_init[NR_CPUS] __initdata +/* which logical CPU number maps to which CPU (physical APIC ID) */ +u16 x86_cpu_to_apicid_init[NR_CPUS] __initdata = { [0 ... NR_CPUS-1] = BAD_APICID }; -void *x86_cpu_to_apicid_ptr; -DEFINE_PER_CPU(u8, x86_cpu_to_apicid) = BAD_APICID; +void *x86_cpu_to_apicid_early_ptr; +DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID; EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid); struct genapic __read_mostly *genapic = &apic_flat; diff --git a/arch/x86/kernel/geode_32.c b/arch/x86/kernel/geode_32.c index f12d8c5d9809..9c7f7d395968 100644 --- a/arch/x86/kernel/geode_32.c +++ b/arch/x86/kernel/geode_32.c @@ -1,6 +1,7 @@ /* * AMD Geode southbridge support code * Copyright (C) 2006, Advanced Micro Devices, Inc. + * Copyright (C) 2007, Andres Salomon <dilinger@debian.org> * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public License @@ -51,45 +52,62 @@ EXPORT_SYMBOL_GPL(geode_get_dev_base); /* === GPIO API === */ -void geode_gpio_set(unsigned int gpio, unsigned int reg) +void geode_gpio_set(u32 gpio, unsigned int reg) { u32 base = geode_get_dev_base(GEODE_DEV_GPIO); if (!base) return; - if (gpio < 16) - outl(1 << gpio, base + reg); - else - outl(1 << (gpio - 16), base + 0x80 + reg); + /* low bank register */ + if (gpio & 0xFFFF) + outl(gpio & 0xFFFF, base + reg); + /* high bank register */ + gpio >>= 16; + if (gpio) + outl(gpio, base + 0x80 + reg); } EXPORT_SYMBOL_GPL(geode_gpio_set); -void geode_gpio_clear(unsigned int gpio, unsigned int reg) +void geode_gpio_clear(u32 gpio, unsigned int reg) { u32 base = geode_get_dev_base(GEODE_DEV_GPIO); if (!base) return; - if (gpio < 16) - outl(1 << (gpio + 16), base + reg); - else - outl(1 << gpio, base + 0x80 + reg); + /* low bank register */ + if (gpio & 0xFFFF) + outl((gpio & 0xFFFF) << 16, base + reg); + /* high bank register */ + gpio &= (0xFFFF << 16); + if (gpio) + outl(gpio, base + 0x80 + reg); } EXPORT_SYMBOL_GPL(geode_gpio_clear); -int geode_gpio_isset(unsigned int gpio, unsigned int reg) +int geode_gpio_isset(u32 gpio, unsigned int reg) { u32 base = geode_get_dev_base(GEODE_DEV_GPIO); + u32 val; if (!base) return 0; - if (gpio < 16) - return (inl(base + reg) & (1 << gpio)) ? 1 : 0; - else - return (inl(base + 0x80 + reg) & (1 << (gpio - 16))) ? 
1 : 0; + /* low bank register */ + if (gpio & 0xFFFF) { + val = inl(base + reg) & (gpio & 0xFFFF); + if ((gpio & 0xFFFF) == val) + return 1; + } + /* high bank register */ + gpio >>= 16; + if (gpio) { + val = inl(base + 0x80 + reg) & gpio; + if (gpio == val) + return 1; + } + return 0; } EXPORT_SYMBOL_GPL(geode_gpio_isset); diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 6b3469311e42..a317336cdeaa 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -10,6 +10,7 @@ #include <linux/kernel.h> #include <linux/string.h> #include <linux/percpu.h> +#include <linux/start_kernel.h> #include <asm/processor.h> #include <asm/proto.h> @@ -19,12 +20,14 @@ #include <asm/pgtable.h> #include <asm/tlbflush.h> #include <asm/sections.h> +#include <asm/kdebug.h> +#include <asm/e820.h> static void __init zap_identity_mappings(void) { pgd_t *pgd = pgd_offset_k(0UL); pgd_clear(pgd); - __flush_tlb(); + __flush_tlb_all(); } /* Don't add a printk in there. printk relies on the PDA which is not initialized @@ -46,6 +49,35 @@ static void __init copy_bootdata(char *real_mode_data) } } +#define EBDA_ADDR_POINTER 0x40E + +static __init void reserve_ebda(void) +{ + unsigned ebda_addr, ebda_size; + + /* + * there is a real-mode segmented pointer pointing to the + * 4K EBDA area at 0x40E + */ + ebda_addr = *(unsigned short *)__va(EBDA_ADDR_POINTER); + ebda_addr <<= 4; + + if (!ebda_addr) + return; + + ebda_size = *(unsigned short *)__va(ebda_addr); + + /* Round EBDA up to pages */ + if (ebda_size == 0) + ebda_size = 1; + ebda_size <<= 10; + ebda_size = round_up(ebda_size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE); + if (ebda_size > 64*1024) + ebda_size = 64*1024; + + reserve_early(ebda_addr, ebda_addr + ebda_size); +} + void __init x86_64_start_kernel(char * real_mode_data) { int i; @@ -56,8 +88,13 @@ void __init x86_64_start_kernel(char * real_mode_data) /* Make NULL pointers segfault */ zap_identity_mappings(); - for (i = 0; i < IDT_ENTRIES; i++) + for (i = 0; i < IDT_ENTRIES; i++) { +#ifdef CONFIG_EARLY_PRINTK + set_intr_gate(i, &early_idt_handlers[i]); +#else set_intr_gate(i, early_idt_handler); +#endif + } load_idt((const struct desc_ptr *)&idt_descr); early_printk("Kernel alive\n"); @@ -67,8 +104,24 @@ void __init x86_64_start_kernel(char * real_mode_data) pda_init(0); copy_bootdata(__va(real_mode_data)); -#ifdef CONFIG_SMP - cpu_set(0, cpu_online_map); -#endif + + reserve_early(__pa_symbol(&_text), __pa_symbol(&_end)); + + /* Reserve INITRD */ + if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) { + unsigned long ramdisk_image = boot_params.hdr.ramdisk_image; + unsigned long ramdisk_size = boot_params.hdr.ramdisk_size; + unsigned long ramdisk_end = ramdisk_image + ramdisk_size; + reserve_early(ramdisk_image, ramdisk_end); + } + + reserve_ebda(); + + /* + * At this point everything still needed from the boot loader + * or BIOS or kernel text should be early reserved or marked not + * RAM in e820. All other memory is free game. + */ + start_kernel(); } diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index fbad51fce672..5d8c5730686b 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -9,6 +9,7 @@ .text #include <linux/threads.h> +#include <linux/init.h> #include <linux/linkage.h> #include <asm/segment.h> #include <asm/page.h> @@ -151,7 +152,9 @@ WEAK(xen_entry) /* Unknown implementation; there's really nothing we can do at this point. 
*/ ud2a -.data + + __INITDATA + subarch_entries: .long default_entry /* normal x86/PC */ .long lguest_entry /* lguest hypervisor */ @@ -199,7 +202,6 @@ default_entry: addl $0x67, %eax /* 0x67 == _PAGE_TABLE */ movl %eax, 4092(%edx) - xorl %ebx,%ebx /* This is the boot CPU (BSP) */ jmp 3f /* * Non-boot CPU entry point; entered from trampoline.S @@ -222,6 +224,8 @@ ENTRY(startup_32_smp) movl %eax,%es movl %eax,%fs movl %eax,%gs +#endif /* CONFIG_SMP */ +3: /* * New page tables may be in 4Mbyte page mode and may @@ -268,12 +272,6 @@ ENTRY(startup_32_smp) wrmsr 6: - /* This is a secondary processor (AP) */ - xorl %ebx,%ebx - incl %ebx - -#endif /* CONFIG_SMP */ -3: /* * Enable paging @@ -297,7 +295,7 @@ ENTRY(startup_32_smp) popfl #ifdef CONFIG_SMP - andl %ebx,%ebx + cmpb $0, ready jz 1f /* Initial CPU cleans BSS */ jmp checkCPUtype 1: @@ -502,6 +500,7 @@ early_fault: call printk #endif #endif + call dump_stack hlt_loop: hlt jmp hlt_loop diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index b6167fe3330e..1d5a7a361200 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -19,6 +19,13 @@ #include <asm/msr.h> #include <asm/cache.h> +#ifdef CONFIG_PARAVIRT +#include <asm/asm-offsets.h> +#include <asm/paravirt.h> +#else +#define GET_CR2_INTO_RCX movq %cr2, %rcx +#endif + /* we are not able to switch in one step to the final KERNEL ADRESS SPACE * because we need identity-mapped pages. * @@ -260,14 +267,43 @@ init_rsp: bad_address: jmp bad_address +#ifdef CONFIG_EARLY_PRINTK +.macro early_idt_tramp first, last + .ifgt \last-\first + early_idt_tramp \first, \last-1 + .endif + movl $\last,%esi + jmp early_idt_handler +.endm + + .globl early_idt_handlers +early_idt_handlers: + early_idt_tramp 0, 63 + early_idt_tramp 64, 127 + early_idt_tramp 128, 191 + early_idt_tramp 192, 255 +#endif + ENTRY(early_idt_handler) +#ifdef CONFIG_EARLY_PRINTK cmpl $2,early_recursion_flag(%rip) jz 1f incl early_recursion_flag(%rip) + GET_CR2_INTO_RCX + movq %rcx,%r9 + xorl %r8d,%r8d # zero for error code + movl %esi,%ecx # get vector number + # Test %ecx against mask of vectors that push error code. 
+ cmpl $31,%ecx + ja 0f + movl $1,%eax + salq %cl,%rax + testl $0x27d00,%eax + je 0f + popq %r8 # get error code +0: movq 0(%rsp),%rcx # get ip + movq 8(%rsp),%rdx # get cs xorl %eax,%eax - movq 8(%rsp),%rsi # get rip - movq (%rsp),%rdx - movq %cr2,%rcx leaq early_idt_msg(%rip),%rdi call early_printk cmpl $2,early_recursion_flag(%rip) @@ -278,15 +314,19 @@ ENTRY(early_idt_handler) movq 8(%rsp),%rsi # get rip again call __print_symbol #endif +#endif /* EARLY_PRINTK */ 1: hlt jmp 1b + +#ifdef CONFIG_EARLY_PRINTK early_recursion_flag: .long 0 early_idt_msg: - .asciz "PANIC: early exception rip %lx error %lx cr2 %lx\n" + .asciz "PANIC: early exception %02lx rip %lx:%lx error %lx cr2 %lx\n" early_idt_ripmsg: .asciz "RIP %s\n" +#endif /* CONFIG_EARLY_PRINTK */ .balign PAGE_SIZE diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 2f99ee206b95..429d084e014d 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -6,7 +6,6 @@ #include <linux/init.h> #include <linux/sysdev.h> #include <linux/pm.h> -#include <linux/delay.h> #include <asm/fixmap.h> #include <asm/hpet.h> @@ -16,7 +15,8 @@ #define HPET_MASK CLOCKSOURCE_MASK(32) #define HPET_SHIFT 22 -/* FSEC = 10^-15 NSEC = 10^-9 */ +/* FSEC = 10^-15 + NSEC = 10^-9 */ #define FSEC_PER_NSEC 1000000 /* @@ -107,6 +107,7 @@ int is_hpet_enabled(void) { return is_hpet_capable() && hpet_legacy_int_enabled; } +EXPORT_SYMBOL_GPL(is_hpet_enabled); /* * When the hpet driver (/dev/hpet) is enabled, we need to reserve @@ -132,16 +133,13 @@ static void hpet_reserve_platform_timers(unsigned long id) #ifdef CONFIG_HPET_EMULATE_RTC hpet_reserve_timer(&hd, 1); #endif - hd.hd_irq[0] = HPET_LEGACY_8254; hd.hd_irq[1] = HPET_LEGACY_RTC; - for (i = 2; i < nrtimers; timer++, i++) - hd.hd_irq[i] = (timer->hpet_config & Tn_INT_ROUTE_CNF_MASK) >> - Tn_INT_ROUTE_CNF_SHIFT; - + for (i = 2; i < nrtimers; timer++, i++) + hd.hd_irq[i] = (timer->hpet_config & Tn_INT_ROUTE_CNF_MASK) >> + Tn_INT_ROUTE_CNF_SHIFT; hpet_alloc(&hd); - } #else static void hpet_reserve_platform_timers(unsigned long id) { } @@ -478,6 +476,7 @@ void hpet_disable(void) */ #include <linux/mc146818rtc.h> #include <linux/rtc.h> +#include <asm/rtc.h> #define DEFAULT_RTC_INT_FREQ 64 #define DEFAULT_RTC_SHIFT 6 @@ -492,6 +491,38 @@ static unsigned long hpet_default_delta; static unsigned long hpet_pie_delta; static unsigned long hpet_pie_limit; +static rtc_irq_handler irq_handler; + +/* + * Registers a IRQ handler. + */ +int hpet_register_irq_handler(rtc_irq_handler handler) +{ + if (!is_hpet_enabled()) + return -ENODEV; + if (irq_handler) + return -EBUSY; + + irq_handler = handler; + + return 0; +} +EXPORT_SYMBOL_GPL(hpet_register_irq_handler); + +/* + * Deregisters the IRQ handler registered with hpet_register_irq_handler() + * and does cleanup. + */ +void hpet_unregister_irq_handler(rtc_irq_handler handler) +{ + if (!is_hpet_enabled()) + return; + + irq_handler = NULL; + hpet_rtc_flags = 0; +} +EXPORT_SYMBOL_GPL(hpet_unregister_irq_handler); + /* * Timer 1 for RTC emulation. We use one shot mode, as periodic mode * is not supported by all HPET implementations for timer 1. @@ -533,6 +564,7 @@ int hpet_rtc_timer_init(void) return 1; } +EXPORT_SYMBOL_GPL(hpet_rtc_timer_init); /* * The functions below are called from rtc driver. 
@@ -547,6 +579,7 @@ int hpet_mask_rtc_irq_bit(unsigned long bit_mask) hpet_rtc_flags &= ~bit_mask; return 1; } +EXPORT_SYMBOL_GPL(hpet_mask_rtc_irq_bit); int hpet_set_rtc_irq_bit(unsigned long bit_mask) { @@ -562,6 +595,7 @@ int hpet_set_rtc_irq_bit(unsigned long bit_mask) return 1; } +EXPORT_SYMBOL_GPL(hpet_set_rtc_irq_bit); int hpet_set_alarm_time(unsigned char hrs, unsigned char min, unsigned char sec) @@ -575,6 +609,7 @@ int hpet_set_alarm_time(unsigned char hrs, unsigned char min, return 1; } +EXPORT_SYMBOL_GPL(hpet_set_alarm_time); int hpet_set_periodic_freq(unsigned long freq) { @@ -593,11 +628,13 @@ int hpet_set_periodic_freq(unsigned long freq) } return 1; } +EXPORT_SYMBOL_GPL(hpet_set_periodic_freq); int hpet_rtc_dropped_irq(void) { return is_hpet_enabled(); } +EXPORT_SYMBOL_GPL(hpet_rtc_dropped_irq); static void hpet_rtc_timer_reinit(void) { @@ -641,9 +678,10 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id) unsigned long rtc_int_flag = 0; hpet_rtc_timer_reinit(); + memset(&curr_time, 0, sizeof(struct rtc_time)); if (hpet_rtc_flags & (RTC_UIE | RTC_AIE)) - rtc_get_rtc_time(&curr_time); + get_rtc_time(&curr_time); if (hpet_rtc_flags & RTC_UIE && curr_time.tm_sec != hpet_prev_update_sec) { @@ -665,8 +703,10 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id) if (rtc_int_flag) { rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8)); - rtc_interrupt(rtc_int_flag, dev_id); + if (irq_handler) + irq_handler(rtc_int_flag, dev_id); } return IRQ_HANDLED; } +EXPORT_SYMBOL_GPL(hpet_rtc_interrupt); #endif diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c index 02112fcc0de7..061627806a2d 100644 --- a/arch/x86/kernel/i386_ksyms_32.c +++ b/arch/x86/kernel/i386_ksyms_32.c @@ -22,12 +22,5 @@ EXPORT_SYMBOL(__put_user_8); EXPORT_SYMBOL(strstr); -#ifdef CONFIG_SMP -extern void FASTCALL( __write_lock_failed(rwlock_t *rw)); -extern void FASTCALL( __read_lock_failed(rwlock_t *rw)); -EXPORT_SYMBOL(__write_lock_failed); -EXPORT_SYMBOL(__read_lock_failed); -#endif - EXPORT_SYMBOL(csum_partial); EXPORT_SYMBOL(empty_zero_page); diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c new file mode 100644 index 000000000000..26719bd2c77c --- /dev/null +++ b/arch/x86/kernel/i387.c @@ -0,0 +1,479 @@ +/* + * Copyright (C) 1994 Linus Torvalds + * + * Pentium III FXSR, SSE support + * General FPU state handling cleanups + * Gareth Hughes <gareth@valinux.com>, May 2000 + */ + +#include <linux/sched.h> +#include <linux/module.h> +#include <linux/regset.h> +#include <asm/processor.h> +#include <asm/i387.h> +#include <asm/math_emu.h> +#include <asm/sigcontext.h> +#include <asm/user.h> +#include <asm/ptrace.h> +#include <asm/uaccess.h> + +#ifdef CONFIG_X86_64 + +#include <asm/sigcontext32.h> +#include <asm/user32.h> + +#else + +#define save_i387_ia32 save_i387 +#define restore_i387_ia32 restore_i387 + +#define _fpstate_ia32 _fpstate +#define user_i387_ia32_struct user_i387_struct +#define user32_fxsr_struct user_fxsr_struct + +#endif + +#ifdef CONFIG_MATH_EMULATION +#define HAVE_HWFP (boot_cpu_data.hard_math) +#else +#define HAVE_HWFP 1 +#endif + +unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu; + +void mxcsr_feature_mask_init(void) +{ + unsigned long mask = 0; + clts(); + if (cpu_has_fxsr) { + memset(¤t->thread.i387.fxsave, 0, + sizeof(struct i387_fxsave_struct)); + asm volatile("fxsave %0" : : "m" (current->thread.i387.fxsave)); + mask = current->thread.i387.fxsave.mxcsr_mask; + if (mask == 0) + mask = 0x0000ffbf; + } + mxcsr_feature_mask &= mask; + stts(); +} + 
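The mask computed by mxcsr_feature_mask_init() above lets the kernel clear MXCSR bits the CPU does not implement before user-supplied FP state is ever restored (loading reserved MXCSR bits would fault). A hedged illustration of that use, matching how mxcsr_feature_mask is applied to the fxsave area in the regset code later in this file; the helper name is an assumption, not part of the patch:

static inline u32 sanitize_mxcsr_sketch(u32 user_mxcsr)
{
	/* keep only the MXCSR bits the CPU reports as implemented */
	return user_mxcsr & mxcsr_feature_mask;
}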
+#ifdef CONFIG_X86_64 +/* + * Called at bootup to set up the initial FPU state that is later cloned + * into all processes. + */ +void __cpuinit fpu_init(void) +{ + unsigned long oldcr0 = read_cr0(); + extern void __bad_fxsave_alignment(void); + + if (offsetof(struct task_struct, thread.i387.fxsave) & 15) + __bad_fxsave_alignment(); + set_in_cr4(X86_CR4_OSFXSR); + set_in_cr4(X86_CR4_OSXMMEXCPT); + + write_cr0(oldcr0 & ~((1UL<<3)|(1UL<<2))); /* clear TS and EM */ + + mxcsr_feature_mask_init(); + /* clean state in init */ + current_thread_info()->status = 0; + clear_used_math(); +} +#endif /* CONFIG_X86_64 */ + +/* + * The _current_ task is using the FPU for the first time + * so initialize it and set the mxcsr to its default + * value at reset if we support XMM instructions and then + * remeber the current task has used the FPU. + */ +void init_fpu(struct task_struct *tsk) +{ + if (tsk_used_math(tsk)) { + if (tsk == current) + unlazy_fpu(tsk); + return; + } + + if (cpu_has_fxsr) { + memset(&tsk->thread.i387.fxsave, 0, + sizeof(struct i387_fxsave_struct)); + tsk->thread.i387.fxsave.cwd = 0x37f; + if (cpu_has_xmm) + tsk->thread.i387.fxsave.mxcsr = MXCSR_DEFAULT; + } else { + memset(&tsk->thread.i387.fsave, 0, + sizeof(struct i387_fsave_struct)); + tsk->thread.i387.fsave.cwd = 0xffff037fu; + tsk->thread.i387.fsave.swd = 0xffff0000u; + tsk->thread.i387.fsave.twd = 0xffffffffu; + tsk->thread.i387.fsave.fos = 0xffff0000u; + } + /* + * Only the device not available exception or ptrace can call init_fpu. + */ + set_stopped_child_used_math(tsk); +} + +int fpregs_active(struct task_struct *target, const struct user_regset *regset) +{ + return tsk_used_math(target) ? regset->n : 0; +} + +int xfpregs_active(struct task_struct *target, const struct user_regset *regset) +{ + return (cpu_has_fxsr && tsk_used_math(target)) ? regset->n : 0; +} + +int xfpregs_get(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + if (!cpu_has_fxsr) + return -ENODEV; + + unlazy_fpu(target); + + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &target->thread.i387.fxsave, 0, -1); +} + +int xfpregs_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret; + + if (!cpu_has_fxsr) + return -ENODEV; + + unlazy_fpu(target); + set_stopped_child_used_math(target); + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.i387.fxsave, 0, -1); + + /* + * mxcsr reserved bits must be masked to zero for security reasons. + */ + target->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask; + + return ret; +} + +#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION + +/* + * FPU tag word conversions. + */ + +static inline unsigned short twd_i387_to_fxsr(unsigned short twd) +{ + unsigned int tmp; /* to avoid 16 bit prefixes in the code */ + + /* Transform each pair of bits into 01 (valid) or 00 (empty) */ + tmp = ~twd; + tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */ + /* and move the valid bits to the lower byte. 
*/ + tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */ + tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */ + tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */ + return tmp; +} + +#define FPREG_ADDR(f, n) ((void *)&(f)->st_space + (n) * 16); +#define FP_EXP_TAG_VALID 0 +#define FP_EXP_TAG_ZERO 1 +#define FP_EXP_TAG_SPECIAL 2 +#define FP_EXP_TAG_EMPTY 3 + +static inline u32 twd_fxsr_to_i387(struct i387_fxsave_struct *fxsave) +{ + struct _fpxreg *st; + u32 tos = (fxsave->swd >> 11) & 7; + u32 twd = (unsigned long) fxsave->twd; + u32 tag; + u32 ret = 0xffff0000u; + int i; + + for (i = 0; i < 8; i++, twd >>= 1) { + if (twd & 0x1) { + st = FPREG_ADDR(fxsave, (i - tos) & 7); + + switch (st->exponent & 0x7fff) { + case 0x7fff: + tag = FP_EXP_TAG_SPECIAL; + break; + case 0x0000: + if (!st->significand[0] && + !st->significand[1] && + !st->significand[2] && + !st->significand[3]) + tag = FP_EXP_TAG_ZERO; + else + tag = FP_EXP_TAG_SPECIAL; + break; + default: + if (st->significand[3] & 0x8000) + tag = FP_EXP_TAG_VALID; + else + tag = FP_EXP_TAG_SPECIAL; + break; + } + } else { + tag = FP_EXP_TAG_EMPTY; + } + ret |= tag << (2 * i); + } + return ret; +} + +/* + * FXSR floating point environment conversions. + */ + +static void convert_from_fxsr(struct user_i387_ia32_struct *env, + struct task_struct *tsk) +{ + struct i387_fxsave_struct *fxsave = &tsk->thread.i387.fxsave; + struct _fpreg *to = (struct _fpreg *) &env->st_space[0]; + struct _fpxreg *from = (struct _fpxreg *) &fxsave->st_space[0]; + int i; + + env->cwd = fxsave->cwd | 0xffff0000u; + env->swd = fxsave->swd | 0xffff0000u; + env->twd = twd_fxsr_to_i387(fxsave); + +#ifdef CONFIG_X86_64 + env->fip = fxsave->rip; + env->foo = fxsave->rdp; + if (tsk == current) { + /* + * should be actually ds/cs at fpu exception time, but + * that information is not available in 64bit mode. 
+ */ + asm("mov %%ds,%0" : "=r" (env->fos)); + asm("mov %%cs,%0" : "=r" (env->fcs)); + } else { + struct pt_regs *regs = task_pt_regs(tsk); + env->fos = 0xffff0000 | tsk->thread.ds; + env->fcs = regs->cs; + } +#else + env->fip = fxsave->fip; + env->fcs = fxsave->fcs; + env->foo = fxsave->foo; + env->fos = fxsave->fos; +#endif + + for (i = 0; i < 8; ++i) + memcpy(&to[i], &from[i], sizeof(to[0])); +} + +static void convert_to_fxsr(struct task_struct *tsk, + const struct user_i387_ia32_struct *env) + +{ + struct i387_fxsave_struct *fxsave = &tsk->thread.i387.fxsave; + struct _fpreg *from = (struct _fpreg *) &env->st_space[0]; + struct _fpxreg *to = (struct _fpxreg *) &fxsave->st_space[0]; + int i; + + fxsave->cwd = env->cwd; + fxsave->swd = env->swd; + fxsave->twd = twd_i387_to_fxsr(env->twd); + fxsave->fop = (u16) ((u32) env->fcs >> 16); +#ifdef CONFIG_X86_64 + fxsave->rip = env->fip; + fxsave->rdp = env->foo; + /* cs and ds ignored */ +#else + fxsave->fip = env->fip; + fxsave->fcs = (env->fcs & 0xffff); + fxsave->foo = env->foo; + fxsave->fos = env->fos; +#endif + + for (i = 0; i < 8; ++i) + memcpy(&to[i], &from[i], sizeof(from[0])); +} + +int fpregs_get(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + struct user_i387_ia32_struct env; + + if (!HAVE_HWFP) + return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf); + + unlazy_fpu(target); + + if (!cpu_has_fxsr) + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &target->thread.i387.fsave, 0, -1); + + if (kbuf && pos == 0 && count == sizeof(env)) { + convert_from_fxsr(kbuf, target); + return 0; + } + + convert_from_fxsr(&env, target); + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &env, 0, -1); +} + +int fpregs_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + struct user_i387_ia32_struct env; + int ret; + + if (!HAVE_HWFP) + return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf); + + unlazy_fpu(target); + set_stopped_child_used_math(target); + + if (!cpu_has_fxsr) + return user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.i387.fsave, 0, -1); + + if (pos > 0 || count < sizeof(env)) + convert_from_fxsr(&env, target); + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &env, 0, -1); + if (!ret) + convert_to_fxsr(target, &env); + + return ret; +} + +/* + * Signal frame handlers. 
+ */ + +static inline int save_i387_fsave(struct _fpstate_ia32 __user *buf) +{ + struct task_struct *tsk = current; + + unlazy_fpu(tsk); + tsk->thread.i387.fsave.status = tsk->thread.i387.fsave.swd; + if (__copy_to_user(buf, &tsk->thread.i387.fsave, + sizeof(struct i387_fsave_struct))) + return -1; + return 1; +} + +static int save_i387_fxsave(struct _fpstate_ia32 __user *buf) +{ + struct task_struct *tsk = current; + struct user_i387_ia32_struct env; + int err = 0; + + unlazy_fpu(tsk); + + convert_from_fxsr(&env, tsk); + if (__copy_to_user(buf, &env, sizeof(env))) + return -1; + + err |= __put_user(tsk->thread.i387.fxsave.swd, &buf->status); + err |= __put_user(X86_FXSR_MAGIC, &buf->magic); + if (err) + return -1; + + if (__copy_to_user(&buf->_fxsr_env[0], &tsk->thread.i387.fxsave, + sizeof(struct i387_fxsave_struct))) + return -1; + return 1; +} + +int save_i387_ia32(struct _fpstate_ia32 __user *buf) +{ + if (!used_math()) + return 0; + + /* This will cause a "finit" to be triggered by the next + * attempted FPU operation by the 'current' process. + */ + clear_used_math(); + + if (HAVE_HWFP) { + if (cpu_has_fxsr) { + return save_i387_fxsave(buf); + } else { + return save_i387_fsave(buf); + } + } else { + return fpregs_soft_get(current, NULL, + 0, sizeof(struct user_i387_ia32_struct), + NULL, buf) ? -1 : 1; + } +} + +static inline int restore_i387_fsave(struct _fpstate_ia32 __user *buf) +{ + struct task_struct *tsk = current; + clear_fpu(tsk); + return __copy_from_user(&tsk->thread.i387.fsave, buf, + sizeof(struct i387_fsave_struct)); +} + +static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf) +{ + int err; + struct task_struct *tsk = current; + struct user_i387_ia32_struct env; + clear_fpu(tsk); + err = __copy_from_user(&tsk->thread.i387.fxsave, &buf->_fxsr_env[0], + sizeof(struct i387_fxsave_struct)); + /* mxcsr reserved bits must be masked to zero for security reasons */ + tsk->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask; + if (err || __copy_from_user(&env, buf, sizeof(env))) + return 1; + convert_to_fxsr(tsk, &env); + return 0; +} + +int restore_i387_ia32(struct _fpstate_ia32 __user *buf) +{ + int err; + + if (HAVE_HWFP) { + if (cpu_has_fxsr) { + err = restore_i387_fxsave(buf); + } else { + err = restore_i387_fsave(buf); + } + } else { + err = fpregs_soft_set(current, NULL, + 0, sizeof(struct user_i387_ia32_struct), + NULL, buf) != 0; + } + set_used_math(); + return err; +} + +/* + * FPU state for core dumps. + * This is only used for a.out dumps now. + * It is declared generically using elf_fpregset_t (which is + * struct user_i387_struct) but is in fact only used for 32-bit + * dumps, so on 64-bit it is really struct user_i387_ia32_struct. 
+ */ +int dump_fpu(struct pt_regs *regs, struct user_i387_struct *fpu) +{ + int fpvalid; + struct task_struct *tsk = current; + + fpvalid = !!used_math(); + if (fpvalid) + fpvalid = !fpregs_get(tsk, NULL, + 0, sizeof(struct user_i387_ia32_struct), + fpu, NULL); + + return fpvalid; +} +EXPORT_SYMBOL(dump_fpu); + +#endif /* CONFIG_X86_32 || CONFIG_IA32_EMULATION */ diff --git a/arch/x86/kernel/i387_32.c b/arch/x86/kernel/i387_32.c deleted file mode 100644 index 7d2e12f6c78b..000000000000 --- a/arch/x86/kernel/i387_32.c +++ /dev/null @@ -1,544 +0,0 @@ -/* - * Copyright (C) 1994 Linus Torvalds - * - * Pentium III FXSR, SSE support - * General FPU state handling cleanups - * Gareth Hughes <gareth@valinux.com>, May 2000 - */ - -#include <linux/sched.h> -#include <linux/module.h> -#include <asm/processor.h> -#include <asm/i387.h> -#include <asm/math_emu.h> -#include <asm/sigcontext.h> -#include <asm/user.h> -#include <asm/ptrace.h> -#include <asm/uaccess.h> - -#ifdef CONFIG_MATH_EMULATION -#define HAVE_HWFP (boot_cpu_data.hard_math) -#else -#define HAVE_HWFP 1 -#endif - -static unsigned long mxcsr_feature_mask __read_mostly = 0xffffffff; - -void mxcsr_feature_mask_init(void) -{ - unsigned long mask = 0; - clts(); - if (cpu_has_fxsr) { - memset(¤t->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct)); - asm volatile("fxsave %0" : : "m" (current->thread.i387.fxsave)); - mask = current->thread.i387.fxsave.mxcsr_mask; - if (mask == 0) mask = 0x0000ffbf; - } - mxcsr_feature_mask &= mask; - stts(); -} - -/* - * The _current_ task is using the FPU for the first time - * so initialize it and set the mxcsr to its default - * value at reset if we support XMM instructions and then - * remeber the current task has used the FPU. - */ -void init_fpu(struct task_struct *tsk) -{ - if (cpu_has_fxsr) { - memset(&tsk->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct)); - tsk->thread.i387.fxsave.cwd = 0x37f; - if (cpu_has_xmm) - tsk->thread.i387.fxsave.mxcsr = 0x1f80; - } else { - memset(&tsk->thread.i387.fsave, 0, sizeof(struct i387_fsave_struct)); - tsk->thread.i387.fsave.cwd = 0xffff037fu; - tsk->thread.i387.fsave.swd = 0xffff0000u; - tsk->thread.i387.fsave.twd = 0xffffffffu; - tsk->thread.i387.fsave.fos = 0xffff0000u; - } - /* only the device not available exception or ptrace can call init_fpu */ - set_stopped_child_used_math(tsk); -} - -/* - * FPU lazy state save handling. - */ - -void kernel_fpu_begin(void) -{ - struct thread_info *thread = current_thread_info(); - - preempt_disable(); - if (thread->status & TS_USEDFPU) { - __save_init_fpu(thread->task); - return; - } - clts(); -} -EXPORT_SYMBOL_GPL(kernel_fpu_begin); - -/* - * FPU tag word conversions. - */ - -static inline unsigned short twd_i387_to_fxsr( unsigned short twd ) -{ - unsigned int tmp; /* to avoid 16 bit prefixes in the code */ - - /* Transform each pair of bits into 01 (valid) or 00 (empty) */ - tmp = ~twd; - tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */ - /* and move the valid bits to the lower byte. 
*/ - tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */ - tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */ - tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */ - return tmp; -} - -static inline unsigned long twd_fxsr_to_i387( struct i387_fxsave_struct *fxsave ) -{ - struct _fpxreg *st = NULL; - unsigned long tos = (fxsave->swd >> 11) & 7; - unsigned long twd = (unsigned long) fxsave->twd; - unsigned long tag; - unsigned long ret = 0xffff0000u; - int i; - -#define FPREG_ADDR(f, n) ((void *)&(f)->st_space + (n) * 16); - - for ( i = 0 ; i < 8 ; i++ ) { - if ( twd & 0x1 ) { - st = FPREG_ADDR( fxsave, (i - tos) & 7 ); - - switch ( st->exponent & 0x7fff ) { - case 0x7fff: - tag = 2; /* Special */ - break; - case 0x0000: - if ( !st->significand[0] && - !st->significand[1] && - !st->significand[2] && - !st->significand[3] ) { - tag = 1; /* Zero */ - } else { - tag = 2; /* Special */ - } - break; - default: - if ( st->significand[3] & 0x8000 ) { - tag = 0; /* Valid */ - } else { - tag = 2; /* Special */ - } - break; - } - } else { - tag = 3; /* Empty */ - } - ret |= (tag << (2 * i)); - twd = twd >> 1; - } - return ret; -} - -/* - * FPU state interaction. - */ - -unsigned short get_fpu_cwd( struct task_struct *tsk ) -{ - if ( cpu_has_fxsr ) { - return tsk->thread.i387.fxsave.cwd; - } else { - return (unsigned short)tsk->thread.i387.fsave.cwd; - } -} - -unsigned short get_fpu_swd( struct task_struct *tsk ) -{ - if ( cpu_has_fxsr ) { - return tsk->thread.i387.fxsave.swd; - } else { - return (unsigned short)tsk->thread.i387.fsave.swd; - } -} - -#if 0 -unsigned short get_fpu_twd( struct task_struct *tsk ) -{ - if ( cpu_has_fxsr ) { - return tsk->thread.i387.fxsave.twd; - } else { - return (unsigned short)tsk->thread.i387.fsave.twd; - } -} -#endif /* 0 */ - -unsigned short get_fpu_mxcsr( struct task_struct *tsk ) -{ - if ( cpu_has_xmm ) { - return tsk->thread.i387.fxsave.mxcsr; - } else { - return 0x1f80; - } -} - -#if 0 - -void set_fpu_cwd( struct task_struct *tsk, unsigned short cwd ) -{ - if ( cpu_has_fxsr ) { - tsk->thread.i387.fxsave.cwd = cwd; - } else { - tsk->thread.i387.fsave.cwd = ((long)cwd | 0xffff0000u); - } -} - -void set_fpu_swd( struct task_struct *tsk, unsigned short swd ) -{ - if ( cpu_has_fxsr ) { - tsk->thread.i387.fxsave.swd = swd; - } else { - tsk->thread.i387.fsave.swd = ((long)swd | 0xffff0000u); - } -} - -void set_fpu_twd( struct task_struct *tsk, unsigned short twd ) -{ - if ( cpu_has_fxsr ) { - tsk->thread.i387.fxsave.twd = twd_i387_to_fxsr(twd); - } else { - tsk->thread.i387.fsave.twd = ((long)twd | 0xffff0000u); - } -} - -#endif /* 0 */ - -/* - * FXSR floating point environment conversions. 
- */ - -static int convert_fxsr_to_user( struct _fpstate __user *buf, - struct i387_fxsave_struct *fxsave ) -{ - unsigned long env[7]; - struct _fpreg __user *to; - struct _fpxreg *from; - int i; - - env[0] = (unsigned long)fxsave->cwd | 0xffff0000ul; - env[1] = (unsigned long)fxsave->swd | 0xffff0000ul; - env[2] = twd_fxsr_to_i387(fxsave); - env[3] = fxsave->fip; - env[4] = fxsave->fcs | ((unsigned long)fxsave->fop << 16); - env[5] = fxsave->foo; - env[6] = fxsave->fos; - - if ( __copy_to_user( buf, env, 7 * sizeof(unsigned long) ) ) - return 1; - - to = &buf->_st[0]; - from = (struct _fpxreg *) &fxsave->st_space[0]; - for ( i = 0 ; i < 8 ; i++, to++, from++ ) { - unsigned long __user *t = (unsigned long __user *)to; - unsigned long *f = (unsigned long *)from; - - if (__put_user(*f, t) || - __put_user(*(f + 1), t + 1) || - __put_user(from->exponent, &to->exponent)) - return 1; - } - return 0; -} - -static int convert_fxsr_from_user( struct i387_fxsave_struct *fxsave, - struct _fpstate __user *buf ) -{ - unsigned long env[7]; - struct _fpxreg *to; - struct _fpreg __user *from; - int i; - - if ( __copy_from_user( env, buf, 7 * sizeof(long) ) ) - return 1; - - fxsave->cwd = (unsigned short)(env[0] & 0xffff); - fxsave->swd = (unsigned short)(env[1] & 0xffff); - fxsave->twd = twd_i387_to_fxsr((unsigned short)(env[2] & 0xffff)); - fxsave->fip = env[3]; - fxsave->fop = (unsigned short)((env[4] & 0xffff0000ul) >> 16); - fxsave->fcs = (env[4] & 0xffff); - fxsave->foo = env[5]; - fxsave->fos = env[6]; - - to = (struct _fpxreg *) &fxsave->st_space[0]; - from = &buf->_st[0]; - for ( i = 0 ; i < 8 ; i++, to++, from++ ) { - unsigned long *t = (unsigned long *)to; - unsigned long __user *f = (unsigned long __user *)from; - - if (__get_user(*t, f) || - __get_user(*(t + 1), f + 1) || - __get_user(to->exponent, &from->exponent)) - return 1; - } - return 0; -} - -/* - * Signal frame handlers. - */ - -static inline int save_i387_fsave( struct _fpstate __user *buf ) -{ - struct task_struct *tsk = current; - - unlazy_fpu( tsk ); - tsk->thread.i387.fsave.status = tsk->thread.i387.fsave.swd; - if ( __copy_to_user( buf, &tsk->thread.i387.fsave, - sizeof(struct i387_fsave_struct) ) ) - return -1; - return 1; -} - -static int save_i387_fxsave( struct _fpstate __user *buf ) -{ - struct task_struct *tsk = current; - int err = 0; - - unlazy_fpu( tsk ); - - if ( convert_fxsr_to_user( buf, &tsk->thread.i387.fxsave ) ) - return -1; - - err |= __put_user( tsk->thread.i387.fxsave.swd, &buf->status ); - err |= __put_user( X86_FXSR_MAGIC, &buf->magic ); - if ( err ) - return -1; - - if ( __copy_to_user( &buf->_fxsr_env[0], &tsk->thread.i387.fxsave, - sizeof(struct i387_fxsave_struct) ) ) - return -1; - return 1; -} - -int save_i387( struct _fpstate __user *buf ) -{ - if ( !used_math() ) - return 0; - - /* This will cause a "finit" to be triggered by the next - * attempted FPU operation by the 'current' process. 
- */ - clear_used_math(); - - if ( HAVE_HWFP ) { - if ( cpu_has_fxsr ) { - return save_i387_fxsave( buf ); - } else { - return save_i387_fsave( buf ); - } - } else { - return save_i387_soft( ¤t->thread.i387.soft, buf ); - } -} - -static inline int restore_i387_fsave( struct _fpstate __user *buf ) -{ - struct task_struct *tsk = current; - clear_fpu( tsk ); - return __copy_from_user( &tsk->thread.i387.fsave, buf, - sizeof(struct i387_fsave_struct) ); -} - -static int restore_i387_fxsave( struct _fpstate __user *buf ) -{ - int err; - struct task_struct *tsk = current; - clear_fpu( tsk ); - err = __copy_from_user( &tsk->thread.i387.fxsave, &buf->_fxsr_env[0], - sizeof(struct i387_fxsave_struct) ); - /* mxcsr reserved bits must be masked to zero for security reasons */ - tsk->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask; - return err ? 1 : convert_fxsr_from_user( &tsk->thread.i387.fxsave, buf ); -} - -int restore_i387( struct _fpstate __user *buf ) -{ - int err; - - if ( HAVE_HWFP ) { - if ( cpu_has_fxsr ) { - err = restore_i387_fxsave( buf ); - } else { - err = restore_i387_fsave( buf ); - } - } else { - err = restore_i387_soft( ¤t->thread.i387.soft, buf ); - } - set_used_math(); - return err; -} - -/* - * ptrace request handlers. - */ - -static inline int get_fpregs_fsave( struct user_i387_struct __user *buf, - struct task_struct *tsk ) -{ - return __copy_to_user( buf, &tsk->thread.i387.fsave, - sizeof(struct user_i387_struct) ); -} - -static inline int get_fpregs_fxsave( struct user_i387_struct __user *buf, - struct task_struct *tsk ) -{ - return convert_fxsr_to_user( (struct _fpstate __user *)buf, - &tsk->thread.i387.fxsave ); -} - -int get_fpregs( struct user_i387_struct __user *buf, struct task_struct *tsk ) -{ - if ( HAVE_HWFP ) { - if ( cpu_has_fxsr ) { - return get_fpregs_fxsave( buf, tsk ); - } else { - return get_fpregs_fsave( buf, tsk ); - } - } else { - return save_i387_soft( &tsk->thread.i387.soft, - (struct _fpstate __user *)buf ); - } -} - -static inline int set_fpregs_fsave( struct task_struct *tsk, - struct user_i387_struct __user *buf ) -{ - return __copy_from_user( &tsk->thread.i387.fsave, buf, - sizeof(struct user_i387_struct) ); -} - -static inline int set_fpregs_fxsave( struct task_struct *tsk, - struct user_i387_struct __user *buf ) -{ - return convert_fxsr_from_user( &tsk->thread.i387.fxsave, - (struct _fpstate __user *)buf ); -} - -int set_fpregs( struct task_struct *tsk, struct user_i387_struct __user *buf ) -{ - if ( HAVE_HWFP ) { - if ( cpu_has_fxsr ) { - return set_fpregs_fxsave( tsk, buf ); - } else { - return set_fpregs_fsave( tsk, buf ); - } - } else { - return restore_i387_soft( &tsk->thread.i387.soft, - (struct _fpstate __user *)buf ); - } -} - -int get_fpxregs( struct user_fxsr_struct __user *buf, struct task_struct *tsk ) -{ - if ( cpu_has_fxsr ) { - if (__copy_to_user( buf, &tsk->thread.i387.fxsave, - sizeof(struct user_fxsr_struct) )) - return -EFAULT; - return 0; - } else { - return -EIO; - } -} - -int set_fpxregs( struct task_struct *tsk, struct user_fxsr_struct __user *buf ) -{ - int ret = 0; - - if ( cpu_has_fxsr ) { - if (__copy_from_user( &tsk->thread.i387.fxsave, buf, - sizeof(struct user_fxsr_struct) )) - ret = -EFAULT; - /* mxcsr reserved bits must be masked to zero for security reasons */ - tsk->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask; - } else { - ret = -EIO; - } - return ret; -} - -/* - * FPU state for core dumps. 
- */ - -static inline void copy_fpu_fsave( struct task_struct *tsk, - struct user_i387_struct *fpu ) -{ - memcpy( fpu, &tsk->thread.i387.fsave, - sizeof(struct user_i387_struct) ); -} - -static inline void copy_fpu_fxsave( struct task_struct *tsk, - struct user_i387_struct *fpu ) -{ - unsigned short *to; - unsigned short *from; - int i; - - memcpy( fpu, &tsk->thread.i387.fxsave, 7 * sizeof(long) ); - - to = (unsigned short *)&fpu->st_space[0]; - from = (unsigned short *)&tsk->thread.i387.fxsave.st_space[0]; - for ( i = 0 ; i < 8 ; i++, to += 5, from += 8 ) { - memcpy( to, from, 5 * sizeof(unsigned short) ); - } -} - -int dump_fpu( struct pt_regs *regs, struct user_i387_struct *fpu ) -{ - int fpvalid; - struct task_struct *tsk = current; - - fpvalid = !!used_math(); - if ( fpvalid ) { - unlazy_fpu( tsk ); - if ( cpu_has_fxsr ) { - copy_fpu_fxsave( tsk, fpu ); - } else { - copy_fpu_fsave( tsk, fpu ); - } - } - - return fpvalid; -} -EXPORT_SYMBOL(dump_fpu); - -int dump_task_fpu(struct task_struct *tsk, struct user_i387_struct *fpu) -{ - int fpvalid = !!tsk_used_math(tsk); - - if (fpvalid) { - if (tsk == current) - unlazy_fpu(tsk); - if (cpu_has_fxsr) - copy_fpu_fxsave(tsk, fpu); - else - copy_fpu_fsave(tsk, fpu); - } - return fpvalid; -} - -int dump_task_extended_fpu(struct task_struct *tsk, struct user_fxsr_struct *fpu) -{ - int fpvalid = tsk_used_math(tsk) && cpu_has_fxsr; - - if (fpvalid) { - if (tsk == current) - unlazy_fpu(tsk); - memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(*fpu)); - } - return fpvalid; -} diff --git a/arch/x86/kernel/i387_64.c b/arch/x86/kernel/i387_64.c deleted file mode 100644 index bfaff28fb134..000000000000 --- a/arch/x86/kernel/i387_64.c +++ /dev/null @@ -1,150 +0,0 @@ -/* - * Copyright (C) 1994 Linus Torvalds - * Copyright (C) 2002 Andi Kleen, SuSE Labs - * - * Pentium III FXSR, SSE support - * General FPU state handling cleanups - * Gareth Hughes <gareth@valinux.com>, May 2000 - * - * x86-64 rework 2002 Andi Kleen. - * Does direct fxsave in and out of user space now for signal handlers. - * All the FSAVE<->FXSAVE conversion code has been moved to the 32bit emulation, - * the 64bit user space sees a FXSAVE frame directly. - */ - -#include <linux/sched.h> -#include <linux/init.h> -#include <asm/processor.h> -#include <asm/i387.h> -#include <asm/sigcontext.h> -#include <asm/user.h> -#include <asm/ptrace.h> -#include <asm/uaccess.h> - -unsigned int mxcsr_feature_mask __read_mostly = 0xffffffff; - -void mxcsr_feature_mask_init(void) -{ - unsigned int mask; - clts(); - memset(¤t->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct)); - asm volatile("fxsave %0" : : "m" (current->thread.i387.fxsave)); - mask = current->thread.i387.fxsave.mxcsr_mask; - if (mask == 0) mask = 0x0000ffbf; - mxcsr_feature_mask &= mask; - stts(); -} - -/* - * Called at bootup to set up the initial FPU state that is later cloned - * into all processes. 
- */ -void __cpuinit fpu_init(void) -{ - unsigned long oldcr0 = read_cr0(); - extern void __bad_fxsave_alignment(void); - - if (offsetof(struct task_struct, thread.i387.fxsave) & 15) - __bad_fxsave_alignment(); - set_in_cr4(X86_CR4_OSFXSR); - set_in_cr4(X86_CR4_OSXMMEXCPT); - - write_cr0(oldcr0 & ~((1UL<<3)|(1UL<<2))); /* clear TS and EM */ - - mxcsr_feature_mask_init(); - /* clean state in init */ - current_thread_info()->status = 0; - clear_used_math(); -} - -void init_fpu(struct task_struct *child) -{ - if (tsk_used_math(child)) { - if (child == current) - unlazy_fpu(child); - return; - } - memset(&child->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct)); - child->thread.i387.fxsave.cwd = 0x37f; - child->thread.i387.fxsave.mxcsr = 0x1f80; - /* only the device not available exception or ptrace can call init_fpu */ - set_stopped_child_used_math(child); -} - -/* - * Signal frame handlers. - */ - -int save_i387(struct _fpstate __user *buf) -{ - struct task_struct *tsk = current; - int err = 0; - - BUILD_BUG_ON(sizeof(struct user_i387_struct) != - sizeof(tsk->thread.i387.fxsave)); - - if ((unsigned long)buf % 16) - printk("save_i387: bad fpstate %p\n",buf); - - if (!used_math()) - return 0; - clear_used_math(); /* trigger finit */ - if (task_thread_info(tsk)->status & TS_USEDFPU) { - err = save_i387_checking((struct i387_fxsave_struct __user *)buf); - if (err) return err; - task_thread_info(tsk)->status &= ~TS_USEDFPU; - stts(); - } else { - if (__copy_to_user(buf, &tsk->thread.i387.fxsave, - sizeof(struct i387_fxsave_struct))) - return -1; - } - return 1; -} - -/* - * ptrace request handlers. - */ - -int get_fpregs(struct user_i387_struct __user *buf, struct task_struct *tsk) -{ - init_fpu(tsk); - return __copy_to_user(buf, &tsk->thread.i387.fxsave, - sizeof(struct user_i387_struct)) ? -EFAULT : 0; -} - -int set_fpregs(struct task_struct *tsk, struct user_i387_struct __user *buf) -{ - if (__copy_from_user(&tsk->thread.i387.fxsave, buf, - sizeof(struct user_i387_struct))) - return -EFAULT; - return 0; -} - -/* - * FPU state for core dumps. - */ - -int dump_fpu( struct pt_regs *regs, struct user_i387_struct *fpu ) -{ - struct task_struct *tsk = current; - - if (!used_math()) - return 0; - - unlazy_fpu(tsk); - memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(struct user_i387_struct)); - return 1; -} - -int dump_task_fpu(struct task_struct *tsk, struct user_i387_struct *fpu) -{ - int fpvalid = !!tsk_used_math(tsk); - - if (fpvalid) { - if (tsk == current) - unlazy_fpu(tsk); - memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(struct user_i387_struct)); -} - return fpvalid; -} diff --git a/arch/x86/kernel/i8237.c b/arch/x86/kernel/i8237.c index 29313832df0c..dbd6c1d1b638 100644 --- a/arch/x86/kernel/i8237.c +++ b/arch/x86/kernel/i8237.c @@ -51,7 +51,7 @@ static int i8237A_suspend(struct sys_device *dev, pm_message_t state) } static struct sysdev_class i8237_sysdev_class = { - set_kset_name("i8237"), + .name = "i8237", .suspend = i8237A_suspend, .resume = i8237A_resume, }; diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c index a42c80745325..ef62b07b2b48 100644 --- a/arch/x86/kernel/i8253.c +++ b/arch/x86/kernel/i8253.c @@ -13,10 +13,17 @@ #include <asm/delay.h> #include <asm/i8253.h> #include <asm/io.h> +#include <asm/hpet.h> DEFINE_SPINLOCK(i8253_lock); EXPORT_SYMBOL(i8253_lock); +#ifdef CONFIG_X86_32 +static void pit_disable_clocksource(void); +#else +static inline void pit_disable_clocksource(void) { } +#endif + /* * HPET replaces the PIT, when enabled. 
So we need to know, which of * the two timers is used @@ -31,38 +38,38 @@ struct clock_event_device *global_clock_event; static void init_pit_timer(enum clock_event_mode mode, struct clock_event_device *evt) { - unsigned long flags; - - spin_lock_irqsave(&i8253_lock, flags); + spin_lock(&i8253_lock); switch(mode) { case CLOCK_EVT_MODE_PERIODIC: /* binary, mode 2, LSB/MSB, ch 0 */ - outb_p(0x34, PIT_MODE); - outb_p(LATCH & 0xff , PIT_CH0); /* LSB */ - outb(LATCH >> 8 , PIT_CH0); /* MSB */ + outb_pit(0x34, PIT_MODE); + outb_pit(LATCH & 0xff , PIT_CH0); /* LSB */ + outb_pit(LATCH >> 8 , PIT_CH0); /* MSB */ break; case CLOCK_EVT_MODE_SHUTDOWN: case CLOCK_EVT_MODE_UNUSED: if (evt->mode == CLOCK_EVT_MODE_PERIODIC || evt->mode == CLOCK_EVT_MODE_ONESHOT) { - outb_p(0x30, PIT_MODE); - outb_p(0, PIT_CH0); - outb_p(0, PIT_CH0); + outb_pit(0x30, PIT_MODE); + outb_pit(0, PIT_CH0); + outb_pit(0, PIT_CH0); } + pit_disable_clocksource(); break; case CLOCK_EVT_MODE_ONESHOT: /* One shot setup */ - outb_p(0x38, PIT_MODE); + pit_disable_clocksource(); + outb_pit(0x38, PIT_MODE); break; case CLOCK_EVT_MODE_RESUME: /* Nothing to do here */ break; } - spin_unlock_irqrestore(&i8253_lock, flags); + spin_unlock(&i8253_lock); } /* @@ -72,12 +79,10 @@ static void init_pit_timer(enum clock_event_mode mode, */ static int pit_next_event(unsigned long delta, struct clock_event_device *evt) { - unsigned long flags; - - spin_lock_irqsave(&i8253_lock, flags); - outb_p(delta & 0xff , PIT_CH0); /* LSB */ - outb(delta >> 8 , PIT_CH0); /* MSB */ - spin_unlock_irqrestore(&i8253_lock, flags); + spin_lock(&i8253_lock); + outb_pit(delta & 0xff , PIT_CH0); /* LSB */ + outb_pit(delta >> 8 , PIT_CH0); /* MSB */ + spin_unlock(&i8253_lock); return 0; } @@ -148,15 +153,15 @@ static cycle_t pit_read(void) * count), it cannot be newer. */ jifs = jiffies; - outb_p(0x00, PIT_MODE); /* latch the count ASAP */ - count = inb_p(PIT_CH0); /* read the latched count */ - count |= inb_p(PIT_CH0) << 8; + outb_pit(0x00, PIT_MODE); /* latch the count ASAP */ + count = inb_pit(PIT_CH0); /* read the latched count */ + count |= inb_pit(PIT_CH0) << 8; /* VIA686a test code... reset the latch if count > max + 1 */ if (count > LATCH) { - outb_p(0x34, PIT_MODE); - outb_p(LATCH & 0xff, PIT_CH0); - outb(LATCH >> 8, PIT_CH0); + outb_pit(0x34, PIT_MODE); + outb_pit(LATCH & 0xff, PIT_CH0); + outb_pit(LATCH >> 8, PIT_CH0); count = LATCH - 1; } @@ -195,9 +200,28 @@ static struct clocksource clocksource_pit = { .shift = 20, }; +static void pit_disable_clocksource(void) +{ + /* + * Use mult to check whether it is registered or not + */ + if (clocksource_pit.mult) { + clocksource_unregister(&clocksource_pit); + clocksource_pit.mult = 0; + } +} + static int __init init_pit_clocksource(void) { - if (num_possible_cpus() > 1) /* PIT does not scale! 
*/ + /* + * Several reasons not to register PIT as a clocksource: + * + * - On SMP PIT does not scale due to i8253_lock + * - when HPET is enabled + * - when local APIC timer is active (PIT is switched off) + */ + if (num_possible_cpus() > 1 || is_hpet_enabled() || + pit_clockevent.mode != CLOCK_EVT_MODE_PERIODIC) return 0; clocksource_pit.mult = clocksource_hz2mult(CLOCK_TICK_RATE, 20); diff --git a/arch/x86/kernel/i8259_32.c b/arch/x86/kernel/i8259_32.c index f634fc715c99..2d25b77102fe 100644 --- a/arch/x86/kernel/i8259_32.c +++ b/arch/x86/kernel/i8259_32.c @@ -21,8 +21,6 @@ #include <asm/arch_hooks.h> #include <asm/i8259.h> -#include <io_ports.h> - /* * This is the 'legacy' 8259A Programmable Interrupt Controller, * present in the majority of PC/AT boxes. @@ -258,7 +256,7 @@ static int i8259A_shutdown(struct sys_device *dev) } static struct sysdev_class i8259_sysdev_class = { - set_kset_name("i8259"), + .name = "i8259", .suspend = i8259A_suspend, .resume = i8259A_resume, .shutdown = i8259A_shutdown, @@ -291,20 +289,20 @@ void init_8259A(int auto_eoi) outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ /* - * outb_p - this has to work on a wide range of PC hardware. + * outb_pic - this has to work on a wide range of PC hardware. */ - outb_p(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */ - outb_p(0x20 + 0, PIC_MASTER_IMR); /* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */ - outb_p(1U << PIC_CASCADE_IR, PIC_MASTER_IMR); /* 8259A-1 (the master) has a slave on IR2 */ + outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */ + outb_pic(0x20 + 0, PIC_MASTER_IMR); /* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */ + outb_pic(1U << PIC_CASCADE_IR, PIC_MASTER_IMR); /* 8259A-1 (the master) has a slave on IR2 */ if (auto_eoi) /* master does Auto EOI */ - outb_p(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR); + outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR); else /* master expects normal EOI */ - outb_p(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR); + outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR); - outb_p(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */ - outb_p(0x20 + 8, PIC_SLAVE_IMR); /* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */ - outb_p(PIC_CASCADE_IR, PIC_SLAVE_IMR); /* 8259A-2 is a slave on master's IR2 */ - outb_p(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); /* (slave's support for AEOI in flat mode is to be investigated) */ + outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */ + outb_pic(0x20 + 8, PIC_SLAVE_IMR); /* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */ + outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR); /* 8259A-2 is a slave on master's IR2 */ + outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); /* (slave's support for AEOI in flat mode is to be investigated) */ if (auto_eoi) /* * In AEOI mode we just have to mask the interrupt @@ -341,7 +339,7 @@ static irqreturn_t math_error_irq(int cpl, void *dev_id) outb(0,0xF0); if (ignore_fpu_irq || !boot_cpu_data.hard_math) return IRQ_NONE; - math_error((void __user *)get_irq_regs()->eip); + math_error((void __user *)get_irq_regs()->ip); return IRQ_HANDLED; } diff --git a/arch/x86/kernel/i8259_64.c b/arch/x86/kernel/i8259_64.c index 3f27ea0b9816..fa57a1568508 100644 --- a/arch/x86/kernel/i8259_64.c +++ b/arch/x86/kernel/i8259_64.c @@ -21,6 +21,7 @@ #include <asm/delay.h> #include <asm/desc.h> #include <asm/apic.h> +#include <asm/i8259.h> /* * Common place to define all x86 IRQ vectors @@ -48,7 +49,7 @@ */ /* - * The IO-APIC gives us many more interrupt sources. Most of these + * The IO-APIC gives us many more interrupt sources. 
Most of these * are unused but an SMP system is supposed to have enough memory ... * sometimes (mostly wrt. hw bugs) we get corrupted vectors all * across the spectrum, so we really want to be prepared to get all @@ -76,7 +77,7 @@ BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf) IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f) /* for the irq vectors */ -static void (*interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = { +static void (*__initdata interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = { IRQLIST_16(0x2), IRQLIST_16(0x3), IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7), IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb), @@ -114,11 +115,7 @@ static struct irq_chip i8259A_chip = { /* * This contains the irq mask for both 8259A irq controllers, */ -static unsigned int cached_irq_mask = 0xffff; - -#define __byte(x,y) (((unsigned char *)&(y))[x]) -#define cached_21 (__byte(0,cached_irq_mask)) -#define cached_A1 (__byte(1,cached_irq_mask)) +unsigned int cached_irq_mask = 0xffff; /* * Not all IRQs can be routed through the IO-APIC, eg. on certain (older) @@ -139,9 +136,9 @@ void disable_8259A_irq(unsigned int irq) spin_lock_irqsave(&i8259A_lock, flags); cached_irq_mask |= mask; if (irq & 8) - outb(cached_A1,0xA1); + outb(cached_slave_mask, PIC_SLAVE_IMR); else - outb(cached_21,0x21); + outb(cached_master_mask, PIC_MASTER_IMR); spin_unlock_irqrestore(&i8259A_lock, flags); } @@ -153,9 +150,9 @@ void enable_8259A_irq(unsigned int irq) spin_lock_irqsave(&i8259A_lock, flags); cached_irq_mask &= mask; if (irq & 8) - outb(cached_A1,0xA1); + outb(cached_slave_mask, PIC_SLAVE_IMR); else - outb(cached_21,0x21); + outb(cached_master_mask, PIC_MASTER_IMR); spin_unlock_irqrestore(&i8259A_lock, flags); } @@ -167,9 +164,9 @@ int i8259A_irq_pending(unsigned int irq) spin_lock_irqsave(&i8259A_lock, flags); if (irq < 8) - ret = inb(0x20) & mask; + ret = inb(PIC_MASTER_CMD) & mask; else - ret = inb(0xA0) & (mask >> 8); + ret = inb(PIC_SLAVE_CMD) & (mask >> 8); spin_unlock_irqrestore(&i8259A_lock, flags); return ret; @@ -196,14 +193,14 @@ static inline int i8259A_irq_real(unsigned int irq) int irqmask = 1<<irq; if (irq < 8) { - outb(0x0B,0x20); /* ISR register */ - value = inb(0x20) & irqmask; - outb(0x0A,0x20); /* back to the IRR register */ + outb(0x0B,PIC_MASTER_CMD); /* ISR register */ + value = inb(PIC_MASTER_CMD) & irqmask; + outb(0x0A,PIC_MASTER_CMD); /* back to the IRR register */ return value; } - outb(0x0B,0xA0); /* ISR register */ - value = inb(0xA0) & (irqmask >> 8); - outb(0x0A,0xA0); /* back to the IRR register */ + outb(0x0B,PIC_SLAVE_CMD); /* ISR register */ + value = inb(PIC_SLAVE_CMD) & (irqmask >> 8); + outb(0x0A,PIC_SLAVE_CMD); /* back to the IRR register */ return value; } @@ -240,14 +237,17 @@ static void mask_and_ack_8259A(unsigned int irq) handle_real_irq: if (irq & 8) { - inb(0xA1); /* DUMMY - (do we need this?) */ - outb(cached_A1,0xA1); - outb(0x60+(irq&7),0xA0);/* 'Specific EOI' to slave */ - outb(0x62,0x20); /* 'Specific EOI' to master-IRQ2 */ + inb(PIC_SLAVE_IMR); /* DUMMY - (do we need this?) */ + outb(cached_slave_mask, PIC_SLAVE_IMR); + /* 'Specific EOI' to slave */ + outb(0x60+(irq&7),PIC_SLAVE_CMD); + /* 'Specific EOI' to master-IRQ2 */ + outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); } else { - inb(0x21); /* DUMMY - (do we need this?) */ - outb(cached_21,0x21); - outb(0x60+irq,0x20); /* 'Specific EOI' to master */ + inb(PIC_MASTER_IMR); /* DUMMY - (do we need this?) 
*/ + outb(cached_master_mask, PIC_MASTER_IMR); + /* 'Specific EOI' to master */ + outb(0x60+irq,PIC_MASTER_CMD); } spin_unlock_irqrestore(&i8259A_lock, flags); return; @@ -270,7 +270,8 @@ spurious_8259A_irq: * lets ACK and report it. [once per IRQ] */ if (!(spurious_irq_mask & irqmask)) { - printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq); + printk(KERN_DEBUG + "spurious 8259A interrupt: IRQ%d.\n", irq); spurious_irq_mask |= irqmask; } atomic_inc(&irq_err_count); @@ -283,51 +284,6 @@ spurious_8259A_irq: } } -void init_8259A(int auto_eoi) -{ - unsigned long flags; - - i8259A_auto_eoi = auto_eoi; - - spin_lock_irqsave(&i8259A_lock, flags); - - outb(0xff, 0x21); /* mask all of 8259A-1 */ - outb(0xff, 0xA1); /* mask all of 8259A-2 */ - - /* - * outb_p - this has to work on a wide range of PC hardware. - */ - outb_p(0x11, 0x20); /* ICW1: select 8259A-1 init */ - outb_p(IRQ0_VECTOR, 0x21); /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */ - outb_p(0x04, 0x21); /* 8259A-1 (the master) has a slave on IR2 */ - if (auto_eoi) - outb_p(0x03, 0x21); /* master does Auto EOI */ - else - outb_p(0x01, 0x21); /* master expects normal EOI */ - - outb_p(0x11, 0xA0); /* ICW1: select 8259A-2 init */ - outb_p(IRQ8_VECTOR, 0xA1); /* ICW2: 8259A-2 IR0-7 mapped to 0x38-0x3f */ - outb_p(0x02, 0xA1); /* 8259A-2 is a slave on master's IR2 */ - outb_p(0x01, 0xA1); /* (slave's support for AEOI in flat mode - is to be investigated) */ - - if (auto_eoi) - /* - * in AEOI mode we just have to mask the interrupt - * when acking. - */ - i8259A_chip.mask_ack = disable_8259A_irq; - else - i8259A_chip.mask_ack = mask_and_ack_8259A; - - udelay(100); /* wait for 8259A to initialize */ - - outb(cached_21, 0x21); /* restore master IRQ mask */ - outb(cached_A1, 0xA1); /* restore slave IRQ mask */ - - spin_unlock_irqrestore(&i8259A_lock, flags); -} - static char irq_trigger[2]; /** * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ @@ -364,13 +320,13 @@ static int i8259A_shutdown(struct sys_device *dev) * the kernel initialization code can get it * out of. */ - outb(0xff, 0x21); /* mask all of 8259A-1 */ - outb(0xff, 0xA1); /* mask all of 8259A-1 */ + outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ + outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-1 */ return 0; } static struct sysdev_class i8259_sysdev_class = { - set_kset_name("i8259"), + .name = "i8259", .suspend = i8259A_suspend, .resume = i8259A_resume, .shutdown = i8259A_shutdown, @@ -391,6 +347,58 @@ static int __init i8259A_init_sysfs(void) device_initcall(i8259A_init_sysfs); +void init_8259A(int auto_eoi) +{ + unsigned long flags; + + i8259A_auto_eoi = auto_eoi; + + spin_lock_irqsave(&i8259A_lock, flags); + + outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ + outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ + + /* + * outb_pic - this has to work on a wide range of PC hardware. 
+ */ + outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */ + /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */ + outb_pic(IRQ0_VECTOR, PIC_MASTER_IMR); + /* 8259A-1 (the master) has a slave on IR2 */ + outb_pic(0x04, PIC_MASTER_IMR); + if (auto_eoi) /* master does Auto EOI */ + outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR); + else /* master expects normal EOI */ + outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR); + + outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */ + /* ICW2: 8259A-2 IR0-7 mapped to 0x38-0x3f */ + outb_pic(IRQ8_VECTOR, PIC_SLAVE_IMR); + /* 8259A-2 is a slave on master's IR2 */ + outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR); + /* (slave's support for AEOI in flat mode is to be investigated) */ + outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); + + if (auto_eoi) + /* + * In AEOI mode we just have to mask the interrupt + * when acking. + */ + i8259A_chip.mask_ack = disable_8259A_irq; + else + i8259A_chip.mask_ack = mask_and_ack_8259A; + + udelay(100); /* wait for 8259A to initialize */ + + outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */ + outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */ + + spin_unlock_irqrestore(&i8259A_lock, flags); +} + + + + /* * IRQ2 is cascade interrupt to second interrupt controller */ @@ -448,7 +456,9 @@ void __init init_ISA_irqs (void) } } -void __init init_IRQ(void) +void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); + +void __init native_init_IRQ(void) { int i; diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c index 468c9c437842..5b3ce7934363 100644 --- a/arch/x86/kernel/init_task.c +++ b/arch/x86/kernel/init_task.c @@ -15,7 +15,6 @@ static struct files_struct init_files = INIT_FILES; static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); -EXPORT_SYMBOL(init_mm); /* * Initial thread structure. diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index a6b1490e00c4..4ca548632c8d 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -35,6 +35,7 @@ #include <linux/htirq.h> #include <linux/freezer.h> #include <linux/kthread.h> +#include <linux/jiffies.h> /* time_after() */ #include <asm/io.h> #include <asm/smp.h> @@ -48,8 +49,6 @@ #include <mach_apic.h> #include <mach_apicdef.h> -#include "io_ports.h" - int (*ioapic_renumber_irq)(int ioapic, int irq); atomic_t irq_mis_count; @@ -351,7 +350,7 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask) # include <asm/processor.h> /* kernel_thread() */ # include <linux/kernel_stat.h> /* kstat */ # include <linux/slab.h> /* kmalloc() */ -# include <linux/timer.h> /* time_after() */ +# include <linux/timer.h> #define IRQBALANCE_CHECK_ARCH -999 #define MAX_BALANCED_IRQ_INTERVAL (5*HZ) @@ -727,7 +726,7 @@ late_initcall(balanced_irq_init); #endif /* CONFIG_SMP */ #ifndef CONFIG_SMP -void fastcall send_IPI_self(int vector) +void send_IPI_self(int vector) { unsigned int cfg; @@ -1900,7 +1899,7 @@ static int __init timer_irq_works(void) * might have cached one ExtINT interrupt. Finally, at * least one tick may be lost due to delays. 
*/ - if (jiffies - t1 > 4) + if (time_after(jiffies, t1 + 4)) return 1; return 0; @@ -2080,7 +2079,7 @@ static struct irq_chip lapic_chip __read_mostly = { .eoi = ack_apic, }; -static void setup_nmi (void) +static void __init setup_nmi(void) { /* * Dirty trick to enable the NMI watchdog ... @@ -2093,7 +2092,7 @@ static void setup_nmi (void) */ apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ..."); - on_each_cpu(enable_NMI_through_LVT0, NULL, 1, 1); + enable_NMI_through_LVT0(); apic_printk(APIC_VERBOSE, " done.\n"); } @@ -2401,7 +2400,7 @@ static int ioapic_resume(struct sys_device *dev) } static struct sysdev_class ioapic_sysdev_class = { - set_kset_name("ioapic"), + .name = "ioapic", .suspend = ioapic_suspend, .resume = ioapic_resume, }; diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index cbac1670c7c3..1627c0d53e0b 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -32,9 +32,11 @@ #include <linux/msi.h> #include <linux/htirq.h> #include <linux/dmar.h> +#include <linux/jiffies.h> #ifdef CONFIG_ACPI #include <acpi/acpi_bus.h> #endif +#include <linux/bootmem.h> #include <asm/idle.h> #include <asm/io.h> @@ -1069,7 +1071,7 @@ void __apicdebuginit print_local_APIC(void * dummy) v = apic_read(APIC_LVR); printk(KERN_INFO "... APIC VERSION: %08x\n", v); ver = GET_APIC_VERSION(v); - maxlvt = get_maxlvt(); + maxlvt = lapic_get_maxlvt(); v = apic_read(APIC_TASKPRI); printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK); @@ -1171,7 +1173,7 @@ void __apicdebuginit print_PIC(void) #endif /* 0 */ -static void __init enable_IO_APIC(void) +void __init enable_IO_APIC(void) { union IO_APIC_reg_01 reg_01; int i8259_apic, i8259_pin; @@ -1298,7 +1300,7 @@ static int __init timer_irq_works(void) */ /* jiffies wrap? */ - if (jiffies - t1 > 4) + if (time_after(jiffies, t1 + 4)) return 1; return 0; } @@ -1411,7 +1413,7 @@ static void irq_complete_move(unsigned int irq) if (likely(!cfg->move_in_progress)) return; - vector = ~get_irq_regs()->orig_rax; + vector = ~get_irq_regs()->orig_ax; me = smp_processor_id(); if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) { cpumask_t cleanup_mask; @@ -1438,7 +1440,7 @@ static void ack_apic_level(unsigned int irq) int do_unmask_irq = 0; irq_complete_move(irq); -#if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE) +#ifdef CONFIG_GENERIC_PENDING_IRQ /* If we are moving the irq we need to mask it */ if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) { do_unmask_irq = 1; @@ -1565,7 +1567,7 @@ static struct hw_interrupt_type lapic_irq_type __read_mostly = { .end = end_lapic_irq, }; -static void setup_nmi (void) +static void __init setup_nmi(void) { /* * Dirty trick to enable the NMI watchdog ... @@ -1578,7 +1580,7 @@ static void setup_nmi (void) */ printk(KERN_INFO "activating NMI Watchdog ..."); - enable_NMI_through_LVT0(NULL); + enable_NMI_through_LVT0(); printk(" done.\n"); } @@ -1654,7 +1656,7 @@ static inline void unlock_ExtINT_logic(void) * * FIXME: really need to revamp this for modern platforms only. 
*/ -static inline void check_timer(void) +static inline void __init check_timer(void) { struct irq_cfg *cfg = irq_cfg + 0; int apic1, pin1, apic2, pin2; @@ -1788,7 +1790,10 @@ __setup("no_timer_check", notimercheck); void __init setup_IO_APIC(void) { - enable_IO_APIC(); + + /* + * calling enable_IO_APIC() is moved to setup_local_APIC for BP + */ if (acpi_ioapic) io_apic_irqs = ~0; /* all IRQs go through IOAPIC */ @@ -1850,7 +1855,7 @@ static int ioapic_resume(struct sys_device *dev) } static struct sysdev_class ioapic_sysdev_class = { - set_kset_name("ioapic"), + .name = "ioapic", .suspend = ioapic_suspend, .resume = ioapic_resume, }; @@ -2288,3 +2293,92 @@ void __init setup_ioapic_dest(void) } #endif +#define IOAPIC_RESOURCE_NAME_SIZE 11 + +static struct resource *ioapic_resources; + +static struct resource * __init ioapic_setup_resources(void) +{ + unsigned long n; + struct resource *res; + char *mem; + int i; + + if (nr_ioapics <= 0) + return NULL; + + n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource); + n *= nr_ioapics; + + mem = alloc_bootmem(n); + res = (void *)mem; + + if (mem != NULL) { + memset(mem, 0, n); + mem += sizeof(struct resource) * nr_ioapics; + + for (i = 0; i < nr_ioapics; i++) { + res[i].name = mem; + res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; + sprintf(mem, "IOAPIC %u", i); + mem += IOAPIC_RESOURCE_NAME_SIZE; + } + } + + ioapic_resources = res; + + return res; +} + +void __init ioapic_init_mappings(void) +{ + unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; + struct resource *ioapic_res; + int i; + + ioapic_res = ioapic_setup_resources(); + for (i = 0; i < nr_ioapics; i++) { + if (smp_found_config) { + ioapic_phys = mp_ioapics[i].mpc_apicaddr; + } else { + ioapic_phys = (unsigned long) + alloc_bootmem_pages(PAGE_SIZE); + ioapic_phys = __pa(ioapic_phys); + } + set_fixmap_nocache(idx, ioapic_phys); + apic_printk(APIC_VERBOSE, + "mapped IOAPIC to %016lx (%016lx)\n", + __fix_to_virt(idx), ioapic_phys); + idx++; + + if (ioapic_res != NULL) { + ioapic_res->start = ioapic_phys; + ioapic_res->end = ioapic_phys + (4 * 1024) - 1; + ioapic_res++; + } + } +} + +static int __init ioapic_insert_resources(void) +{ + int i; + struct resource *r = ioapic_resources; + + if (!r) { + printk(KERN_ERR + "IO APIC resources could be not be allocated.\n"); + return -1; + } + + for (i = 0; i < nr_ioapics; i++) { + insert_resource(&iomem_resource, r); + r++; + } + + return 0; +} + +/* Insert the IO APIC resources after PCI initialization has occured to handle + * IO APICS that are mapped in on a BAR in PCI space. */ +late_initcall(ioapic_insert_resources); + diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c new file mode 100644 index 000000000000..bd49321034db --- /dev/null +++ b/arch/x86/kernel/io_delay.c @@ -0,0 +1,114 @@ +/* + * I/O delay strategies for inb_p/outb_p + * + * Allow for a DMI based override of port 0x80, needed for certain HP laptops + * and possibly other systems. Also allow for the gradual elimination of + * outb_p/inb_p API uses. + */ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/delay.h> +#include <linux/dmi.h> +#include <asm/io.h> + +int io_delay_type __read_mostly = CONFIG_DEFAULT_IO_DELAY_TYPE; +EXPORT_SYMBOL_GPL(io_delay_type); + +static int __initdata io_delay_override; + +/* + * Paravirt wants native_io_delay to be a constant. 
+ */ +void native_io_delay(void) +{ + switch (io_delay_type) { + default: + case CONFIG_IO_DELAY_TYPE_0X80: + asm volatile ("outb %al, $0x80"); + break; + case CONFIG_IO_DELAY_TYPE_0XED: + asm volatile ("outb %al, $0xed"); + break; + case CONFIG_IO_DELAY_TYPE_UDELAY: + /* + * 2 usecs is an upper-bound for the outb delay but + * note that udelay doesn't have the bus-level + * side-effects that outb does, nor does udelay() have + * precise timings during very early bootup (the delays + * are shorter until calibrated): + */ + udelay(2); + case CONFIG_IO_DELAY_TYPE_NONE: + break; + } +} +EXPORT_SYMBOL(native_io_delay); + +static int __init dmi_io_delay_0xed_port(const struct dmi_system_id *id) +{ + if (io_delay_type == CONFIG_IO_DELAY_TYPE_0X80) { + printk(KERN_NOTICE "%s: using 0xed I/O delay port\n", + id->ident); + io_delay_type = CONFIG_IO_DELAY_TYPE_0XED; + } + + return 0; +} + +/* + * Quirk table for systems that misbehave (lock up, etc.) if port + * 0x80 is used: + */ +static struct dmi_system_id __initdata io_delay_0xed_port_dmi_table[] = { + { + .callback = dmi_io_delay_0xed_port, + .ident = "Compaq Presario V6000", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), + DMI_MATCH(DMI_BOARD_NAME, "30B7") + } + }, + { + .callback = dmi_io_delay_0xed_port, + .ident = "HP Pavilion dv9000z", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), + DMI_MATCH(DMI_BOARD_NAME, "30B9") + } + }, + { + .callback = dmi_io_delay_0xed_port, + .ident = "HP Pavilion tx1000", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), + DMI_MATCH(DMI_BOARD_NAME, "30BF") + } + }, + { } +}; + +void __init io_delay_init(void) +{ + if (!io_delay_override) + dmi_check_system(io_delay_0xed_port_dmi_table); +} + +static int __init io_delay_param(char *s) +{ + if (!strcmp(s, "0x80")) + io_delay_type = CONFIG_IO_DELAY_TYPE_0X80; + else if (!strcmp(s, "0xed")) + io_delay_type = CONFIG_IO_DELAY_TYPE_0XED; + else if (!strcmp(s, "udelay")) + io_delay_type = CONFIG_IO_DELAY_TYPE_UDELAY; + else if (!strcmp(s, "none")) + io_delay_type = CONFIG_IO_DELAY_TYPE_NONE; + else + return -EINVAL; + + io_delay_override = 1; + return 0; +} + +early_param("io_delay", io_delay_param); diff --git a/arch/x86/kernel/ioport_32.c b/arch/x86/kernel/ioport.c index 4ed48dc8df1e..50e5e4a31c85 100644 --- a/arch/x86/kernel/ioport_32.c +++ b/arch/x86/kernel/ioport.c @@ -1,6 +1,6 @@ /* * This contains the io-permission bitmap code - written by obz, with changes - * by Linus. + * by Linus. 32/64 bits code unification by Miguel Botón. */ #include <linux/sched.h> @@ -16,49 +16,27 @@ #include <linux/syscalls.h> /* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ -static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value) +static void set_bitmap(unsigned long *bitmap, unsigned int base, + unsigned int extent, int new_value) { - unsigned long mask; - unsigned long *bitmap_base = bitmap + (base / BITS_PER_LONG); - unsigned int low_index = base & (BITS_PER_LONG-1); - int length = low_index + extent; - - if (low_index != 0) { - mask = (~0UL << low_index); - if (length < BITS_PER_LONG) - mask &= ~(~0UL << length); - if (new_value) - *bitmap_base++ |= mask; - else - *bitmap_base++ &= ~mask; - length -= BITS_PER_LONG; - } - - mask = (new_value ? 
~0UL : 0UL); - while (length >= BITS_PER_LONG) { - *bitmap_base++ = mask; - length -= BITS_PER_LONG; - } + unsigned int i; - if (length > 0) { - mask = ~(~0UL << length); + for (i = base; i < base + extent; i++) { if (new_value) - *bitmap_base++ |= mask; + __set_bit(i, bitmap); else - *bitmap_base++ &= ~mask; + __clear_bit(i, bitmap); } } - /* * this changes the io permissions bitmap in the current task. */ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) { - unsigned long i, max_long, bytes, bytes_updated; struct thread_struct * t = ¤t->thread; struct tss_struct * tss; - unsigned long *bitmap; + unsigned int i, max_long, bytes, bytes_updated; if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) return -EINVAL; @@ -71,7 +49,8 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) * this is why we delay this operation until now: */ if (!t->io_bitmap_ptr) { - bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); + unsigned long *bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); + if (!bitmap) return -ENOMEM; @@ -100,11 +79,12 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) if (t->io_bitmap_ptr[i] != ~0UL) max_long = i; - bytes = (max_long + 1) * sizeof(long); + bytes = (max_long + 1) * sizeof(unsigned long); bytes_updated = max(bytes, t->io_bitmap_max); t->io_bitmap_max = bytes; +#ifdef CONFIG_X86_32 /* * Sets the lazy trigger so that the next I/O operation will * reload the correct bitmap. @@ -113,6 +93,10 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) */ tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY; tss->io_bitmap_owner = NULL; +#else + /* Update the TSS: */ + memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated); +#endif put_cpu(); @@ -124,18 +108,14 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) * beyond the 0x3ff range: to get the full 65536 ports bitmapped * you'd need 8kB of bitmaps/process, which is a bit excessive. * - * Here we just change the eflags value on the stack: we allow + * Here we just change the flags value on the stack: we allow * only the super-user to do it. This depends on the stack-layout * on system-call entry - see also fork() and the signal handling * code. 
*/ - -asmlinkage long sys_iopl(unsigned long unused) +static int do_iopl(unsigned int level, struct pt_regs *regs) { - volatile struct pt_regs * regs = (struct pt_regs *) &unused; - unsigned int level = regs->ebx; - unsigned int old = (regs->eflags >> 12) & 3; - struct thread_struct *t = ¤t->thread; + unsigned int old = (regs->flags >> 12) & 3; if (level > 3) return -EINVAL; @@ -144,8 +124,31 @@ asmlinkage long sys_iopl(unsigned long unused) if (!capable(CAP_SYS_RAWIO)) return -EPERM; } + regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) | (level << 12); + + return 0; +} + +#ifdef CONFIG_X86_32 +asmlinkage long sys_iopl(unsigned long regsp) +{ + struct pt_regs *regs = (struct pt_regs *)®sp; + unsigned int level = regs->bx; + struct thread_struct *t = ¤t->thread; + int rc; + + rc = do_iopl(level, regs); + if (rc < 0) + goto out; + t->iopl = level << 12; - regs->eflags = (regs->eflags & ~X86_EFLAGS_IOPL) | t->iopl; set_iopl_mask(t->iopl); - return 0; +out: + return rc; +} +#else +asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs) +{ + return do_iopl(level, regs); } +#endif diff --git a/arch/x86/kernel/ioport_64.c b/arch/x86/kernel/ioport_64.c deleted file mode 100644 index 5f62fad64dab..000000000000 --- a/arch/x86/kernel/ioport_64.c +++ /dev/null @@ -1,117 +0,0 @@ -/* - * This contains the io-permission bitmap code - written by obz, with changes - * by Linus. - */ - -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/capability.h> -#include <linux/errno.h> -#include <linux/types.h> -#include <linux/ioport.h> -#include <linux/smp.h> -#include <linux/stddef.h> -#include <linux/slab.h> -#include <linux/thread_info.h> -#include <linux/syscalls.h> - -/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ -static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value) -{ - int i; - if (new_value) - for (i = base; i < base + extent; i++) - __set_bit(i, bitmap); - else - for (i = base; i < base + extent; i++) - clear_bit(i, bitmap); -} - -/* - * this changes the io permissions bitmap in the current task. - */ -asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) -{ - unsigned int i, max_long, bytes, bytes_updated; - struct thread_struct * t = ¤t->thread; - struct tss_struct * tss; - unsigned long *bitmap; - - if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) - return -EINVAL; - if (turn_on && !capable(CAP_SYS_RAWIO)) - return -EPERM; - - /* - * If it's the first ioperm() call in this thread's lifetime, set the - * IO bitmap up. ioperm() is much less timing critical than clone(), - * this is why we delay this operation until now: - */ - if (!t->io_bitmap_ptr) { - bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); - if (!bitmap) - return -ENOMEM; - - memset(bitmap, 0xff, IO_BITMAP_BYTES); - t->io_bitmap_ptr = bitmap; - set_thread_flag(TIF_IO_BITMAP); - } - - /* - * do it in the per-thread copy and in the TSS ... - * - * Disable preemption via get_cpu() - we must not switch away - * because the ->io_bitmap_max value must match the bitmap - * contents: - */ - tss = &per_cpu(init_tss, get_cpu()); - - set_bitmap(t->io_bitmap_ptr, from, num, !turn_on); - - /* - * Search for a (possibly new) maximum. 
This is simple and stupid, - * to keep it obviously correct: - */ - max_long = 0; - for (i = 0; i < IO_BITMAP_LONGS; i++) - if (t->io_bitmap_ptr[i] != ~0UL) - max_long = i; - - bytes = (max_long + 1) * sizeof(long); - bytes_updated = max(bytes, t->io_bitmap_max); - - t->io_bitmap_max = bytes; - - /* Update the TSS: */ - memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated); - - put_cpu(); - - return 0; -} - -/* - * sys_iopl has to be used when you want to access the IO ports - * beyond the 0x3ff range: to get the full 65536 ports bitmapped - * you'd need 8kB of bitmaps/process, which is a bit excessive. - * - * Here we just change the eflags value on the stack: we allow - * only the super-user to do it. This depends on the stack-layout - * on system-call entry - see also fork() and the signal handling - * code. - */ - -asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs) -{ - unsigned int old = (regs->eflags >> 12) & 3; - - if (level > 3) - return -EINVAL; - /* Trying to gain more privileges? */ - if (level > old) { - if (!capable(CAP_SYS_RAWIO)) - return -EPERM; - } - regs->eflags = (regs->eflags &~ X86_EFLAGS_IOPL) | (level << 12); - return 0; -} diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index d3fde94f7345..cef054b09d27 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -66,11 +66,11 @@ static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly; * SMP cross-CPU interrupts have their own specific * handlers). */ -fastcall unsigned int do_IRQ(struct pt_regs *regs) +unsigned int do_IRQ(struct pt_regs *regs) { struct pt_regs *old_regs; /* high bit used in ret_from_ code */ - int irq = ~regs->orig_eax; + int irq = ~regs->orig_ax; struct irq_desc *desc = irq_desc + irq; #ifdef CONFIG_4KSTACKS union irq_ctx *curctx, *irqctx; @@ -88,13 +88,13 @@ fastcall unsigned int do_IRQ(struct pt_regs *regs) #ifdef CONFIG_DEBUG_STACKOVERFLOW /* Debugging check for stack overflow: is there less than 1KB free? */ { - long esp; + long sp; __asm__ __volatile__("andl %%esp,%0" : - "=r" (esp) : "0" (THREAD_SIZE - 1)); - if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) { + "=r" (sp) : "0" (THREAD_SIZE - 1)); + if (unlikely(sp < (sizeof(struct thread_info) + STACK_WARN))) { printk("do_IRQ: stack overflow: %ld\n", - esp - sizeof(struct thread_info)); + sp - sizeof(struct thread_info)); dump_stack(); } } @@ -112,7 +112,7 @@ fastcall unsigned int do_IRQ(struct pt_regs *regs) * current stack (which is the irq stack already after all) */ if (curctx != irqctx) { - int arg1, arg2, ebx; + int arg1, arg2, bx; /* build the stack frame on the IRQ stack */ isp = (u32*) ((char*)irqctx + sizeof(*irqctx)); @@ -128,10 +128,10 @@ fastcall unsigned int do_IRQ(struct pt_regs *regs) (curctx->tinfo.preempt_count & SOFTIRQ_MASK); asm volatile( - " xchgl %%ebx,%%esp \n" - " call *%%edi \n" - " movl %%ebx,%%esp \n" - : "=a" (arg1), "=d" (arg2), "=b" (ebx) + " xchgl %%ebx,%%esp \n" + " call *%%edi \n" + " movl %%ebx,%%esp \n" + : "=a" (arg1), "=d" (arg2), "=b" (bx) : "0" (irq), "1" (desc), "2" (isp), "D" (desc->handle_irq) : "memory", "cc" diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 6b5c730d67b9..3aac15466a91 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -20,6 +20,26 @@ atomic_t irq_err_count; +/* + * 'what should we do if we get a hw irq event on an illegal vector'. + * each architecture has to answer this themselves. 
+ */ +void ack_bad_irq(unsigned int irq) +{ + printk(KERN_WARNING "unexpected IRQ trap at vector %02x\n", irq); + /* + * Currently unexpected vectors happen only on SMP and APIC. + * We _must_ ack these because every local APIC has only N + * irq slots per priority level, and a 'hanging, unacked' IRQ + * holds up an irq slot - in excessive cases (when multiple + * unexpected vectors occur) that might lock up the APIC + * completely. + * But don't ack when the APIC is disabled. -AK + */ + if (!disable_apic) + ack_APIC_irq(); +} + #ifdef CONFIG_DEBUG_STACKOVERFLOW /* * Probabilistic stack overflow check: @@ -33,11 +53,11 @@ static inline void stack_overflow_check(struct pt_regs *regs) u64 curbase = (u64)task_stack_page(current); static unsigned long warned = -60*HZ; - if (regs->rsp >= curbase && regs->rsp <= curbase + THREAD_SIZE && - regs->rsp < curbase + sizeof(struct thread_info) + 128 && + if (regs->sp >= curbase && regs->sp <= curbase + THREAD_SIZE && + regs->sp < curbase + sizeof(struct thread_info) + 128 && time_after(jiffies, warned + 60*HZ)) { - printk("do_IRQ: %s near stack overflow (cur:%Lx,rsp:%lx)\n", - current->comm, curbase, regs->rsp); + printk("do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n", + current->comm, curbase, regs->sp); show_stack(NULL,NULL); warned = jiffies; } @@ -142,7 +162,7 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs) struct pt_regs *old_regs = set_irq_regs(regs); /* high bit used in ret_from_ code */ - unsigned vector = ~regs->orig_rax; + unsigned vector = ~regs->orig_ax; unsigned irq; exit_idle(); diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c new file mode 100644 index 000000000000..73354302fda7 --- /dev/null +++ b/arch/x86/kernel/kdebugfs.c @@ -0,0 +1,65 @@ +/* + * Architecture specific debugfs files + * + * Copyright (C) 2007, Intel Corp. + * Huang Ying <ying.huang@intel.com> + * + * This file is released under the GPLv2. + */ + +#include <linux/debugfs.h> +#include <linux/stat.h> +#include <linux/init.h> + +#include <asm/setup.h> + +#ifdef CONFIG_DEBUG_BOOT_PARAMS +static struct debugfs_blob_wrapper boot_params_blob = { + .data = &boot_params, + .size = sizeof(boot_params), +}; + +static int __init boot_params_kdebugfs_init(void) +{ + int error; + struct dentry *dbp, *version, *data; + + dbp = debugfs_create_dir("boot_params", NULL); + if (!dbp) { + error = -ENOMEM; + goto err_return; + } + version = debugfs_create_x16("version", S_IRUGO, dbp, + &boot_params.hdr.version); + if (!version) { + error = -ENOMEM; + goto err_dir; + } + data = debugfs_create_blob("data", S_IRUGO, dbp, + &boot_params_blob); + if (!data) { + error = -ENOMEM; + goto err_version; + } + return 0; +err_version: + debugfs_remove(version); +err_dir: + debugfs_remove(dbp); +err_return: + return error; +} +#endif + +static int __init arch_kdebugfs_init(void) +{ + int error = 0; + +#ifdef CONFIG_DEBUG_BOOT_PARAMS + error = boot_params_kdebugfs_init(); +#endif + + return error; +} + +arch_initcall(arch_kdebugfs_init); diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c new file mode 100644 index 000000000000..a99e764fd66a --- /dev/null +++ b/arch/x86/kernel/kprobes.c @@ -0,0 +1,1066 @@ +/* + * Kernel Probes (KProbes) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2002, 2004 + * + * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel + * Probes initial implementation ( includes contributions from + * Rusty Russell). + * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes + * interface to access function arguments. + * 2004-Oct Jim Keniston <jkenisto@us.ibm.com> and Prasanna S Panchamukhi + * <prasanna@in.ibm.com> adapted for x86_64 from i386. + * 2005-Mar Roland McGrath <roland@redhat.com> + * Fixed to handle %rip-relative addressing mode correctly. + * 2005-May Hien Nguyen <hien@us.ibm.com>, Jim Keniston + * <jkenisto@us.ibm.com> and Prasanna S Panchamukhi + * <prasanna@in.ibm.com> added function-return probes. + * 2005-May Rusty Lynch <rusty.lynch@intel.com> + * Added function return probes functionality + * 2006-Feb Masami Hiramatsu <hiramatu@sdl.hitachi.co.jp> added + * kprobe-booster and kretprobe-booster for i386. + * 2007-Dec Masami Hiramatsu <mhiramat@redhat.com> added kprobe-booster + * and kretprobe-booster for x86-64 + * 2007-Dec Masami Hiramatsu <mhiramat@redhat.com>, Arjan van de Ven + * <arjan@infradead.org> and Jim Keniston <jkenisto@us.ibm.com> + * unified x86 kprobes code. + */ + +#include <linux/kprobes.h> +#include <linux/ptrace.h> +#include <linux/string.h> +#include <linux/slab.h> +#include <linux/hardirq.h> +#include <linux/preempt.h> +#include <linux/module.h> +#include <linux/kdebug.h> + +#include <asm/cacheflush.h> +#include <asm/desc.h> +#include <asm/pgtable.h> +#include <asm/uaccess.h> +#include <asm/alternative.h> + +void jprobe_return_end(void); + +DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; +DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); + +#ifdef CONFIG_X86_64 +#define stack_addr(regs) ((unsigned long *)regs->sp) +#else +/* + * "&regs->sp" looks wrong, but it's correct for x86_32. x86_32 CPUs + * don't save the ss and esp registers if the CPU is already in kernel + * mode when it traps. So for kprobes, regs->sp and regs->ss are not + * the [nonexistent] saved stack pointer and ss register, but rather + * the top 8 bytes of the pre-int3 stack. So &regs->sp happens to + * point to the top of the pre-int3 stack. + */ +#define stack_addr(regs) ((unsigned long *)&regs->sp) +#endif + +#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\ + (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ + (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \ + (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \ + (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \ + << (row % 32)) + /* + * Undefined/reserved opcodes, conditional jump, Opcode Extension + * Groups, and some special opcodes can not boost.
+ */ +static const u32 twobyte_is_boostable[256 / 32] = { + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* ---------------------------------------------- */ + W(0x00, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0) | /* 00 */ + W(0x10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 10 */ + W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 20 */ + W(0x30, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */ + W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */ + W(0x50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 50 */ + W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1) | /* 60 */ + W(0x70, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) , /* 70 */ + W(0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 80 */ + W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */ + W(0xa0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) | /* a0 */ + W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1) , /* b0 */ + W(0xc0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */ + W(0xd0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) , /* d0 */ + W(0xe0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) | /* e0 */ + W(0xf0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0) /* f0 */ + /* ----------------------------------------------- */ + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ +}; +static const u32 onebyte_has_modrm[256 / 32] = { + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* ----------------------------------------------- */ + W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 00 */ + W(0x10, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 10 */ + W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 20 */ + W(0x30, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 30 */ + W(0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 40 */ + W(0x50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 50 */ + W(0x60, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0) | /* 60 */ + W(0x70, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 70 */ + W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */ + W(0x90, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 90 */ + W(0xa0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* a0 */ + W(0xb0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* b0 */ + W(0xc0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* c0 */ + W(0xd0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ + W(0xe0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* e0 */ + W(0xf0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) /* f0 */ + /* ----------------------------------------------- */ + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ +}; +static const u32 twobyte_has_modrm[256 / 32] = { + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* ----------------------------------------------- */ + W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1) | /* 0f */ + W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0) , /* 1f */ + W(0x20, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 2f */ + W(0x30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 3f */ + W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 4f */ + W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 5f */ + W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 6f */ + W(0x70, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1) , /* 7f */ + W(0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 8f */ + W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 9f */ + W(0xa0, 0, 0, 
0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1) | /* af */ + W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1) , /* bf */ + W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* cf */ + W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* df */ + W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* ef */ + W(0xf0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0) /* ff */ + /* ----------------------------------------------- */ + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ +}; +#undef W + +struct kretprobe_blackpoint kretprobe_blacklist[] = { + {"__switch_to", }, /* This function switches only current task, but + doesn't switch kernel stack.*/ + {NULL, NULL} /* Terminator */ +}; +const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist); + +/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/ +static void __kprobes set_jmp_op(void *from, void *to) +{ + struct __arch_jmp_op { + char op; + s32 raddr; + } __attribute__((packed)) * jop; + jop = (struct __arch_jmp_op *)from; + jop->raddr = (s32)((long)(to) - ((long)(from) + 5)); + jop->op = RELATIVEJUMP_INSTRUCTION; +} + +/* + * Check for the REX prefix which can only exist on X86_64 + * X86_32 always returns 0 + */ +static int __kprobes is_REX_prefix(kprobe_opcode_t *insn) +{ +#ifdef CONFIG_X86_64 + if ((*insn & 0xf0) == 0x40) + return 1; +#endif + return 0; +} + +/* + * Returns non-zero if opcode is boostable. + * RIP relative instructions are adjusted at copying time in 64 bits mode + */ +static int __kprobes can_boost(kprobe_opcode_t *opcodes) +{ + kprobe_opcode_t opcode; + kprobe_opcode_t *orig_opcodes = opcodes; + +retry: + if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1) + return 0; + opcode = *(opcodes++); + + /* 2nd-byte opcode */ + if (opcode == 0x0f) { + if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1) + return 0; + return test_bit(*opcodes, + (unsigned long *)twobyte_is_boostable); + } + + switch (opcode & 0xf0) { +#ifdef CONFIG_X86_64 + case 0x40: + goto retry; /* REX prefix is boostable */ +#endif + case 0x60: + if (0x63 < opcode && opcode < 0x67) + goto retry; /* prefixes */ + /* can't boost Address-size override and bound */ + return (opcode != 0x62 && opcode != 0x67); + case 0x70: + return 0; /* can't boost conditional jump */ + case 0xc0: + /* can't boost software-interruptions */ + return (0xc1 < opcode && opcode < 0xcc) || opcode == 0xcf; + case 0xd0: + /* can boost AA* and XLAT */ + return (opcode == 0xd4 || opcode == 0xd5 || opcode == 0xd7); + case 0xe0: + /* can boost in/out and absolute jmps */ + return ((opcode & 0x04) || opcode == 0xea); + case 0xf0: + if ((opcode & 0x0c) == 0 && opcode != 0xf1) + goto retry; /* lock/rep(ne) prefix */ + /* clear and set flags are boostable */ + return (opcode == 0xf5 || (0xf7 < opcode && opcode < 0xfe)); + default: + /* segment override prefixes are boostable */ + if (opcode == 0x26 || opcode == 0x36 || opcode == 0x3e) + goto retry; /* prefixes */ + /* CS override prefix and call are not boostable */ + return (opcode != 0x2e && opcode != 0x9a); + } +} + +/* + * Returns non-zero if opcode modifies the interrupt flag. + */ +static int __kprobes is_IF_modifier(kprobe_opcode_t *insn) +{ + switch (*insn) { + case 0xfa: /* cli */ + case 0xfb: /* sti */ + case 0xcf: /* iret/iretd */ + case 0x9d: /* popf/popfd */ + return 1; + } + + /* + * on X86_64, 0x40-0x4f are REX prefixes so we need to look + * at the next byte instead.. 
but of course not recurse infinitely + */ + if (is_REX_prefix(insn)) + return is_IF_modifier(++insn); + + return 0; +} + +/* + * Adjust the displacement if the instruction uses the %rip-relative + * addressing mode. + * If it does, Return the address of the 32-bit displacement word. + * If not, return null. + * Only applicable to 64-bit x86. + */ +static void __kprobes fix_riprel(struct kprobe *p) +{ +#ifdef CONFIG_X86_64 + u8 *insn = p->ainsn.insn; + s64 disp; + int need_modrm; + + /* Skip legacy instruction prefixes. */ + while (1) { + switch (*insn) { + case 0x66: + case 0x67: + case 0x2e: + case 0x3e: + case 0x26: + case 0x64: + case 0x65: + case 0x36: + case 0xf0: + case 0xf3: + case 0xf2: + ++insn; + continue; + } + break; + } + + /* Skip REX instruction prefix. */ + if (is_REX_prefix(insn)) + ++insn; + + if (*insn == 0x0f) { + /* Two-byte opcode. */ + ++insn; + need_modrm = test_bit(*insn, + (unsigned long *)twobyte_has_modrm); + } else + /* One-byte opcode. */ + need_modrm = test_bit(*insn, + (unsigned long *)onebyte_has_modrm); + + if (need_modrm) { + u8 modrm = *++insn; + if ((modrm & 0xc7) == 0x05) { + /* %rip+disp32 addressing mode */ + /* Displacement follows ModRM byte. */ + ++insn; + /* + * The copied instruction uses the %rip-relative + * addressing mode. Adjust the displacement for the + * difference between the original location of this + * instruction and the location of the copy that will + * actually be run. The tricky bit here is making sure + * that the sign extension happens correctly in this + * calculation, since we need a signed 32-bit result to + * be sign-extended to 64 bits when it's added to the + * %rip value and yield the same 64-bit result that the + * sign-extension of the original signed 32-bit + * displacement would have given. + */ + disp = (u8 *) p->addr + *((s32 *) insn) - + (u8 *) p->ainsn.insn; + BUG_ON((s64) (s32) disp != disp); /* Sanity check. */ + *(s32 *)insn = (s32) disp; + } + } +#endif +} + +static void __kprobes arch_copy_kprobe(struct kprobe *p) +{ + memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); + + fix_riprel(p); + + if (can_boost(p->addr)) + p->ainsn.boostable = 0; + else + p->ainsn.boostable = -1; + + p->opcode = *p->addr; +} + +int __kprobes arch_prepare_kprobe(struct kprobe *p) +{ + /* insn: must be on special executable page on x86. 
*/ + p->ainsn.insn = get_insn_slot(); + if (!p->ainsn.insn) + return -ENOMEM; + arch_copy_kprobe(p); + return 0; +} + +void __kprobes arch_arm_kprobe(struct kprobe *p) +{ + text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1); +} + +void __kprobes arch_disarm_kprobe(struct kprobe *p) +{ + text_poke(p->addr, &p->opcode, 1); +} + +void __kprobes arch_remove_kprobe(struct kprobe *p) +{ + mutex_lock(&kprobe_mutex); + free_insn_slot(p->ainsn.insn, (p->ainsn.boostable == 1)); + mutex_unlock(&kprobe_mutex); +} + +static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb) +{ + kcb->prev_kprobe.kp = kprobe_running(); + kcb->prev_kprobe.status = kcb->kprobe_status; + kcb->prev_kprobe.old_flags = kcb->kprobe_old_flags; + kcb->prev_kprobe.saved_flags = kcb->kprobe_saved_flags; +} + +static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb) +{ + __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp; + kcb->kprobe_status = kcb->prev_kprobe.status; + kcb->kprobe_old_flags = kcb->prev_kprobe.old_flags; + kcb->kprobe_saved_flags = kcb->prev_kprobe.saved_flags; +} + +static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs, + struct kprobe_ctlblk *kcb) +{ + __get_cpu_var(current_kprobe) = p; + kcb->kprobe_saved_flags = kcb->kprobe_old_flags + = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF)); + if (is_IF_modifier(p->ainsn.insn)) + kcb->kprobe_saved_flags &= ~X86_EFLAGS_IF; +} + +static void __kprobes clear_btf(void) +{ + if (test_thread_flag(TIF_DEBUGCTLMSR)) + wrmsrl(MSR_IA32_DEBUGCTLMSR, 0); +} + +static void __kprobes restore_btf(void) +{ + if (test_thread_flag(TIF_DEBUGCTLMSR)) + wrmsrl(MSR_IA32_DEBUGCTLMSR, current->thread.debugctlmsr); +} + +static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs) +{ + clear_btf(); + regs->flags |= X86_EFLAGS_TF; + regs->flags &= ~X86_EFLAGS_IF; + /* single step inline if the instruction is an int3 */ + if (p->opcode == BREAKPOINT_INSTRUCTION) + regs->ip = (unsigned long)p->addr; + else + regs->ip = (unsigned long)p->ainsn.insn; +} + +/* Called with kretprobe_lock held */ +void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, + struct pt_regs *regs) +{ + unsigned long *sara = stack_addr(regs); + + ri->ret_addr = (kprobe_opcode_t *) *sara; + + /* Replace the return addr with trampoline addr */ + *sara = (unsigned long) &kretprobe_trampoline; +} + +static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs, + struct kprobe_ctlblk *kcb) +{ +#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PM) + if (p->ainsn.boostable == 1 && !p->post_handler) { + /* Boost up -- we can execute copied instructions directly */ + reset_current_kprobe(); + regs->ip = (unsigned long)p->ainsn.insn; + preempt_enable_no_resched(); + return; + } +#endif + prepare_singlestep(p, regs); + kcb->kprobe_status = KPROBE_HIT_SS; +} + +/* + * We have reentered the kprobe_handler(), since another probe was hit while + * within the handler. We save the original kprobes variables and just single + * step on the instruction of the new probe without calling any user handlers. + */ +static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs, + struct kprobe_ctlblk *kcb) +{ + switch (kcb->kprobe_status) { + case KPROBE_HIT_SSDONE: +#ifdef CONFIG_X86_64 + /* TODO: Provide re-entrancy from post_kprobes_handler() and + * avoid exception stack corruption while single-stepping on + * the instruction of the new probe. 
+ */ + arch_disarm_kprobe(p); + regs->ip = (unsigned long)p->addr; + reset_current_kprobe(); + preempt_enable_no_resched(); + break; +#endif + case KPROBE_HIT_ACTIVE: + save_previous_kprobe(kcb); + set_current_kprobe(p, regs, kcb); + kprobes_inc_nmissed_count(p); + prepare_singlestep(p, regs); + kcb->kprobe_status = KPROBE_REENTER; + break; + case KPROBE_HIT_SS: + if (p == kprobe_running()) { + regs->flags &= ~TF_MASK; + regs->flags |= kcb->kprobe_saved_flags; + return 0; + } else { + /* A probe has been hit in the codepath leading up + * to, or just after, single-stepping of a probed + * instruction. This entire codepath should strictly + * reside in .kprobes.text section. Raise a warning + * to highlight this peculiar case. + */ + } + default: + /* impossible cases */ + WARN_ON(1); + return 0; + } + + return 1; +} + +/* + * Interrupts are disabled on entry as trap3 is an interrupt gate and they + * remain disabled thorough out this function. + */ +static int __kprobes kprobe_handler(struct pt_regs *regs) +{ + kprobe_opcode_t *addr; + struct kprobe *p; + struct kprobe_ctlblk *kcb; + + addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t)); + if (*addr != BREAKPOINT_INSTRUCTION) { + /* + * The breakpoint instruction was removed right + * after we hit it. Another cpu has removed + * either a probepoint or a debugger breakpoint + * at this address. In either case, no further + * handling of this interrupt is appropriate. + * Back up over the (now missing) int3 and run + * the original instruction. + */ + regs->ip = (unsigned long)addr; + return 1; + } + + /* + * We don't want to be preempted for the entire + * duration of kprobe processing. We conditionally + * re-enable preemption at the end of this function, + * and also in reenter_kprobe() and setup_singlestep(). + */ + preempt_disable(); + + kcb = get_kprobe_ctlblk(); + p = get_kprobe(addr); + + if (p) { + if (kprobe_running()) { + if (reenter_kprobe(p, regs, kcb)) + return 1; + } else { + set_current_kprobe(p, regs, kcb); + kcb->kprobe_status = KPROBE_HIT_ACTIVE; + + /* + * If we have no pre-handler or it returned 0, we + * continue with normal processing. If we have a + * pre-handler and it returned non-zero, it prepped + * for calling the break_handler below on re-entry + * for jprobe processing, so get out doing nothing + * more here. + */ + if (!p->pre_handler || !p->pre_handler(p, regs)) + setup_singlestep(p, regs, kcb); + return 1; + } + } else if (kprobe_running()) { + p = __get_cpu_var(current_kprobe); + if (p->break_handler && p->break_handler(p, regs)) { + setup_singlestep(p, regs, kcb); + return 1; + } + } /* else: not a kprobe fault; let the kernel handle it */ + + preempt_enable_no_resched(); + return 0; +} + +/* + * When a retprobed function returns, this code saves registers and + * calls trampoline_handler() runs, which calls the kretprobe's handler. + */ +void __kprobes kretprobe_trampoline_holder(void) +{ + asm volatile ( + ".global kretprobe_trampoline\n" + "kretprobe_trampoline: \n" +#ifdef CONFIG_X86_64 + /* We don't bother saving the ss register */ + " pushq %rsp\n" + " pushfq\n" + /* + * Skip cs, ip, orig_ax. 
+ * trampoline_handler() will plug in these values + */ + " subq $24, %rsp\n" + " pushq %rdi\n" + " pushq %rsi\n" + " pushq %rdx\n" + " pushq %rcx\n" + " pushq %rax\n" + " pushq %r8\n" + " pushq %r9\n" + " pushq %r10\n" + " pushq %r11\n" + " pushq %rbx\n" + " pushq %rbp\n" + " pushq %r12\n" + " pushq %r13\n" + " pushq %r14\n" + " pushq %r15\n" + " movq %rsp, %rdi\n" + " call trampoline_handler\n" + /* Replace saved sp with true return address. */ + " movq %rax, 152(%rsp)\n" + " popq %r15\n" + " popq %r14\n" + " popq %r13\n" + " popq %r12\n" + " popq %rbp\n" + " popq %rbx\n" + " popq %r11\n" + " popq %r10\n" + " popq %r9\n" + " popq %r8\n" + " popq %rax\n" + " popq %rcx\n" + " popq %rdx\n" + " popq %rsi\n" + " popq %rdi\n" + /* Skip orig_ax, ip, cs */ + " addq $24, %rsp\n" + " popfq\n" +#else + " pushf\n" + /* + * Skip cs, ip, orig_ax. + * trampoline_handler() will plug in these values + */ + " subl $12, %esp\n" + " pushl %fs\n" + " pushl %ds\n" + " pushl %es\n" + " pushl %eax\n" + " pushl %ebp\n" + " pushl %edi\n" + " pushl %esi\n" + " pushl %edx\n" + " pushl %ecx\n" + " pushl %ebx\n" + " movl %esp, %eax\n" + " call trampoline_handler\n" + /* Move flags to cs */ + " movl 52(%esp), %edx\n" + " movl %edx, 48(%esp)\n" + /* Replace saved flags with true return address. */ + " movl %eax, 52(%esp)\n" + " popl %ebx\n" + " popl %ecx\n" + " popl %edx\n" + " popl %esi\n" + " popl %edi\n" + " popl %ebp\n" + " popl %eax\n" + /* Skip ip, orig_ax, es, ds, fs */ + " addl $20, %esp\n" + " popf\n" +#endif + " ret\n"); +} + +/* + * Called from kretprobe_trampoline + */ +void * __kprobes trampoline_handler(struct pt_regs *regs) +{ + struct kretprobe_instance *ri = NULL; + struct hlist_head *head, empty_rp; + struct hlist_node *node, *tmp; + unsigned long flags, orig_ret_address = 0; + unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline; + + INIT_HLIST_HEAD(&empty_rp); + spin_lock_irqsave(&kretprobe_lock, flags); + head = kretprobe_inst_table_head(current); + /* fixup registers */ +#ifdef CONFIG_X86_64 + regs->cs = __KERNEL_CS; +#else + regs->cs = __KERNEL_CS | get_kernel_rpl(); +#endif + regs->ip = trampoline_address; + regs->orig_ax = ~0UL; + + /* + * It is possible to have multiple instances associated with a given + * task either because multiple functions in the call path have + * return probes installed on them, and/or more then one + * return probe was registered for a target function. + * + * We can handle this because: + * - instances are always pushed into the head of the list + * - when multiple return probes are registered for the same + * function, the (chronologically) first instance's ret_addr + * will be the real return address, and all the rest will + * point to kretprobe_trampoline. + */ + hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { + if (ri->task != current) + /* another task is sharing our hash bucket */ + continue; + + if (ri->rp && ri->rp->handler) { + __get_cpu_var(current_kprobe) = &ri->rp->kp; + get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE; + ri->rp->handler(ri, regs); + __get_cpu_var(current_kprobe) = NULL; + } + + orig_ret_address = (unsigned long)ri->ret_addr; + recycle_rp_inst(ri, &empty_rp); + + if (orig_ret_address != trampoline_address) + /* + * This is the real return address. 
Any other + * instances associated with this task are for + * other calls deeper on the call stack + */ + break; + } + + kretprobe_assert(ri, orig_ret_address, trampoline_address); + + spin_unlock_irqrestore(&kretprobe_lock, flags); + + hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { + hlist_del(&ri->hlist); + kfree(ri); + } + return (void *)orig_ret_address; +} + +/* + * Called after single-stepping. p->addr is the address of the + * instruction whose first byte has been replaced by the "int 3" + * instruction. To avoid the SMP problems that can occur when we + * temporarily put back the original opcode to single-step, we + * single-stepped a copy of the instruction. The address of this + * copy is p->ainsn.insn. + * + * This function prepares to return from the post-single-step + * interrupt. We have to fix up the stack as follows: + * + * 0) Except in the case of absolute or indirect jump or call instructions, + * the new ip is relative to the copied instruction. We need to make + * it relative to the original instruction. + * + * 1) If the single-stepped instruction was pushfl, then the TF and IF + * flags are set in the just-pushed flags, and may need to be cleared. + * + * 2) If the single-stepped instruction was a call, the return address + * that is atop the stack is the address following the copied instruction. + * We need to make it the address following the original instruction. + * + * If this is the first time we've single-stepped the instruction at + * this probepoint, and the instruction is boostable, boost it: add a + * jump instruction after the copied instruction, that jumps to the next + * instruction after the probepoint. + */ +static void __kprobes resume_execution(struct kprobe *p, + struct pt_regs *regs, struct kprobe_ctlblk *kcb) +{ + unsigned long *tos = stack_addr(regs); + unsigned long copy_ip = (unsigned long)p->ainsn.insn; + unsigned long orig_ip = (unsigned long)p->addr; + kprobe_opcode_t *insn = p->ainsn.insn; + + /*skip the REX prefix*/ + if (is_REX_prefix(insn)) + insn++; + + regs->flags &= ~X86_EFLAGS_TF; + switch (*insn) { + case 0x9c: /* pushfl */ + *tos &= ~(X86_EFLAGS_TF | X86_EFLAGS_IF); + *tos |= kcb->kprobe_old_flags; + break; + case 0xc2: /* iret/ret/lret */ + case 0xc3: + case 0xca: + case 0xcb: + case 0xcf: + case 0xea: /* jmp absolute -- ip is correct */ + /* ip is already adjusted, no more changes required */ + p->ainsn.boostable = 1; + goto no_change; + case 0xe8: /* call relative - Fix return addr */ + *tos = orig_ip + (*tos - copy_ip); + break; +#ifdef CONFIG_X86_32 + case 0x9a: /* call absolute -- same as call absolute, indirect */ + *tos = orig_ip + (*tos - copy_ip); + goto no_change; +#endif + case 0xff: + if ((insn[1] & 0x30) == 0x10) { + /* + * call absolute, indirect + * Fix return addr; ip is correct. + * But this is not boostable + */ + *tos = orig_ip + (*tos - copy_ip); + goto no_change; + } else if (((insn[1] & 0x31) == 0x20) || + ((insn[1] & 0x31) == 0x21)) { + /* + * jmp near and far, absolute indirect + * ip is correct. And this is boostable + */ + p->ainsn.boostable = 1; + goto no_change; + } + default: + break; + } + + if (p->ainsn.boostable == 0) { + if ((regs->ip > copy_ip) && + (regs->ip - copy_ip) + 5 < MAX_INSN_SIZE) { + /* + * These instructions can be executed directly if it + * jumps back to correct address. 
+ */ + set_jmp_op((void *)regs->ip, + (void *)orig_ip + (regs->ip - copy_ip)); + p->ainsn.boostable = 1; + } else { + p->ainsn.boostable = -1; + } + } + + regs->ip += orig_ip - copy_ip; + +no_change: + restore_btf(); +} + +/* + * Interrupts are disabled on entry as trap1 is an interrupt gate and they + * remain disabled thoroughout this function. + */ +static int __kprobes post_kprobe_handler(struct pt_regs *regs) +{ + struct kprobe *cur = kprobe_running(); + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + + if (!cur) + return 0; + + if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) { + kcb->kprobe_status = KPROBE_HIT_SSDONE; + cur->post_handler(cur, regs, 0); + } + + resume_execution(cur, regs, kcb); + regs->flags |= kcb->kprobe_saved_flags; + trace_hardirqs_fixup_flags(regs->flags); + + /* Restore back the original saved kprobes variables and continue. */ + if (kcb->kprobe_status == KPROBE_REENTER) { + restore_previous_kprobe(kcb); + goto out; + } + reset_current_kprobe(); +out: + preempt_enable_no_resched(); + + /* + * if somebody else is singlestepping across a probe point, flags + * will have TF set, in which case, continue the remaining processing + * of do_debug, as if this is not a probe hit. + */ + if (regs->flags & X86_EFLAGS_TF) + return 0; + + return 1; +} + +int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr) +{ + struct kprobe *cur = kprobe_running(); + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + + switch (kcb->kprobe_status) { + case KPROBE_HIT_SS: + case KPROBE_REENTER: + /* + * We are here because the instruction being single + * stepped caused a page fault. We reset the current + * kprobe and the ip points back to the probe address + * and allow the page fault handler to continue as a + * normal page fault. + */ + regs->ip = (unsigned long)cur->addr; + regs->flags |= kcb->kprobe_old_flags; + if (kcb->kprobe_status == KPROBE_REENTER) + restore_previous_kprobe(kcb); + else + reset_current_kprobe(); + preempt_enable_no_resched(); + break; + case KPROBE_HIT_ACTIVE: + case KPROBE_HIT_SSDONE: + /* + * We increment the nmissed count for accounting, + * we can also use npre/npostfault count for accounting + * these specific fault cases. + */ + kprobes_inc_nmissed_count(cur); + + /* + * We come here because instructions in the pre/post + * handler caused the page_fault, this could happen + * if handler tries to access user space by + * copy_from_user(), get_user() etc. Let the + * user-specified handler try to fix it first. + */ + if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr)) + return 1; + + /* + * In case the user-specified fault handler returned + * zero, try to fix up. + */ + if (fixup_exception(regs)) + return 1; + + /* + * fixup routine could not handle it, + * Let do_page_fault() fix it. + */ + break; + default: + break; + } + return 0; +} + +/* + * Wrapper routine for handling exceptions. + */ +int __kprobes kprobe_exceptions_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct die_args *args = data; + int ret = NOTIFY_DONE; + + if (args->regs && user_mode_vm(args->regs)) + return ret; + + switch (val) { + case DIE_INT3: + if (kprobe_handler(args->regs)) + ret = NOTIFY_STOP; + break; + case DIE_DEBUG: + if (post_kprobe_handler(args->regs)) + ret = NOTIFY_STOP; + break; + case DIE_GPF: + /* + * To be potentially processing a kprobe fault and to + * trust the result from kprobe_running(), we have + * be non-preemptible. 
+ */ + if (!preemptible() && kprobe_running() && + kprobe_fault_handler(args->regs, args->trapnr)) + ret = NOTIFY_STOP; + break; + default: + break; + } + return ret; +} + +int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) +{ + struct jprobe *jp = container_of(p, struct jprobe, kp); + unsigned long addr; + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + + kcb->jprobe_saved_regs = *regs; + kcb->jprobe_saved_sp = stack_addr(regs); + addr = (unsigned long)(kcb->jprobe_saved_sp); + + /* + * As Linus pointed out, gcc assumes that the callee + * owns the argument space and could overwrite it, e.g. + * tailcall optimization. So, to be absolutely safe + * we also save and restore enough stack bytes to cover + * the argument area. + */ + memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr, + MIN_STACK_SIZE(addr)); + regs->flags &= ~X86_EFLAGS_IF; + trace_hardirqs_off(); + regs->ip = (unsigned long)(jp->entry); + return 1; +} + +void __kprobes jprobe_return(void) +{ + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + + asm volatile ( +#ifdef CONFIG_X86_64 + " xchg %%rbx,%%rsp \n" +#else + " xchgl %%ebx,%%esp \n" +#endif + " int3 \n" + " .globl jprobe_return_end\n" + " jprobe_return_end: \n" + " nop \n"::"b" + (kcb->jprobe_saved_sp):"memory"); +} + +int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) +{ + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + u8 *addr = (u8 *) (regs->ip - 1); + struct jprobe *jp = container_of(p, struct jprobe, kp); + + if ((addr > (u8 *) jprobe_return) && + (addr < (u8 *) jprobe_return_end)) { + if (stack_addr(regs) != kcb->jprobe_saved_sp) { + struct pt_regs *saved_regs = &kcb->jprobe_saved_regs; + printk(KERN_ERR + "current sp %p does not match saved sp %p\n", + stack_addr(regs), kcb->jprobe_saved_sp); + printk(KERN_ERR "Saved registers for jprobe %p\n", jp); + show_registers(saved_regs); + printk(KERN_ERR "Current registers\n"); + show_registers(regs); + BUG(); + } + *regs = kcb->jprobe_saved_regs; + memcpy((kprobe_opcode_t *)(kcb->jprobe_saved_sp), + kcb->jprobes_stack, + MIN_STACK_SIZE(kcb->jprobe_saved_sp)); + preempt_enable_no_resched(); + return 1; + } + return 0; +} + +int __init arch_init_kprobes(void) +{ + return 0; +} + +int __kprobes arch_trampoline_kprobe(struct kprobe *p) +{ + return 0; +} diff --git a/arch/x86/kernel/kprobes_32.c b/arch/x86/kernel/kprobes_32.c deleted file mode 100644 index 3a020f79f82b..000000000000 --- a/arch/x86/kernel/kprobes_32.c +++ /dev/null @@ -1,756 +0,0 @@ -/* - * Kernel Probes (KProbes) - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * Copyright (C) IBM Corporation, 2002, 2004 - * - * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel - * Probes initial implementation ( includes contributions from - * Rusty Russell). 
- * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes - * interface to access function arguments. - * 2005-May Hien Nguyen <hien@us.ibm.com>, Jim Keniston - * <jkenisto@us.ibm.com> and Prasanna S Panchamukhi - * <prasanna@in.ibm.com> added function-return probes. - */ - -#include <linux/kprobes.h> -#include <linux/ptrace.h> -#include <linux/preempt.h> -#include <linux/kdebug.h> -#include <asm/cacheflush.h> -#include <asm/desc.h> -#include <asm/uaccess.h> -#include <asm/alternative.h> - -void jprobe_return_end(void); - -DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; -DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); - -struct kretprobe_blackpoint kretprobe_blacklist[] = { - {"__switch_to", }, /* This function switches only current task, but - doesn't switch kernel stack.*/ - {NULL, NULL} /* Terminator */ -}; -const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist); - -/* insert a jmp code */ -static __always_inline void set_jmp_op(void *from, void *to) -{ - struct __arch_jmp_op { - char op; - long raddr; - } __attribute__((packed)) *jop; - jop = (struct __arch_jmp_op *)from; - jop->raddr = (long)(to) - ((long)(from) + 5); - jop->op = RELATIVEJUMP_INSTRUCTION; -} - -/* - * returns non-zero if opcodes can be boosted. - */ -static __always_inline int can_boost(kprobe_opcode_t *opcodes) -{ -#define W(row,b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,ba,bb,bc,bd,be,bf) \ - (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ - (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \ - (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \ - (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \ - << (row % 32)) - /* - * Undefined/reserved opcodes, conditional jump, Opcode Extension - * Groups, and some special opcodes can not be boost. 
- */ - static const unsigned long twobyte_is_boostable[256 / 32] = { - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ - /* ------------------------------- */ - W(0x00, 0,0,1,1,0,0,1,0,1,1,0,0,0,0,0,0)| /* 00 */ - W(0x10, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 10 */ - W(0x20, 1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0)| /* 20 */ - W(0x30, 0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 30 */ - W(0x40, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 40 */ - W(0x50, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 50 */ - W(0x60, 1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,1)| /* 60 */ - W(0x70, 0,0,0,0,1,1,1,1,0,0,0,0,0,0,1,1), /* 70 */ - W(0x80, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 80 */ - W(0x90, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1), /* 90 */ - W(0xa0, 1,1,0,1,1,1,0,0,1,1,0,1,1,1,0,1)| /* a0 */ - W(0xb0, 1,1,1,1,1,1,1,1,0,0,0,1,1,1,1,1), /* b0 */ - W(0xc0, 1,1,0,0,0,0,0,0,1,1,1,1,1,1,1,1)| /* c0 */ - W(0xd0, 0,1,1,1,0,1,0,0,1,1,0,1,1,1,0,1), /* d0 */ - W(0xe0, 0,1,1,0,0,1,0,0,1,1,0,1,1,1,0,1)| /* e0 */ - W(0xf0, 0,1,1,1,0,1,0,0,1,1,1,0,1,1,1,0) /* f0 */ - /* ------------------------------- */ - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ - }; -#undef W - kprobe_opcode_t opcode; - kprobe_opcode_t *orig_opcodes = opcodes; -retry: - if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1) - return 0; - opcode = *(opcodes++); - - /* 2nd-byte opcode */ - if (opcode == 0x0f) { - if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1) - return 0; - return test_bit(*opcodes, twobyte_is_boostable); - } - - switch (opcode & 0xf0) { - case 0x60: - if (0x63 < opcode && opcode < 0x67) - goto retry; /* prefixes */ - /* can't boost Address-size override and bound */ - return (opcode != 0x62 && opcode != 0x67); - case 0x70: - return 0; /* can't boost conditional jump */ - case 0xc0: - /* can't boost software-interruptions */ - return (0xc1 < opcode && opcode < 0xcc) || opcode == 0xcf; - case 0xd0: - /* can boost AA* and XLAT */ - return (opcode == 0xd4 || opcode == 0xd5 || opcode == 0xd7); - case 0xe0: - /* can boost in/out and absolute jmps */ - return ((opcode & 0x04) || opcode == 0xea); - case 0xf0: - if ((opcode & 0x0c) == 0 && opcode != 0xf1) - goto retry; /* lock/rep(ne) prefix */ - /* clear and set flags can be boost */ - return (opcode == 0xf5 || (0xf7 < opcode && opcode < 0xfe)); - default: - if (opcode == 0x26 || opcode == 0x36 || opcode == 0x3e) - goto retry; /* prefixes */ - /* can't boost CS override and call */ - return (opcode != 0x2e && opcode != 0x9a); - } -} - -/* - * returns non-zero if opcode modifies the interrupt flag. - */ -static int __kprobes is_IF_modifier(kprobe_opcode_t opcode) -{ - switch (opcode) { - case 0xfa: /* cli */ - case 0xfb: /* sti */ - case 0xcf: /* iret/iretd */ - case 0x9d: /* popf/popfd */ - return 1; - } - return 0; -} - -int __kprobes arch_prepare_kprobe(struct kprobe *p) -{ - /* insn: must be on special executable page on i386. 
*/ - p->ainsn.insn = get_insn_slot(); - if (!p->ainsn.insn) - return -ENOMEM; - - memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); - p->opcode = *p->addr; - if (can_boost(p->addr)) { - p->ainsn.boostable = 0; - } else { - p->ainsn.boostable = -1; - } - return 0; -} - -void __kprobes arch_arm_kprobe(struct kprobe *p) -{ - text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1); -} - -void __kprobes arch_disarm_kprobe(struct kprobe *p) -{ - text_poke(p->addr, &p->opcode, 1); -} - -void __kprobes arch_remove_kprobe(struct kprobe *p) -{ - mutex_lock(&kprobe_mutex); - free_insn_slot(p->ainsn.insn, (p->ainsn.boostable == 1)); - mutex_unlock(&kprobe_mutex); -} - -static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb) -{ - kcb->prev_kprobe.kp = kprobe_running(); - kcb->prev_kprobe.status = kcb->kprobe_status; - kcb->prev_kprobe.old_eflags = kcb->kprobe_old_eflags; - kcb->prev_kprobe.saved_eflags = kcb->kprobe_saved_eflags; -} - -static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb) -{ - __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp; - kcb->kprobe_status = kcb->prev_kprobe.status; - kcb->kprobe_old_eflags = kcb->prev_kprobe.old_eflags; - kcb->kprobe_saved_eflags = kcb->prev_kprobe.saved_eflags; -} - -static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs, - struct kprobe_ctlblk *kcb) -{ - __get_cpu_var(current_kprobe) = p; - kcb->kprobe_saved_eflags = kcb->kprobe_old_eflags - = (regs->eflags & (TF_MASK | IF_MASK)); - if (is_IF_modifier(p->opcode)) - kcb->kprobe_saved_eflags &= ~IF_MASK; -} - -static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs) -{ - regs->eflags |= TF_MASK; - regs->eflags &= ~IF_MASK; - /*single step inline if the instruction is an int3*/ - if (p->opcode == BREAKPOINT_INSTRUCTION) - regs->eip = (unsigned long)p->addr; - else - regs->eip = (unsigned long)p->ainsn.insn; -} - -/* Called with kretprobe_lock held */ -void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, - struct pt_regs *regs) -{ - unsigned long *sara = (unsigned long *)&regs->esp; - - ri->ret_addr = (kprobe_opcode_t *) *sara; - - /* Replace the return addr with trampoline addr */ - *sara = (unsigned long) &kretprobe_trampoline; -} - -/* - * Interrupts are disabled on entry as trap3 is an interrupt gate and they - * remain disabled thorough out this function. - */ -static int __kprobes kprobe_handler(struct pt_regs *regs) -{ - struct kprobe *p; - int ret = 0; - kprobe_opcode_t *addr; - struct kprobe_ctlblk *kcb; - - addr = (kprobe_opcode_t *)(regs->eip - sizeof(kprobe_opcode_t)); - - /* - * We don't want to be preempted for the entire - * duration of kprobe processing - */ - preempt_disable(); - kcb = get_kprobe_ctlblk(); - - /* Check we're not actually recursing */ - if (kprobe_running()) { - p = get_kprobe(addr); - if (p) { - if (kcb->kprobe_status == KPROBE_HIT_SS && - *p->ainsn.insn == BREAKPOINT_INSTRUCTION) { - regs->eflags &= ~TF_MASK; - regs->eflags |= kcb->kprobe_saved_eflags; - goto no_kprobe; - } - /* We have reentered the kprobe_handler(), since - * another probe was hit while within the handler. - * We here save the original kprobes variables and - * just single step on the instruction of the new probe - * without calling any user handlers.
- */ - save_previous_kprobe(kcb); - set_current_kprobe(p, regs, kcb); - kprobes_inc_nmissed_count(p); - prepare_singlestep(p, regs); - kcb->kprobe_status = KPROBE_REENTER; - return 1; - } else { - if (*addr != BREAKPOINT_INSTRUCTION) { - /* The breakpoint instruction was removed by - * another cpu right after we hit, no further - * handling of this interrupt is appropriate - */ - regs->eip -= sizeof(kprobe_opcode_t); - ret = 1; - goto no_kprobe; - } - p = __get_cpu_var(current_kprobe); - if (p->break_handler && p->break_handler(p, regs)) { - goto ss_probe; - } - } - goto no_kprobe; - } - - p = get_kprobe(addr); - if (!p) { - if (*addr != BREAKPOINT_INSTRUCTION) { - /* - * The breakpoint instruction was removed right - * after we hit it. Another cpu has removed - * either a probepoint or a debugger breakpoint - * at this address. In either case, no further - * handling of this interrupt is appropriate. - * Back up over the (now missing) int3 and run - * the original instruction. - */ - regs->eip -= sizeof(kprobe_opcode_t); - ret = 1; - } - /* Not one of ours: let kernel handle it */ - goto no_kprobe; - } - - set_current_kprobe(p, regs, kcb); - kcb->kprobe_status = KPROBE_HIT_ACTIVE; - - if (p->pre_handler && p->pre_handler(p, regs)) - /* handler has already set things up, so skip ss setup */ - return 1; - -ss_probe: -#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PM) - if (p->ainsn.boostable == 1 && !p->post_handler){ - /* Boost up -- we can execute copied instructions directly */ - reset_current_kprobe(); - regs->eip = (unsigned long)p->ainsn.insn; - preempt_enable_no_resched(); - return 1; - } -#endif - prepare_singlestep(p, regs); - kcb->kprobe_status = KPROBE_HIT_SS; - return 1; - -no_kprobe: - preempt_enable_no_resched(); - return ret; -} - -/* - * For function-return probes, init_kprobes() establishes a probepoint - * here. When a retprobed function returns, this probe is hit and - * trampoline_probe_handler() runs, calling the kretprobe's handler. 
- */ - void __kprobes kretprobe_trampoline_holder(void) - { - asm volatile ( ".global kretprobe_trampoline\n" - "kretprobe_trampoline: \n" - " pushf\n" - /* skip cs, eip, orig_eax */ - " subl $12, %esp\n" - " pushl %fs\n" - " pushl %ds\n" - " pushl %es\n" - " pushl %eax\n" - " pushl %ebp\n" - " pushl %edi\n" - " pushl %esi\n" - " pushl %edx\n" - " pushl %ecx\n" - " pushl %ebx\n" - " movl %esp, %eax\n" - " call trampoline_handler\n" - /* move eflags to cs */ - " movl 52(%esp), %edx\n" - " movl %edx, 48(%esp)\n" - /* save true return address on eflags */ - " movl %eax, 52(%esp)\n" - " popl %ebx\n" - " popl %ecx\n" - " popl %edx\n" - " popl %esi\n" - " popl %edi\n" - " popl %ebp\n" - " popl %eax\n" - /* skip eip, orig_eax, es, ds, fs */ - " addl $20, %esp\n" - " popf\n" - " ret\n"); -} - -/* - * Called from kretprobe_trampoline - */ -fastcall void *__kprobes trampoline_handler(struct pt_regs *regs) -{ - struct kretprobe_instance *ri = NULL; - struct hlist_head *head, empty_rp; - struct hlist_node *node, *tmp; - unsigned long flags, orig_ret_address = 0; - unsigned long trampoline_address =(unsigned long)&kretprobe_trampoline; - - INIT_HLIST_HEAD(&empty_rp); - spin_lock_irqsave(&kretprobe_lock, flags); - head = kretprobe_inst_table_head(current); - /* fixup registers */ - regs->xcs = __KERNEL_CS | get_kernel_rpl(); - regs->eip = trampoline_address; - regs->orig_eax = 0xffffffff; - - /* - * It is possible to have multiple instances associated with a given - * task either because an multiple functions in the call path - * have a return probe installed on them, and/or more then one return - * return probe was registered for a target function. - * - * We can handle this because: - * - instances are always inserted at the head of the list - * - when multiple return probes are registered for the same - * function, the first instance's ret_addr will point to the - * real return address, and all the rest will point to - * kretprobe_trampoline - */ - hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { - if (ri->task != current) - /* another task is sharing our hash bucket */ - continue; - - if (ri->rp && ri->rp->handler){ - __get_cpu_var(current_kprobe) = &ri->rp->kp; - get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE; - ri->rp->handler(ri, regs); - __get_cpu_var(current_kprobe) = NULL; - } - - orig_ret_address = (unsigned long)ri->ret_addr; - recycle_rp_inst(ri, &empty_rp); - - if (orig_ret_address != trampoline_address) - /* - * This is the real return address. Any other - * instances associated with this task are for - * other calls deeper on the call stack - */ - break; - } - - kretprobe_assert(ri, orig_ret_address, trampoline_address); - spin_unlock_irqrestore(&kretprobe_lock, flags); - - hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { - hlist_del(&ri->hlist); - kfree(ri); - } - return (void*)orig_ret_address; -} - -/* - * Called after single-stepping. p->addr is the address of the - * instruction whose first byte has been replaced by the "int 3" - * instruction. To avoid the SMP problems that can occur when we - * temporarily put back the original opcode to single-step, we - * single-stepped a copy of the instruction. The address of this - * copy is p->ainsn.insn. - * - * This function prepares to return from the post-single-step - * interrupt. We have to fix up the stack as follows: - * - * 0) Except in the case of absolute or indirect jump or call instructions, - * the new eip is relative to the copied instruction. 
We need to make - it relative to the original instruction. - * - * 1) If the single-stepped instruction was pushfl, then the TF and IF - * flags are set in the just-pushed eflags, and may need to be cleared. - * - * 2) If the single-stepped instruction was a call, the return address - * that is atop the stack is the address following the copied instruction. - * We need to make it the address following the original instruction. - * - * This function also checks instruction size for preparing direct execution. - */ -static void __kprobes resume_execution(struct kprobe *p, - struct pt_regs *regs, struct kprobe_ctlblk *kcb) -{ - unsigned long *tos = (unsigned long *)&regs->esp; - unsigned long copy_eip = (unsigned long)p->ainsn.insn; - unsigned long orig_eip = (unsigned long)p->addr; - - regs->eflags &= ~TF_MASK; - switch (p->ainsn.insn[0]) { - case 0x9c: /* pushfl */ - *tos &= ~(TF_MASK | IF_MASK); - *tos |= kcb->kprobe_old_eflags; - break; - case 0xc2: /* iret/ret/lret */ - case 0xc3: - case 0xca: - case 0xcb: - case 0xcf: - case 0xea: /* jmp absolute -- eip is correct */ - /* eip is already adjusted, no more changes required */ - p->ainsn.boostable = 1; - goto no_change; - case 0xe8: /* call relative - Fix return addr */ - *tos = orig_eip + (*tos - copy_eip); - break; - case 0x9a: /* call absolute -- same as call absolute, indirect */ - *tos = orig_eip + (*tos - copy_eip); - goto no_change; - case 0xff: - if ((p->ainsn.insn[1] & 0x30) == 0x10) { - /* - * call absolute, indirect - * Fix return addr; eip is correct. - * But this is not boostable - */ - *tos = orig_eip + (*tos - copy_eip); - goto no_change; - } else if (((p->ainsn.insn[1] & 0x31) == 0x20) || /* jmp near, absolute indirect */ - ((p->ainsn.insn[1] & 0x31) == 0x21)) { /* jmp far, absolute indirect */ - /* eip is correct. And this is boostable */ - p->ainsn.boostable = 1; - goto no_change; - } - default: - break; - } - - if (p->ainsn.boostable == 0) { - if ((regs->eip > copy_eip) && - (regs->eip - copy_eip) + 5 < MAX_INSN_SIZE) { - /* - * These instructions can be executed directly if it - * jumps back to correct address. - */ - set_jmp_op((void *)regs->eip, - (void *)orig_eip + (regs->eip - copy_eip)); - p->ainsn.boostable = 1; - } else { - p->ainsn.boostable = -1; - } - } - - regs->eip = orig_eip + (regs->eip - copy_eip); - -no_change: - return; -} - -/* - * Interrupts are disabled on entry as trap1 is an interrupt gate and they - * remain disabled thoroughout this function. - */ -static int __kprobes post_kprobe_handler(struct pt_regs *regs) -{ - struct kprobe *cur = kprobe_running(); - struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - - if (!cur) - return 0; - - if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) { - kcb->kprobe_status = KPROBE_HIT_SSDONE; - cur->post_handler(cur, regs, 0); - } - - resume_execution(cur, regs, kcb); - regs->eflags |= kcb->kprobe_saved_eflags; - trace_hardirqs_fixup_flags(regs->eflags); - - /*Restore back the original saved kprobes variables and continue. */ - if (kcb->kprobe_status == KPROBE_REENTER) { - restore_previous_kprobe(kcb); - goto out; - } - reset_current_kprobe(); -out: - preempt_enable_no_resched(); - - /* - * if somebody else is singlestepping across a probe point, eflags - * will have TF set, in which case, continue the remaining processing - * of do_debug, as if this is not a probe hit.
- */ - if (regs->eflags & TF_MASK) - return 0; - - return 1; -} - -int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr) -{ - struct kprobe *cur = kprobe_running(); - struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - - switch(kcb->kprobe_status) { - case KPROBE_HIT_SS: - case KPROBE_REENTER: - /* - * We are here because the instruction being single - * stepped caused a page fault. We reset the current - * kprobe and the eip points back to the probe address - * and allow the page fault handler to continue as a - * normal page fault. - */ - regs->eip = (unsigned long)cur->addr; - regs->eflags |= kcb->kprobe_old_eflags; - if (kcb->kprobe_status == KPROBE_REENTER) - restore_previous_kprobe(kcb); - else - reset_current_kprobe(); - preempt_enable_no_resched(); - break; - case KPROBE_HIT_ACTIVE: - case KPROBE_HIT_SSDONE: - /* - * We increment the nmissed count for accounting, - * we can also use npre/npostfault count for accouting - * these specific fault cases. - */ - kprobes_inc_nmissed_count(cur); - - /* - * We come here because instructions in the pre/post - * handler caused the page_fault, this could happen - * if handler tries to access user space by - * copy_from_user(), get_user() etc. Let the - * user-specified handler try to fix it first. - */ - if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr)) - return 1; - - /* - * In case the user-specified fault handler returned - * zero, try to fix up. - */ - if (fixup_exception(regs)) - return 1; - - /* - * fixup_exception() could not handle it, - * Let do_page_fault() fix it. - */ - break; - default: - break; - } - return 0; -} - -/* - * Wrapper routine to for handling exceptions. - */ -int __kprobes kprobe_exceptions_notify(struct notifier_block *self, - unsigned long val, void *data) -{ - struct die_args *args = (struct die_args *)data; - int ret = NOTIFY_DONE; - - if (args->regs && user_mode_vm(args->regs)) - return ret; - - switch (val) { - case DIE_INT3: - if (kprobe_handler(args->regs)) - ret = NOTIFY_STOP; - break; - case DIE_DEBUG: - if (post_kprobe_handler(args->regs)) - ret = NOTIFY_STOP; - break; - case DIE_GPF: - /* kprobe_running() needs smp_processor_id() */ - preempt_disable(); - if (kprobe_running() && - kprobe_fault_handler(args->regs, args->trapnr)) - ret = NOTIFY_STOP; - preempt_enable(); - break; - default: - break; - } - return ret; -} - -int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) -{ - struct jprobe *jp = container_of(p, struct jprobe, kp); - unsigned long addr; - struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - - kcb->jprobe_saved_regs = *regs; - kcb->jprobe_saved_esp = &regs->esp; - addr = (unsigned long)(kcb->jprobe_saved_esp); - - /* - * TBD: As Linus pointed out, gcc assumes that the callee - * owns the argument space and could overwrite it, e.g. - * tailcall optimization. So, to be absolutely safe - * we also save and restore enough stack bytes to cover - * the argument area.
- */ - memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr, - MIN_STACK_SIZE(addr)); - regs->eflags &= ~IF_MASK; - trace_hardirqs_off(); - regs->eip = (unsigned long)(jp->entry); - return 1; -} - -void __kprobes jprobe_return(void) -{ - struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - - asm volatile (" xchgl %%ebx,%%esp \n" - " int3 \n" - " .globl jprobe_return_end \n" - " jprobe_return_end: \n" - " nop \n"::"b" - (kcb->jprobe_saved_esp):"memory"); -} - -int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) -{ - struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - u8 *addr = (u8 *) (regs->eip - 1); - unsigned long stack_addr = (unsigned long)(kcb->jprobe_saved_esp); - struct jprobe *jp = container_of(p, struct jprobe, kp); - - if ((addr > (u8 *) jprobe_return) && (addr < (u8 *) jprobe_return_end)) { - if (&regs->esp != kcb->jprobe_saved_esp) { - struct pt_regs *saved_regs = &kcb->jprobe_saved_regs; - printk("current esp %p does not match saved esp %p\n", - &regs->esp, kcb->jprobe_saved_esp); - printk("Saved registers for jprobe %p\n", jp); - show_registers(saved_regs); - printk("Current registers\n"); - show_registers(regs); - BUG(); - } - *regs = kcb->jprobe_saved_regs; - memcpy((kprobe_opcode_t *) stack_addr, kcb->jprobes_stack, - MIN_STACK_SIZE(stack_addr)); - preempt_enable_no_resched(); - return 1; - } - return 0; -} - -int __kprobes arch_trampoline_kprobe(struct kprobe *p) -{ - return 0; -} - -int __init arch_init_kprobes(void) -{ - return 0; -} diff --git a/arch/x86/kernel/kprobes_64.c b/arch/x86/kernel/kprobes_64.c deleted file mode 100644 index 5df19a9f9239..000000000000 --- a/arch/x86/kernel/kprobes_64.c +++ /dev/null @@ -1,749 +0,0 @@ -/* - * Kernel Probes (KProbes) - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * Copyright (C) IBM Corporation, 2002, 2004 - * - * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel - * Probes initial implementation ( includes contributions from - * Rusty Russell). - * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes - * interface to access function arguments. - * 2004-Oct Jim Keniston <kenistoj@us.ibm.com> and Prasanna S Panchamukhi - * <prasanna@in.ibm.com> adapted for x86_64 - * 2005-Mar Roland McGrath <roland@redhat.com> - * Fixed to handle %rip-relative addressing mode correctly.
- * 2005-May Rusty Lynch <rusty.lynch@intel.com> - * Added function return probes functionality - */ - -#include <linux/kprobes.h> -#include <linux/ptrace.h> -#include <linux/string.h> -#include <linux/slab.h> -#include <linux/preempt.h> -#include <linux/module.h> -#include <linux/kdebug.h> - -#include <asm/pgtable.h> -#include <asm/uaccess.h> -#include <asm/alternative.h> - -void jprobe_return_end(void); -static void __kprobes arch_copy_kprobe(struct kprobe *p); - -DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; -DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); - -struct kretprobe_blackpoint kretprobe_blacklist[] = { - {"__switch_to", }, /* This function switches only current task, but - doesn't switch kernel stack.*/ - {NULL, NULL} /* Terminator */ -}; -const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist); - -/* - * returns non-zero if opcode modifies the interrupt flag. - */ -static int __kprobes is_IF_modifier(kprobe_opcode_t *insn) -{ - switch (*insn) { - case 0xfa: /* cli */ - case 0xfb: /* sti */ - case 0xcf: /* iret/iretd */ - case 0x9d: /* popf/popfd */ - return 1; - } - - if (*insn >= 0x40 && *insn <= 0x4f && *++insn == 0xcf) - return 1; - return 0; -} - -int __kprobes arch_prepare_kprobe(struct kprobe *p) -{ - /* insn: must be on special executable page on x86_64. */ - p->ainsn.insn = get_insn_slot(); - if (!p->ainsn.insn) { - return -ENOMEM; - } - arch_copy_kprobe(p); - return 0; -} - -/* - * Determine if the instruction uses the %rip-relative addressing mode. - * If it does, return the address of the 32-bit displacement word. - * If not, return null. - */ -static s32 __kprobes *is_riprel(u8 *insn) -{ -#define W(row,b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,ba,bb,bc,bd,be,bf) \ - (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ - (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \ - (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \ - (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \ - << (row % 64)) - static const u64 onebyte_has_modrm[256 / 64] = { - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ - /* ------------------------------- */ - W(0x00, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 00 */ - W(0x10, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 10 */ - W(0x20, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 20 */ - W(0x30, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0), /* 30 */ - W(0x40, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 40 */ - W(0x50, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 50 */ - W(0x60, 0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0)| /* 60 */ - W(0x70, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 70 */ - W(0x80, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 80 */ - W(0x90, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 90 */ - W(0xa0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* a0 */ - W(0xb0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* b0 */ - W(0xc0, 1,1,0,0,1,1,1,1,0,0,0,0,0,0,0,0)| /* c0 */ - W(0xd0, 1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1)| /* d0 */ - W(0xe0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* e0 */ - W(0xf0, 0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,1) /* f0 */ - /* ------------------------------- */ - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ - }; - static const u64 twobyte_has_modrm[256 / 64] = { - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ - /* ------------------------------- */ - W(0x00, 1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,1)| /* 0f */ - W(0x10, 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0)| /* 1f */ - W(0x20, 1,1,1,1,1,0,1,0,1,1,1,1,1,1,1,1)| /* 2f */ - W(0x30, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 3f */ - W(0x40, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 4f */ - W(0x50, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 5f */ - W(0x60, 
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 6f */ - W(0x70, 1,1,1,1,1,1,1,0,0,0,0,0,1,1,1,1), /* 7f */ - W(0x80, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 8f */ - W(0x90, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 9f */ - W(0xa0, 0,0,0,1,1,1,1,1,0,0,0,1,1,1,1,1)| /* af */ - W(0xb0, 1,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1), /* bf */ - W(0xc0, 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0)| /* cf */ - W(0xd0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* df */ - W(0xe0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* ef */ - W(0xf0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0) /* ff */ - /* ------------------------------- */ - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ - }; -#undef W - int need_modrm; - - /* Skip legacy instruction prefixes. */ - while (1) { - switch (*insn) { - case 0x66: - case 0x67: - case 0x2e: - case 0x3e: - case 0x26: - case 0x64: - case 0x65: - case 0x36: - case 0xf0: - case 0xf3: - case 0xf2: - ++insn; - continue; - } - break; - } - - /* Skip REX instruction prefix. */ - if ((*insn & 0xf0) == 0x40) - ++insn; - - if (*insn == 0x0f) { /* Two-byte opcode. */ - ++insn; - need_modrm = test_bit(*insn, twobyte_has_modrm); - } else { /* One-byte opcode. */ - need_modrm = test_bit(*insn, onebyte_has_modrm); - } - - if (need_modrm) { - u8 modrm = *++insn; - if ((modrm & 0xc7) == 0x05) { /* %rip+disp32 addressing mode */ - /* Displacement follows ModRM byte. */ - return (s32 *) ++insn; - } - } - - /* No %rip-relative addressing mode here. */ - return NULL; -} - -static void __kprobes arch_copy_kprobe(struct kprobe *p) -{ - s32 *ripdisp; - memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE); - ripdisp = is_riprel(p->ainsn.insn); - if (ripdisp) { - /* - * The copied instruction uses the %rip-relative - * addressing mode. Adjust the displacement for the - * difference between the original location of this - * instruction and the location of the copy that will - * actually be run. The tricky bit here is making sure - * that the sign extension happens correctly in this - * calculation, since we need a signed 32-bit result to - * be sign-extended to 64 bits when it's added to the - * %rip value and yield the same 64-bit result that the - * sign-extension of the original signed 32-bit - * displacement would have given. - */ - s64 disp = (u8 *) p->addr + *ripdisp - (u8 *) p->ainsn.insn; - BUG_ON((s64) (s32) disp != disp); /* Sanity check. 
*/ - *ripdisp = disp; - } - p->opcode = *p->addr; -} - -void __kprobes arch_arm_kprobe(struct kprobe *p) -{ - text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1); -} - -void __kprobes arch_disarm_kprobe(struct kprobe *p) -{ - text_poke(p->addr, &p->opcode, 1); -} - -void __kprobes arch_remove_kprobe(struct kprobe *p) -{ - mutex_lock(&kprobe_mutex); - free_insn_slot(p->ainsn.insn, 0); - mutex_unlock(&kprobe_mutex); -} - -static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb) -{ - kcb->prev_kprobe.kp = kprobe_running(); - kcb->prev_kprobe.status = kcb->kprobe_status; - kcb->prev_kprobe.old_rflags = kcb->kprobe_old_rflags; - kcb->prev_kprobe.saved_rflags = kcb->kprobe_saved_rflags; -} - -static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb) -{ - __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp; - kcb->kprobe_status = kcb->prev_kprobe.status; - kcb->kprobe_old_rflags = kcb->prev_kprobe.old_rflags; - kcb->kprobe_saved_rflags = kcb->prev_kprobe.saved_rflags; -} - -static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs, - struct kprobe_ctlblk *kcb) -{ - __get_cpu_var(current_kprobe) = p; - kcb->kprobe_saved_rflags = kcb->kprobe_old_rflags - = (regs->eflags & (TF_MASK | IF_MASK)); - if (is_IF_modifier(p->ainsn.insn)) - kcb->kprobe_saved_rflags &= ~IF_MASK; -} - -static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs) -{ - regs->eflags |= TF_MASK; - regs->eflags &= ~IF_MASK; - /*single step inline if the instruction is an int3*/ - if (p->opcode == BREAKPOINT_INSTRUCTION) - regs->rip = (unsigned long)p->addr; - else - regs->rip = (unsigned long)p->ainsn.insn; -} - -/* Called with kretprobe_lock held */ -void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, - struct pt_regs *regs) -{ - unsigned long *sara = (unsigned long *)regs->rsp; - - ri->ret_addr = (kprobe_opcode_t *) *sara; - /* Replace the return addr with trampoline addr */ - *sara = (unsigned long) &kretprobe_trampoline; -} - -int __kprobes kprobe_handler(struct pt_regs *regs) -{ - struct kprobe *p; - int ret = 0; - kprobe_opcode_t *addr = (kprobe_opcode_t *)(regs->rip - sizeof(kprobe_opcode_t)); - struct kprobe_ctlblk *kcb; - - /* - * We don't want to be preempted for the entire - * duration of kprobe processing - */ - preempt_disable(); - kcb = get_kprobe_ctlblk(); - - /* Check we're not actually recursing */ - if (kprobe_running()) { - p = get_kprobe(addr); - if (p) { - if (kcb->kprobe_status == KPROBE_HIT_SS && - *p->ainsn.insn == BREAKPOINT_INSTRUCTION) { - regs->eflags &= ~TF_MASK; - regs->eflags |= kcb->kprobe_saved_rflags; - goto no_kprobe; - } else if (kcb->kprobe_status == KPROBE_HIT_SSDONE) { - /* TODO: Provide re-entrancy from - * post_kprobes_handler() and avoid exception - * stack corruption while single-stepping on - * the instruction of the new probe. - */ - arch_disarm_kprobe(p); - regs->rip = (unsigned long)p->addr; - reset_current_kprobe(); - ret = 1; - } else { - /* We have reentered the kprobe_handler(), since - * another probe was hit while within the - * handler. We here save the original kprobe - * variables and just single step on instruction - * of the new probe without calling any user - * handlers. 
- */ - save_previous_kprobe(kcb); - set_current_kprobe(p, regs, kcb); - kprobes_inc_nmissed_count(p); - prepare_singlestep(p, regs); - kcb->kprobe_status = KPROBE_REENTER; - return 1; - } - } else { - if (*addr != BREAKPOINT_INSTRUCTION) { - /* The breakpoint instruction was removed by - * another cpu right after we hit, no further - * handling of this interrupt is appropriate - */ - regs->rip = (unsigned long)addr; - ret = 1; - goto no_kprobe; - } - p = __get_cpu_var(current_kprobe); - if (p->break_handler && p->break_handler(p, regs)) { - goto ss_probe; - } - } - goto no_kprobe; - } - - p = get_kprobe(addr); - if (!p) { - if (*addr != BREAKPOINT_INSTRUCTION) { - /* - * The breakpoint instruction was removed right - * after we hit it. Another cpu has removed - * either a probepoint or a debugger breakpoint - * at this address. In either case, no further - * handling of this interrupt is appropriate. - * Back up over the (now missing) int3 and run - * the original instruction. - */ - regs->rip = (unsigned long)addr; - ret = 1; - } - /* Not one of ours: let kernel handle it */ - goto no_kprobe; - } - - set_current_kprobe(p, regs, kcb); - kcb->kprobe_status = KPROBE_HIT_ACTIVE; - - if (p->pre_handler && p->pre_handler(p, regs)) - /* handler has already set things up, so skip ss setup */ - return 1; - -ss_probe: - prepare_singlestep(p, regs); - kcb->kprobe_status = KPROBE_HIT_SS; - return 1; - -no_kprobe: - preempt_enable_no_resched(); - return ret; -} - -/* - * For function-return probes, init_kprobes() establishes a probepoint - * here. When a retprobed function returns, this probe is hit and - * trampoline_probe_handler() runs, calling the kretprobe's handler. - */ - void kretprobe_trampoline_holder(void) - { - asm volatile ( ".global kretprobe_trampoline\n" - "kretprobe_trampoline: \n" - "nop\n"); - } - -/* - * Called when we hit the probe point at kretprobe_trampoline - */ -int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) -{ - struct kretprobe_instance *ri = NULL; - struct hlist_head *head, empty_rp; - struct hlist_node *node, *tmp; - unsigned long flags, orig_ret_address = 0; - unsigned long trampoline_address =(unsigned long)&kretprobe_trampoline; - - INIT_HLIST_HEAD(&empty_rp); - spin_lock_irqsave(&kretprobe_lock, flags); - head = kretprobe_inst_table_head(current); - - /* - * It is possible to have multiple instances associated with a given - * task either because an multiple functions in the call path - * have a return probe installed on them, and/or more then one return - * return probe was registered for a target function. - * - * We can handle this because: - * - instances are always inserted at the head of the list - * - when multiple return probes are registered for the same - * function, the first instance's ret_addr will point to the - * real return address, and all the rest will point to - * kretprobe_trampoline - */ - hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { - if (ri->task != current) - /* another task is sharing our hash bucket */ - continue; - - if (ri->rp && ri->rp->handler) - ri->rp->handler(ri, regs); - - orig_ret_address = (unsigned long)ri->ret_addr; - recycle_rp_inst(ri, &empty_rp); - - if (orig_ret_address != trampoline_address) - /* - * This is the real return address. 
Any other - * instances associated with this task are for - * other calls deeper on the call stack - */ - break; - } - - kretprobe_assert(ri, orig_ret_address, trampoline_address); - regs->rip = orig_ret_address; - - reset_current_kprobe(); - spin_unlock_irqrestore(&kretprobe_lock, flags); - preempt_enable_no_resched(); - - hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { - hlist_del(&ri->hlist); - kfree(ri); - } - /* - * By returning a non-zero value, we are telling - * kprobe_handler() that we don't want the post_handler - * to run (and have re-enabled preemption) - */ - return 1; -} - -/* - * Called after single-stepping. p->addr is the address of the - * instruction whose first byte has been replaced by the "int 3" - * instruction. To avoid the SMP problems that can occur when we - * temporarily put back the original opcode to single-step, we - * single-stepped a copy of the instruction. The address of this - * copy is p->ainsn.insn. - * - * This function prepares to return from the post-single-step - * interrupt. We have to fix up the stack as follows: - * - * 0) Except in the case of absolute or indirect jump or call instructions, - * the new rip is relative to the copied instruction. We need to make - * it relative to the original instruction. - * - * 1) If the single-stepped instruction was pushfl, then the TF and IF - * flags are set in the just-pushed eflags, and may need to be cleared. - * - * 2) If the single-stepped instruction was a call, the return address - * that is atop the stack is the address following the copied instruction. - * We need to make it the address following the original instruction. - */ -static void __kprobes resume_execution(struct kprobe *p, - struct pt_regs *regs, struct kprobe_ctlblk *kcb) -{ - unsigned long *tos = (unsigned long *)regs->rsp; - unsigned long copy_rip = (unsigned long)p->ainsn.insn; - unsigned long orig_rip = (unsigned long)p->addr; - kprobe_opcode_t *insn = p->ainsn.insn; - - /*skip the REX prefix*/ - if (*insn >= 0x40 && *insn <= 0x4f) - insn++; - - regs->eflags &= ~TF_MASK; - switch (*insn) { - case 0x9c: /* pushfl */ - *tos &= ~(TF_MASK | IF_MASK); - *tos |= kcb->kprobe_old_rflags; - break; - case 0xc2: /* iret/ret/lret */ - case 0xc3: - case 0xca: - case 0xcb: - case 0xcf: - case 0xea: /* jmp absolute -- ip is correct */ - /* ip is already adjusted, no more changes required */ - goto no_change; - case 0xe8: /* call relative - Fix return addr */ - *tos = orig_rip + (*tos - copy_rip); - break; - case 0xff: - if ((insn[1] & 0x30) == 0x10) { - /* call absolute, indirect */ - /* Fix return addr; ip is correct. */ - *tos = orig_rip + (*tos - copy_rip); - goto no_change; - } else if (((insn[1] & 0x31) == 0x20) || /* jmp near, absolute indirect */ - ((insn[1] & 0x31) == 0x21)) { /* jmp far, absolute indirect */ - /* ip is correct. */ - goto no_change; - } - default: - break; - } - - regs->rip = orig_rip + (regs->rip - copy_rip); -no_change: - - return; -} - -int __kprobes post_kprobe_handler(struct pt_regs *regs) -{ - struct kprobe *cur = kprobe_running(); - struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - - if (!cur) - return 0; - - if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) { - kcb->kprobe_status = KPROBE_HIT_SSDONE; - cur->post_handler(cur, regs, 0); - } - - resume_execution(cur, regs, kcb); - regs->eflags |= kcb->kprobe_saved_rflags; - trace_hardirqs_fixup_flags(regs->eflags); - - /* Restore the original saved kprobes variables and continue. 
*/ - if (kcb->kprobe_status == KPROBE_REENTER) { - restore_previous_kprobe(kcb); - goto out; - } - reset_current_kprobe(); -out: - preempt_enable_no_resched(); - - /* - * if somebody else is singlestepping across a probe point, eflags - * will have TF set, in which case, continue the remaining processing - * of do_debug, as if this is not a probe hit. - */ - if (regs->eflags & TF_MASK) - return 0; - - return 1; -} - -int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr) -{ - struct kprobe *cur = kprobe_running(); - struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - const struct exception_table_entry *fixup; - - switch(kcb->kprobe_status) { - case KPROBE_HIT_SS: - case KPROBE_REENTER: - /* - * We are here because the instruction being single - * stepped caused a page fault. We reset the current - * kprobe and the rip points back to the probe address - * and allow the page fault handler to continue as a - * normal page fault. - */ - regs->rip = (unsigned long)cur->addr; - regs->eflags |= kcb->kprobe_old_rflags; - if (kcb->kprobe_status == KPROBE_REENTER) - restore_previous_kprobe(kcb); - else - reset_current_kprobe(); - preempt_enable_no_resched(); - break; - case KPROBE_HIT_ACTIVE: - case KPROBE_HIT_SSDONE: - /* - * We increment the nmissed count for accounting, - * we can also use npre/npostfault count for accouting - * these specific fault cases. - */ - kprobes_inc_nmissed_count(cur); - - /* - * We come here because instructions in the pre/post - * handler caused the page_fault, this could happen - * if handler tries to access user space by - * copy_from_user(), get_user() etc. Let the - * user-specified handler try to fix it first. - */ - if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr)) - return 1; - - /* - * In case the user-specified fault handler returned - * zero, try to fix up. - */ - fixup = search_exception_tables(regs->rip); - if (fixup) { - regs->rip = fixup->fixup; - return 1; - } - - /* - * fixup() could not handle it, - * Let do_page_fault() fix it. - */ - break; - default: - break; - } - return 0; -} - -/* - * Wrapper routine for handling exceptions. - */ -int __kprobes kprobe_exceptions_notify(struct notifier_block *self, - unsigned long val, void *data) -{ - struct die_args *args = (struct die_args *)data; - int ret = NOTIFY_DONE; - - if (args->regs && user_mode(args->regs)) - return ret; - - switch (val) { - case DIE_INT3: - if (kprobe_handler(args->regs)) - ret = NOTIFY_STOP; - break; - case DIE_DEBUG: - if (post_kprobe_handler(args->regs)) - ret = NOTIFY_STOP; - break; - case DIE_GPF: - /* kprobe_running() needs smp_processor_id() */ - preempt_disable(); - if (kprobe_running() && - kprobe_fault_handler(args->regs, args->trapnr)) - ret = NOTIFY_STOP; - preempt_enable(); - break; - default: - break; - } - return ret; -} - -int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) -{ - struct jprobe *jp = container_of(p, struct jprobe, kp); - unsigned long addr; - struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - - kcb->jprobe_saved_regs = *regs; - kcb->jprobe_saved_rsp = (long *) regs->rsp; - addr = (unsigned long)(kcb->jprobe_saved_rsp); - /* - * As Linus pointed out, gcc assumes that the callee - * owns the argument space and could overwrite it, e.g. - * tailcall optimization. So, to be absolutely safe - * we also save and restore enough stack bytes to cover - * the argument area. 
- */ - memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr, - MIN_STACK_SIZE(addr)); - regs->eflags &= ~IF_MASK; - trace_hardirqs_off(); - regs->rip = (unsigned long)(jp->entry); - return 1; -} - -void __kprobes jprobe_return(void) -{ - struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - - asm volatile (" xchg %%rbx,%%rsp \n" - " int3 \n" - " .globl jprobe_return_end \n" - " jprobe_return_end: \n" - " nop \n"::"b" - (kcb->jprobe_saved_rsp):"memory"); -} - -int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) -{ - struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - u8 *addr = (u8 *) (regs->rip - 1); - unsigned long stack_addr = (unsigned long)(kcb->jprobe_saved_rsp); - struct jprobe *jp = container_of(p, struct jprobe, kp); - - if ((addr > (u8 *) jprobe_return) && (addr < (u8 *) jprobe_return_end)) { - if ((unsigned long *)regs->rsp != kcb->jprobe_saved_rsp) { - struct pt_regs *saved_regs = &kcb->jprobe_saved_regs; - printk("current rsp %p does not match saved rsp %p\n", - (long *)regs->rsp, kcb->jprobe_saved_rsp); - printk("Saved registers for jprobe %p\n", jp); - show_registers(saved_regs); - printk("Current registers\n"); - show_registers(regs); - BUG(); - } - *regs = kcb->jprobe_saved_regs; - memcpy((kprobe_opcode_t *) stack_addr, kcb->jprobes_stack, - MIN_STACK_SIZE(stack_addr)); - preempt_enable_no_resched(); - return 1; - } - return 0; -} - -static struct kprobe trampoline_p = { - .addr = (kprobe_opcode_t *) &kretprobe_trampoline, - .pre_handler = trampoline_probe_handler -}; - -int __init arch_init_kprobes(void) -{ - return register_kprobe(&trampoline_p); -} - -int __kprobes arch_trampoline_kprobe(struct kprobe *p) -{ - if (p->addr == (kprobe_opcode_t *)&kretprobe_trampoline) - return 1; - - return 0; -} diff --git a/arch/x86/kernel/ldt_32.c b/arch/x86/kernel/ldt.c index 9ff90a27c45f..8a7660c8394a 100644 --- a/arch/x86/kernel/ldt_32.c +++ b/arch/x86/kernel/ldt.c @@ -1,6 +1,9 @@ /* * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com> + * Copyright (C) 2002 Andi Kleen + * + * This handles calls from both 32bit and 64bit mode. 
*/ #include <linux/errno.h> @@ -9,7 +12,6 @@ #include <linux/mm.h> #include <linux/smp.h> #include <linux/vmalloc.h> -#include <linux/slab.h> #include <asm/uaccess.h> #include <asm/system.h> @@ -17,7 +19,7 @@ #include <asm/desc.h> #include <asm/mmu_context.h> -#ifdef CONFIG_SMP /* avoids "defined but not used" warnig */ +#ifdef CONFIG_SMP static void flush_ldt(void *null) { if (current->active_mm) @@ -27,26 +29,31 @@ static void flush_ldt(void *null) static int alloc_ldt(mm_context_t *pc, int mincount, int reload) { - void *oldldt; - void *newldt; + void *oldldt, *newldt; int oldsize; if (mincount <= pc->size) return 0; oldsize = pc->size; - mincount = (mincount+511)&(~511); - if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE) - newldt = vmalloc(mincount*LDT_ENTRY_SIZE); + mincount = (mincount + 511) & (~511); + if (mincount * LDT_ENTRY_SIZE > PAGE_SIZE) + newldt = vmalloc(mincount * LDT_ENTRY_SIZE); else - newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL); + newldt = (void *)__get_free_page(GFP_KERNEL); if (!newldt) return -ENOMEM; if (oldsize) - memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE); + memcpy(newldt, pc->ldt, oldsize * LDT_ENTRY_SIZE); oldldt = pc->ldt; - memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE); + memset(newldt + oldsize * LDT_ENTRY_SIZE, 0, + (mincount - oldsize) * LDT_ENTRY_SIZE); + +#ifdef CONFIG_X86_64 + /* CHECKME: Do we really need this ? */ + wmb(); +#endif pc->ldt = newldt; wmb(); pc->size = mincount; @@ -55,6 +62,7 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload) if (reload) { #ifdef CONFIG_SMP cpumask_t mask; + preempt_disable(); load_LDT(pc); mask = cpumask_of_cpu(smp_processor_id()); @@ -66,10 +74,10 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload) #endif } if (oldsize) { - if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE) + if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE) vfree(oldldt); else - kfree(oldldt); + put_page(virt_to_page(oldldt)); } return 0; } @@ -77,9 +85,10 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload) static inline int copy_ldt(mm_context_t *new, mm_context_t *old) { int err = alloc_ldt(new, old->size, 0); + if (err < 0) return err; - memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE); + memcpy(new->ldt, old->ldt, old->size * LDT_ENTRY_SIZE); return 0; } @@ -89,7 +98,7 @@ static inline int copy_ldt(mm_context_t *new, mm_context_t *old) */ int init_new_context(struct task_struct *tsk, struct mm_struct *mm) { - struct mm_struct * old_mm; + struct mm_struct *old_mm; int retval = 0; mutex_init(&mm->context.lock); @@ -105,33 +114,38 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm) /* * No need to lock the MM as we are the last user + * + * 64bit: Don't touch the LDT register - we're already in the next thread. */ void destroy_context(struct mm_struct *mm) { if (mm->context.size) { +#ifdef CONFIG_X86_32 + /* CHECKME: Can this ever happen ? 
*/ if (mm == current->active_mm) clear_LDT(); - if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE) +#endif + if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE) vfree(mm->context.ldt); else - kfree(mm->context.ldt); + put_page(virt_to_page(mm->context.ldt)); mm->context.size = 0; } } -static int read_ldt(void __user * ptr, unsigned long bytecount) +static int read_ldt(void __user *ptr, unsigned long bytecount) { int err; unsigned long size; - struct mm_struct * mm = current->mm; + struct mm_struct *mm = current->mm; if (!mm->context.size) return 0; - if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES) - bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES; + if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES) + bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES; mutex_lock(&mm->context.lock); - size = mm->context.size*LDT_ENTRY_SIZE; + size = mm->context.size * LDT_ENTRY_SIZE; if (size > bytecount) size = bytecount; @@ -143,7 +157,7 @@ static int read_ldt(void __user * ptr, unsigned long bytecount) goto error_return; if (size != bytecount) { /* zero-fill the rest */ - if (clear_user(ptr+size, bytecount-size) != 0) { + if (clear_user(ptr + size, bytecount - size) != 0) { err = -EFAULT; goto error_return; } @@ -153,34 +167,32 @@ error_return: return err; } -static int read_default_ldt(void __user * ptr, unsigned long bytecount) +static int read_default_ldt(void __user *ptr, unsigned long bytecount) { - int err; - unsigned long size; - - err = 0; - size = 5*sizeof(struct desc_struct); - if (size > bytecount) - size = bytecount; - - err = size; - if (clear_user(ptr, size)) - err = -EFAULT; - - return err; + /* CHECKME: Can we use _one_ random number ? */ +#ifdef CONFIG_X86_32 + unsigned long size = 5 * sizeof(struct desc_struct); +#else + unsigned long size = 128; +#endif + if (bytecount > size) + bytecount = size; + if (clear_user(ptr, bytecount)) + return -EFAULT; + return bytecount; } -static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode) +static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) { - struct mm_struct * mm = current->mm; - __u32 entry_1, entry_2; + struct mm_struct *mm = current->mm; + struct desc_struct ldt; int error; struct user_desc ldt_info; error = -EINVAL; if (bytecount != sizeof(ldt_info)) goto out; - error = -EFAULT; + error = -EFAULT; if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info))) goto out; @@ -196,28 +208,27 @@ static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode) mutex_lock(&mm->context.lock); if (ldt_info.entry_number >= mm->context.size) { - error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1); + error = alloc_ldt(&current->mm->context, + ldt_info.entry_number + 1, 1); if (error < 0) goto out_unlock; } - /* Allow LDTs to be cleared by the user. */ - if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { + /* Allow LDTs to be cleared by the user. */ + if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { if (oldmode || LDT_empty(&ldt_info)) { - entry_1 = 0; - entry_2 = 0; + memset(&ldt, 0, sizeof(ldt)); goto install; } } - entry_1 = LDT_entry_a(&ldt_info); - entry_2 = LDT_entry_b(&ldt_info); + fill_ldt(&ldt, &ldt_info); if (oldmode) - entry_2 &= ~(1 << 20); + ldt.avl = 0; /* Install the new entry ...
*/ install: - write_ldt_entry(mm->context.ldt, ldt_info.entry_number, entry_1, entry_2); + write_ldt_entry(mm->context.ldt, ldt_info.entry_number, &ldt); error = 0; out_unlock: @@ -226,7 +237,8 @@ out: return error; } -asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount) +asmlinkage int sys_modify_ldt(int func, void __user *ptr, + unsigned long bytecount) { int ret = -ENOSYS; diff --git a/arch/x86/kernel/ldt_64.c b/arch/x86/kernel/ldt_64.c deleted file mode 100644 index 60e57abb8e90..000000000000 --- a/arch/x86/kernel/ldt_64.c +++ /dev/null @@ -1,250 +0,0 @@ -/* - * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds - * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com> - * Copyright (C) 2002 Andi Kleen - * - * This handles calls from both 32bit and 64bit mode. - */ - -#include <linux/errno.h> -#include <linux/sched.h> -#include <linux/string.h> -#include <linux/mm.h> -#include <linux/smp.h> -#include <linux/vmalloc.h> -#include <linux/slab.h> - -#include <asm/uaccess.h> -#include <asm/system.h> -#include <asm/ldt.h> -#include <asm/desc.h> -#include <asm/proto.h> - -#ifdef CONFIG_SMP /* avoids "defined but not used" warnig */ -static void flush_ldt(void *null) -{ - if (current->active_mm) - load_LDT(¤t->active_mm->context); -} -#endif - -static int alloc_ldt(mm_context_t *pc, unsigned mincount, int reload) -{ - void *oldldt; - void *newldt; - unsigned oldsize; - - if (mincount <= (unsigned)pc->size) - return 0; - oldsize = pc->size; - mincount = (mincount+511)&(~511); - if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE) - newldt = vmalloc(mincount*LDT_ENTRY_SIZE); - else - newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL); - - if (!newldt) - return -ENOMEM; - - if (oldsize) - memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE); - oldldt = pc->ldt; - memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE); - wmb(); - pc->ldt = newldt; - wmb(); - pc->size = mincount; - wmb(); - if (reload) { -#ifdef CONFIG_SMP - cpumask_t mask; - - preempt_disable(); - mask = cpumask_of_cpu(smp_processor_id()); - load_LDT(pc); - if (!cpus_equal(current->mm->cpu_vm_mask, mask)) - smp_call_function(flush_ldt, NULL, 1, 1); - preempt_enable(); -#else - load_LDT(pc); -#endif - } - if (oldsize) { - if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE) - vfree(oldldt); - else - kfree(oldldt); - } - return 0; -} - -static inline int copy_ldt(mm_context_t *new, mm_context_t *old) -{ - int err = alloc_ldt(new, old->size, 0); - if (err < 0) - return err; - memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE); - return 0; -} - -/* - * we do not have to muck with descriptors here, that is - * done in switch_mm() as needed. - */ -int init_new_context(struct task_struct *tsk, struct mm_struct *mm) -{ - struct mm_struct * old_mm; - int retval = 0; - - mutex_init(&mm->context.lock); - mm->context.size = 0; - old_mm = current->mm; - if (old_mm && old_mm->context.size > 0) { - mutex_lock(&old_mm->context.lock); - retval = copy_ldt(&mm->context, &old_mm->context); - mutex_unlock(&old_mm->context.lock); - } - return retval; -} - -/* - * - * Don't touch the LDT register - we're already in the next thread. 
- */ -void destroy_context(struct mm_struct *mm) -{ - if (mm->context.size) { - if ((unsigned)mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE) - vfree(mm->context.ldt); - else - kfree(mm->context.ldt); - mm->context.size = 0; - } -} - -static int read_ldt(void __user * ptr, unsigned long bytecount) -{ - int err; - unsigned long size; - struct mm_struct * mm = current->mm; - - if (!mm->context.size) - return 0; - if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES) - bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES; - - mutex_lock(&mm->context.lock); - size = mm->context.size*LDT_ENTRY_SIZE; - if (size > bytecount) - size = bytecount; - - err = 0; - if (copy_to_user(ptr, mm->context.ldt, size)) - err = -EFAULT; - mutex_unlock(&mm->context.lock); - if (err < 0) - goto error_return; - if (size != bytecount) { - /* zero-fill the rest */ - if (clear_user(ptr+size, bytecount-size) != 0) { - err = -EFAULT; - goto error_return; - } - } - return bytecount; -error_return: - return err; -} - -static int read_default_ldt(void __user * ptr, unsigned long bytecount) -{ - /* Arbitrary number */ - /* x86-64 default LDT is all zeros */ - if (bytecount > 128) - bytecount = 128; - if (clear_user(ptr, bytecount)) - return -EFAULT; - return bytecount; -} - -static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode) -{ - struct task_struct *me = current; - struct mm_struct * mm = me->mm; - __u32 entry_1, entry_2, *lp; - int error; - struct user_desc ldt_info; - - error = -EINVAL; - - if (bytecount != sizeof(ldt_info)) - goto out; - error = -EFAULT; - if (copy_from_user(&ldt_info, ptr, bytecount)) - goto out; - - error = -EINVAL; - if (ldt_info.entry_number >= LDT_ENTRIES) - goto out; - if (ldt_info.contents == 3) { - if (oldmode) - goto out; - if (ldt_info.seg_not_present == 0) - goto out; - } - - mutex_lock(&mm->context.lock); - if (ldt_info.entry_number >= (unsigned)mm->context.size) { - error = alloc_ldt(¤t->mm->context, ldt_info.entry_number+1, 1); - if (error < 0) - goto out_unlock; - } - - lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt); - - /* Allow LDTs to be cleared by the user. */ - if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { - if (oldmode || LDT_empty(&ldt_info)) { - entry_1 = 0; - entry_2 = 0; - goto install; - } - } - - entry_1 = LDT_entry_a(&ldt_info); - entry_2 = LDT_entry_b(&ldt_info); - if (oldmode) - entry_2 &= ~(1 << 20); - - /* Install the new entry ... 
*/ -install: - *lp = entry_1; - *(lp+1) = entry_2; - error = 0; - -out_unlock: - mutex_unlock(&mm->context.lock); -out: - return error; -} - -asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount) -{ - int ret = -ENOSYS; - - switch (func) { - case 0: - ret = read_ldt(ptr, bytecount); - break; - case 1: - ret = write_ldt(ptr, bytecount, 1); - break; - case 2: - ret = read_default_ldt(ptr, bytecount); - break; - case 0x11: - ret = write_ldt(ptr, bytecount, 0); - break; - } - return ret; -} diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 11b935f4f886..c1cfd60639d4 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -32,7 +32,7 @@ static u32 kexec_pte1[1024] PAGE_ALIGNED; static void set_idt(void *newidt, __u16 limit) { - struct Xgt_desc_struct curidt; + struct desc_ptr curidt; /* ia32 supports unaliged loads & stores */ curidt.size = limit; @@ -44,7 +44,7 @@ static void set_idt(void *newidt, __u16 limit) static void set_gdt(void *newgdt, __u16 limit) { - struct Xgt_desc_struct curgdt; + struct desc_ptr curgdt; /* ia32 supports unaligned loads & stores */ curgdt.size = limit; diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index aa3d2c8f7737..a1fef42f8cdb 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -234,10 +234,5 @@ NORET_TYPE void machine_kexec(struct kimage *image) void arch_crash_save_vmcoreinfo(void) { VMCOREINFO_SYMBOL(init_level4_pgt); - -#ifdef CONFIG_ARCH_DISCONTIGMEM_ENABLE - VMCOREINFO_SYMBOL(node_data); - VMCOREINFO_LENGTH(node_data, MAX_NUMNODES); -#endif } diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c index 3960ab7e1497..219f86eb6123 100644 --- a/arch/x86/kernel/mfgpt_32.c +++ b/arch/x86/kernel/mfgpt_32.c @@ -63,6 +63,21 @@ static int __init mfgpt_disable(char *s) } __setup("nomfgpt", mfgpt_disable); +/* Reset the MFGPT timers. This is required by some broken BIOSes which already + * do the same and leave the system in an unstable state. TinyBIOS 0.98 is + * affected at least (0.99 is OK with MFGPT workaround left to off). + */ +static int __init mfgpt_fix(char *s) +{ + u32 val, dummy; + + /* The following udocumented bit resets the MFGPT timers */ + val = 0xFF; dummy = 0; + wrmsr(0x5140002B, val, dummy); + return 1; +} +__setup("mfgptfix", mfgpt_fix); + /* * Check whether any MFGPTs are available for the kernel to use. 
In most * cases, firmware that uses AMD's VSA code will claim all timers during diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c index 09c315214a5e..6ff447f9fda7 100644 --- a/arch/x86/kernel/microcode.c +++ b/arch/x86/kernel/microcode.c @@ -244,8 +244,8 @@ static int microcode_sanity_check(void *mc) return 0; /* check extended signature checksum */ for (i = 0; i < ext_sigcount; i++) { - ext_sig = (struct extended_signature *)((void *)ext_header - + EXT_HEADER_SIZE + EXT_SIGNATURE_SIZE * i); + ext_sig = (void *)ext_header + EXT_HEADER_SIZE + + EXT_SIGNATURE_SIZE * i; sum = orig_sum - (mc_header->sig + mc_header->pf + mc_header->cksum) + (ext_sig->sig + ext_sig->pf + ext_sig->cksum); @@ -279,11 +279,9 @@ static int get_maching_microcode(void *mc, int cpu) if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE) return 0; - ext_header = (struct extended_sigtable *)(mc + - get_datasize(mc_header) + MC_HEADER_SIZE); + ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE; ext_sigcount = ext_header->count; - ext_sig = (struct extended_signature *)((void *)ext_header - + EXT_HEADER_SIZE); + ext_sig = (void *)ext_header + EXT_HEADER_SIZE; for (i = 0; i < ext_sigcount; i++) { if (microcode_update_match(cpu, mc_header, ext_sig->sig, ext_sig->pf)) @@ -436,7 +434,7 @@ static ssize_t microcode_write (struct file *file, const char __user *buf, size_ return -EINVAL; } - lock_cpu_hotplug(); + get_online_cpus(); mutex_lock(&microcode_mutex); user_buffer = (void __user *) buf; @@ -447,7 +445,7 @@ static ssize_t microcode_write (struct file *file, const char __user *buf, size_ ret = (ssize_t)len; mutex_unlock(&microcode_mutex); - unlock_cpu_hotplug(); + put_online_cpus(); return ret; } @@ -539,7 +537,7 @@ static int cpu_request_microcode(int cpu) pr_debug("ucode data file %s load failed\n", name); return error; } - buf = (void *)firmware->data; + buf = firmware->data; size = firmware->size; while ((offset = get_next_ucode_from_buffer(&mc, buf, size, offset)) > 0) { @@ -658,14 +656,14 @@ static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t sz) old = current->cpus_allowed; - lock_cpu_hotplug(); + get_online_cpus(); set_cpus_allowed(current, cpumask_of_cpu(cpu)); mutex_lock(&microcode_mutex); if (uci->valid) err = cpu_request_microcode(cpu); mutex_unlock(&microcode_mutex); - unlock_cpu_hotplug(); + put_online_cpus(); set_cpus_allowed(current, old); } if (err) @@ -817,9 +815,9 @@ static int __init microcode_init (void) return PTR_ERR(microcode_pdev); } - lock_cpu_hotplug(); + get_online_cpus(); error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver); - unlock_cpu_hotplug(); + put_online_cpus(); if (error) { microcode_dev_exit(); platform_device_unregister(microcode_pdev); @@ -839,9 +837,9 @@ static void __exit microcode_exit (void) unregister_hotcpu_notifier(&mc_cpu_notifier); - lock_cpu_hotplug(); + get_online_cpus(); sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver); - unlock_cpu_hotplug(); + put_online_cpus(); platform_device_unregister(microcode_pdev); } diff --git a/arch/x86/kernel/mpparse_32.c b/arch/x86/kernel/mpparse_32.c index 7a05a7f6099a..67009cdd5eca 100644 --- a/arch/x86/kernel/mpparse_32.c +++ b/arch/x86/kernel/mpparse_32.c @@ -68,7 +68,7 @@ unsigned int def_to_bigsmp = 0; /* Processor that is doing the boot up */ unsigned int boot_cpu_physical_apicid = -1U; /* Internal processor count */ -unsigned int __cpuinitdata num_processors; +unsigned int num_processors; /* Bitmask of physically existing CPUs */ physid_mask_t phys_cpu_present_map; @@
-258,7 +258,7 @@ static void __init MP_ioapic_info (struct mpc_config_ioapic *m) if (!(m->mpc_flags & MPC_APIC_USABLE)) return; - printk(KERN_INFO "I/O APIC #%d Version %d at 0x%lX.\n", + printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n", m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr); if (nr_ioapics >= MAX_IO_APICS) { printk(KERN_CRIT "Max # of I/O APICs (%d) exceeded (found %d).\n", @@ -405,9 +405,9 @@ static int __init smp_read_mpc(struct mp_config_table *mpc) mps_oem_check(mpc, oem, str); - printk("APIC at: 0x%lX\n",mpc->mpc_lapic); + printk("APIC at: 0x%X\n", mpc->mpc_lapic); - /* + /* * Save the local APIC address (it might be non-default) -- but only * if we're not using ACPI. */ @@ -721,7 +721,7 @@ static int __init smp_scan_config (unsigned long base, unsigned long length) unsigned long *bp = phys_to_virt(base); struct intel_mp_floating *mpf; - Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length); + printk(KERN_INFO "Scan SMP from %p for %ld bytes.\n", bp,length); if (sizeof(*mpf) != 16) printk("Error: MPF size\n"); @@ -734,8 +734,8 @@ static int __init smp_scan_config (unsigned long base, unsigned long length) || (mpf->mpf_specification == 4)) ) { smp_found_config = 1; - printk(KERN_INFO "found SMP MP-table at %08lx\n", - virt_to_phys(mpf)); + printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n", + mpf, virt_to_phys(mpf)); reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE); if (mpf->mpf_physptr) { /* @@ -918,14 +918,14 @@ void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base) */ mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid; mp_ioapic_routing[idx].gsi_base = gsi_base; - mp_ioapic_routing[idx].gsi_end = gsi_base + + mp_ioapic_routing[idx].gsi_end = gsi_base + io_apic_get_redir_entries(idx); - printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, " - "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, - mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr, - mp_ioapic_routing[idx].gsi_base, - mp_ioapic_routing[idx].gsi_end); + printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " + "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, + mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr, + mp_ioapic_routing[idx].gsi_base, + mp_ioapic_routing[idx].gsi_end); } void __init @@ -1041,15 +1041,16 @@ void __init mp_config_acpi_legacy_irqs (void) } #define MAX_GSI_NUM 4096 +#define IRQ_COMPRESSION_START 64 int mp_register_gsi(u32 gsi, int triggering, int polarity) { int ioapic = -1; int ioapic_pin = 0; int idx, bit = 0; - static int pci_irq = 16; + static int pci_irq = IRQ_COMPRESSION_START; /* - * Mapping between Global System Interrups, which + * Mapping between Global System Interrupts, which * represent all possible interrupts, and IRQs * assigned to actual devices. */ @@ -1086,12 +1087,16 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity) if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) { Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n", mp_ioapic_routing[ioapic].apic_id, ioapic_pin); - return gsi_to_irq[gsi]; + return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]); } mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit); - if (triggering == ACPI_LEVEL_SENSITIVE) { + /* + * For GSI >= 64, use IRQ compression + */ + if ((gsi >= IRQ_COMPRESSION_START) + && (triggering == ACPI_LEVEL_SENSITIVE)) { /* * For PCI devices assign IRQs in order, avoiding gaps * due to unused I/O APIC pins. 
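The mp_register_gsi() change above introduces IRQ compression: GSIs below IRQ_COMPRESSION_START (64) keep their identity mapping, while level-triggered GSIs at or above 64 are packed into consecutive IRQ numbers through gsi_to_irq[], so sparse I/O APIC pins no longer exhaust the IRQ space. A minimal userspace sketch of that mapping policy follows; the function name example_map_gsi and the next_irq counter are illustrative only and not part of the patch.

#include <stdio.h>

#define MAX_GSI_NUM		4096
#define IRQ_COMPRESSION_START	64

static unsigned int gsi_to_irq[MAX_GSI_NUM];	/* 0 means "not mapped yet" */

/* Map a GSI to an IRQ the way the compression policy above behaves. */
static unsigned int example_map_gsi(unsigned int gsi, int level_triggered)
{
	static unsigned int next_irq = IRQ_COMPRESSION_START;

	if (gsi < IRQ_COMPRESSION_START || !level_triggered)
		return gsi;			/* identity mapping */
	if (gsi_to_irq[gsi])
		return gsi_to_irq[gsi];		/* already compressed */
	gsi_to_irq[gsi] = next_irq;		/* pack into the next free IRQ */
	return next_irq++;
}

int main(void)
{
	printf("GSI 9   -> IRQ %u\n", example_map_gsi(9, 1));	/* 9, identity */
	printf("GSI 80  -> IRQ %u\n", example_map_gsi(80, 1));	/* 64, compressed */
	printf("GSI 120 -> IRQ %u\n", example_map_gsi(120, 1));	/* 65, compressed */
	printf("GSI 200 -> IRQ %u\n", example_map_gsi(200, 0));	/* 200, edge-triggered */
	return 0;
}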
diff --git a/arch/x86/kernel/mpparse_64.c b/arch/x86/kernel/mpparse_64.c index ef4aab123581..72ab1403fed7 100644 --- a/arch/x86/kernel/mpparse_64.c +++ b/arch/x86/kernel/mpparse_64.c @@ -60,14 +60,18 @@ unsigned int boot_cpu_id = -1U; EXPORT_SYMBOL(boot_cpu_id); /* Internal processor count */ -unsigned int num_processors __cpuinitdata = 0; +unsigned int num_processors; unsigned disabled_cpus __cpuinitdata; /* Bitmask of physically existing CPUs */ physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE; -u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; +u16 x86_bios_cpu_apicid_init[NR_CPUS] __initdata + = { [0 ... NR_CPUS-1] = BAD_APICID }; +void *x86_bios_cpu_apicid_early_ptr; +DEFINE_PER_CPU(u16, x86_bios_cpu_apicid) = BAD_APICID; +EXPORT_PER_CPU_SYMBOL(x86_bios_cpu_apicid); /* @@ -118,24 +122,22 @@ static void __cpuinit MP_processor_info(struct mpc_config_processor *m) physid_set(m->mpc_apicid, phys_cpu_present_map); if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { /* - * bios_cpu_apicid is required to have processors listed + * x86_bios_cpu_apicid is required to have processors listed * in same order as logical cpu numbers. Hence the first * entry is BSP, and so on. */ cpu = 0; } - bios_cpu_apicid[cpu] = m->mpc_apicid; - /* - * We get called early in the the start_kernel initialization - * process when the per_cpu data area is not yet setup, so we - * use a static array that is removed after the per_cpu data - * area is created. - */ - if (x86_cpu_to_apicid_ptr) { - u8 *x86_cpu_to_apicid = (u8 *)x86_cpu_to_apicid_ptr; - x86_cpu_to_apicid[cpu] = m->mpc_apicid; + /* are we being called early in kernel startup? */ + if (x86_cpu_to_apicid_early_ptr) { + u16 *cpu_to_apicid = x86_cpu_to_apicid_early_ptr; + u16 *bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr; + + cpu_to_apicid[cpu] = m->mpc_apicid; + bios_cpu_apicid[cpu] = m->mpc_apicid; } else { per_cpu(x86_cpu_to_apicid, cpu) = m->mpc_apicid; + per_cpu(x86_bios_cpu_apicid, cpu) = m->mpc_apicid; } cpu_set(cpu, cpu_possible_map); diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index ee6eba4ecfea..21f6e3c0be18 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -155,15 +155,15 @@ static int __cpuinit msr_class_cpu_callback(struct notifier_block *nfb, switch (action) { case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: err = msr_device_create(cpu); break; case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: case CPU_DEAD: - case CPU_DEAD_FROZEN: msr_device_destroy(cpu); break; + case CPU_UP_CANCELED_FROZEN: + destroy_suspended_device(msr_class, MKDEV(MSR_MAJOR, cpu)); + break; } return err ? NOTIFY_BAD : NOTIFY_OK; } diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c index 852db2906921..edd413650b3b 100644 --- a/arch/x86/kernel/nmi_32.c +++ b/arch/x86/kernel/nmi_32.c @@ -51,13 +51,13 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu); static int endflag __initdata = 0; +#ifdef CONFIG_SMP /* The performance counters used by NMI_LOCAL_APIC don't trigger when * the CPU is idle. To make sure the NMI watchdog really ticks on all * CPUs during the test make them busy. */ static __init void nmi_cpu_busy(void *data) { -#ifdef CONFIG_SMP local_irq_enable_in_hardirq(); /* Intentionally don't use cpu_relax here. This is to make sure that the performance counter really ticks, @@ -67,8 +67,8 @@ static __init void nmi_cpu_busy(void *data) care if they get somewhat less cycles. 
*/ while (endflag == 0) mb(); -#endif } +#endif static int __init check_nmi_watchdog(void) { @@ -87,11 +87,13 @@ static int __init check_nmi_watchdog(void) printk(KERN_INFO "Testing NMI watchdog ... "); +#ifdef CONFIG_SMP if (nmi_watchdog == NMI_LOCAL_APIC) smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0); +#endif for_each_possible_cpu(cpu) - prev_nmi_count[cpu] = per_cpu(irq_stat, cpu).__nmi_count; + prev_nmi_count[cpu] = nmi_count(cpu); local_irq_enable(); mdelay((20*1000)/nmi_hz); // wait 20 ticks @@ -176,7 +178,7 @@ static int lapic_nmi_resume(struct sys_device *dev) static struct sysdev_class nmi_sysclass = { - set_kset_name("lapic_nmi"), + .name = "lapic_nmi", .resume = lapic_nmi_resume, .suspend = lapic_nmi_suspend, }; @@ -237,10 +239,10 @@ void acpi_nmi_disable(void) on_each_cpu(__acpi_nmi_disable, NULL, 0, 1); } -void setup_apic_nmi_watchdog (void *unused) +void setup_apic_nmi_watchdog(void *unused) { if (__get_cpu_var(wd_enabled)) - return; + return; /* cheap hack to support suspend/resume */ /* if cpu0 is not active neither should the other cpus */ @@ -329,7 +331,7 @@ __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) unsigned int sum; int touched = 0; int cpu = smp_processor_id(); - int rc=0; + int rc = 0; /* check for other users first */ if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) diff --git a/arch/x86/kernel/nmi_64.c b/arch/x86/kernel/nmi_64.c index 4253c4e8849c..fb99484d21cf 100644 --- a/arch/x86/kernel/nmi_64.c +++ b/arch/x86/kernel/nmi_64.c @@ -39,7 +39,7 @@ static cpumask_t backtrace_mask = CPU_MASK_NONE; * 0: the lapic NMI watchdog is disabled, but can be enabled */ atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ -int panic_on_timeout; +static int panic_on_timeout; unsigned int nmi_watchdog = NMI_DEFAULT; static unsigned int nmi_hz = HZ; @@ -78,22 +78,22 @@ static __init void nmi_cpu_busy(void *data) } #endif -int __init check_nmi_watchdog (void) +int __init check_nmi_watchdog(void) { - int *counts; + int *prev_nmi_count; int cpu; - if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DISABLED)) + if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DISABLED)) return 0; if (!atomic_read(&nmi_active)) return 0; - counts = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL); - if (!counts) + prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL); + if (!prev_nmi_count) return -1; - printk(KERN_INFO "testing NMI watchdog ... "); + printk(KERN_INFO "Testing NMI watchdog ... 
"); #ifdef CONFIG_SMP if (nmi_watchdog == NMI_LOCAL_APIC) @@ -101,30 +101,29 @@ int __init check_nmi_watchdog (void) #endif for (cpu = 0; cpu < NR_CPUS; cpu++) - counts[cpu] = cpu_pda(cpu)->__nmi_count; + prev_nmi_count[cpu] = cpu_pda(cpu)->__nmi_count; local_irq_enable(); mdelay((20*1000)/nmi_hz); // wait 20 ticks for_each_online_cpu(cpu) { if (!per_cpu(wd_enabled, cpu)) continue; - if (cpu_pda(cpu)->__nmi_count - counts[cpu] <= 5) { + if (cpu_pda(cpu)->__nmi_count - prev_nmi_count[cpu] <= 5) { printk(KERN_WARNING "WARNING: CPU#%d: NMI " "appears to be stuck (%d->%d)!\n", - cpu, - counts[cpu], - cpu_pda(cpu)->__nmi_count); + cpu, + prev_nmi_count[cpu], + cpu_pda(cpu)->__nmi_count); per_cpu(wd_enabled, cpu) = 0; atomic_dec(&nmi_active); } } + endflag = 1; if (!atomic_read(&nmi_active)) { - kfree(counts); + kfree(prev_nmi_count); atomic_set(&nmi_active, -1); - endflag = 1; return -1; } - endflag = 1; printk("OK.\n"); /* now that we know it works we can reduce NMI frequency to @@ -132,11 +131,11 @@ int __init check_nmi_watchdog (void) if (nmi_watchdog == NMI_LOCAL_APIC) nmi_hz = lapic_adjust_nmi_hz(1); - kfree(counts); + kfree(prev_nmi_count); return 0; } -int __init setup_nmi_watchdog(char *str) +static int __init setup_nmi_watchdog(char *str) { int nmi; @@ -159,34 +158,6 @@ int __init setup_nmi_watchdog(char *str) __setup("nmi_watchdog=", setup_nmi_watchdog); - -static void __acpi_nmi_disable(void *__unused) -{ - apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); -} - -/* - * Disable timer based NMIs on all CPUs: - */ -void acpi_nmi_disable(void) -{ - if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) - on_each_cpu(__acpi_nmi_disable, NULL, 0, 1); -} - -static void __acpi_nmi_enable(void *__unused) -{ - apic_write(APIC_LVT0, APIC_DM_NMI); -} - -/* - * Enable timer based NMIs on all CPUs: - */ -void acpi_nmi_enable(void) -{ - if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) - on_each_cpu(__acpi_nmi_enable, NULL, 0, 1); -} #ifdef CONFIG_PM static int nmi_pm_active; /* nmi_active before suspend */ @@ -211,13 +182,13 @@ static int lapic_nmi_resume(struct sys_device *dev) } static struct sysdev_class nmi_sysclass = { - set_kset_name("lapic_nmi"), + .name = "lapic_nmi", .resume = lapic_nmi_resume, .suspend = lapic_nmi_suspend, }; static struct sys_device device_lapic_nmi = { - .id = 0, + .id = 0, .cls = &nmi_sysclass, }; @@ -231,7 +202,7 @@ static int __init init_lapic_nmi_sysfs(void) if (nmi_watchdog != NMI_LOCAL_APIC) return 0; - if ( atomic_read(&nmi_active) < 0 ) + if (atomic_read(&nmi_active) < 0) return 0; error = sysdev_class_register(&nmi_sysclass); @@ -244,9 +215,37 @@ late_initcall(init_lapic_nmi_sysfs); #endif /* CONFIG_PM */ +static void __acpi_nmi_enable(void *__unused) +{ + apic_write(APIC_LVT0, APIC_DM_NMI); +} + +/* + * Enable timer based NMIs on all CPUs: + */ +void acpi_nmi_enable(void) +{ + if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) + on_each_cpu(__acpi_nmi_enable, NULL, 0, 1); +} + +static void __acpi_nmi_disable(void *__unused) +{ + apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); +} + +/* + * Disable timer based NMIs on all CPUs: + */ +void acpi_nmi_disable(void) +{ + if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) + on_each_cpu(__acpi_nmi_disable, NULL, 0, 1); +} + void setup_apic_nmi_watchdog(void *unused) { - if (__get_cpu_var(wd_enabled) == 1) + if (__get_cpu_var(wd_enabled)) return; /* cheap hack to support suspend/resume */ @@ -311,8 +310,9 @@ void touch_nmi_watchdog(void) } } - touch_softlockup_watchdog(); 
+ touch_softlockup_watchdog(); } +EXPORT_SYMBOL(touch_nmi_watchdog); int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) { @@ -479,4 +479,3 @@ void __trigger_all_cpu_backtrace(void) EXPORT_SYMBOL(nmi_active); EXPORT_SYMBOL(nmi_watchdog); -EXPORT_SYMBOL(touch_nmi_watchdog); diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c index 9000d82c6dc0..e65281b1634b 100644 --- a/arch/x86/kernel/numaq_32.c +++ b/arch/x86/kernel/numaq_32.c @@ -82,7 +82,7 @@ static int __init numaq_tsc_disable(void) { if (num_online_nodes() > 1) { printk(KERN_DEBUG "NUMAQ: disabling TSC\n"); - tsc_disable = 1; + setup_clear_cpu_cap(X86_FEATURE_TSC); } return 0; } diff --git a/arch/x86/kernel/paravirt_32.c b/arch/x86/kernel/paravirt.c index f5000799f8ef..075962cc75ab 100644 --- a/arch/x86/kernel/paravirt_32.c +++ b/arch/x86/kernel/paravirt.c @@ -14,7 +14,10 @@ You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + + 2007 - x86_64 support added by Glauber de Oliveira Costa, Red Hat Inc */ + #include <linux/errno.h> #include <linux/module.h> #include <linux/efi.h> @@ -55,59 +58,9 @@ char *memory_setup(void) extern const char start_##ops##_##name[], end_##ops##_##name[]; \ asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":") -DEF_NATIVE(pv_irq_ops, irq_disable, "cli"); -DEF_NATIVE(pv_irq_ops, irq_enable, "sti"); -DEF_NATIVE(pv_irq_ops, restore_fl, "push %eax; popf"); -DEF_NATIVE(pv_irq_ops, save_fl, "pushf; pop %eax"); -DEF_NATIVE(pv_cpu_ops, iret, "iret"); -DEF_NATIVE(pv_cpu_ops, irq_enable_sysexit, "sti; sysexit"); -DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax"); -DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3"); -DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax"); -DEF_NATIVE(pv_cpu_ops, clts, "clts"); -DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc"); - /* Undefined instruction for dealing with missing ops pointers. 
*/ static const unsigned char ud2a[] = { 0x0f, 0x0b }; -static unsigned native_patch(u8 type, u16 clobbers, void *ibuf, - unsigned long addr, unsigned len) -{ - const unsigned char *start, *end; - unsigned ret; - - switch(type) { -#define SITE(ops, x) \ - case PARAVIRT_PATCH(ops.x): \ - start = start_##ops##_##x; \ - end = end_##ops##_##x; \ - goto patch_site - - SITE(pv_irq_ops, irq_disable); - SITE(pv_irq_ops, irq_enable); - SITE(pv_irq_ops, restore_fl); - SITE(pv_irq_ops, save_fl); - SITE(pv_cpu_ops, iret); - SITE(pv_cpu_ops, irq_enable_sysexit); - SITE(pv_mmu_ops, read_cr2); - SITE(pv_mmu_ops, read_cr3); - SITE(pv_mmu_ops, write_cr3); - SITE(pv_cpu_ops, clts); - SITE(pv_cpu_ops, read_tsc); -#undef SITE - - patch_site: - ret = paravirt_patch_insns(ibuf, len, start, end); - break; - - default: - ret = paravirt_patch_default(type, clobbers, ibuf, addr, len); - break; - } - - return ret; -} - unsigned paravirt_patch_nop(void) { return 0; @@ -186,7 +139,7 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf, /* If the operation is a nop, then nop the callsite */ ret = paravirt_patch_nop(); else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) || - type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit)) + type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_syscall_ret)) /* If operation requires a jmp, then jmp */ ret = paravirt_patch_jmp(insnbuf, opfunc, addr, len); else @@ -237,7 +190,7 @@ static void native_flush_tlb_single(unsigned long addr) /* These are in entry.S */ extern void native_iret(void); -extern void native_irq_enable_sysexit(void); +extern void native_irq_enable_syscall_ret(void); static int __init print_banner(void) { @@ -285,18 +238,18 @@ static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LA static inline void enter_lazy(enum paravirt_lazy_mode mode) { - BUG_ON(x86_read_percpu(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE); + BUG_ON(__get_cpu_var(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE); BUG_ON(preemptible()); - x86_write_percpu(paravirt_lazy_mode, mode); + __get_cpu_var(paravirt_lazy_mode) = mode; } void paravirt_leave_lazy(enum paravirt_lazy_mode mode) { - BUG_ON(x86_read_percpu(paravirt_lazy_mode) != mode); + BUG_ON(__get_cpu_var(paravirt_lazy_mode) != mode); BUG_ON(preemptible()); - x86_write_percpu(paravirt_lazy_mode, PARAVIRT_LAZY_NONE); + __get_cpu_var(paravirt_lazy_mode) = PARAVIRT_LAZY_NONE; } void paravirt_enter_lazy_mmu(void) @@ -321,7 +274,7 @@ void paravirt_leave_lazy_cpu(void) enum paravirt_lazy_mode paravirt_get_lazy_mode(void) { - return x86_read_percpu(paravirt_lazy_mode); + return __get_cpu_var(paravirt_lazy_mode); } struct pv_info pv_info = { @@ -366,11 +319,16 @@ struct pv_cpu_ops pv_cpu_ops = { .read_cr4 = native_read_cr4, .read_cr4_safe = native_read_cr4_safe, .write_cr4 = native_write_cr4, +#ifdef CONFIG_X86_64 + .read_cr8 = native_read_cr8, + .write_cr8 = native_write_cr8, +#endif .wbinvd = native_wbinvd, .read_msr = native_read_msr_safe, .write_msr = native_write_msr_safe, .read_tsc = native_read_tsc, .read_pmc = native_read_pmc, + .read_tscp = native_read_tscp, .load_tr_desc = native_load_tr_desc, .set_ldt = native_set_ldt, .load_gdt = native_load_gdt, @@ -379,13 +337,14 @@ struct pv_cpu_ops pv_cpu_ops = { .store_idt = native_store_idt, .store_tr = native_store_tr, .load_tls = native_load_tls, - .write_ldt_entry = write_dt_entry, - .write_gdt_entry = write_dt_entry, - .write_idt_entry = write_dt_entry, - .load_esp0 = native_load_esp0, + .write_ldt_entry = native_write_ldt_entry, + .write_gdt_entry = 
native_write_gdt_entry, + .write_idt_entry = native_write_idt_entry, + .load_sp0 = native_load_sp0, - .irq_enable_sysexit = native_irq_enable_sysexit, + .irq_enable_syscall_ret = native_irq_enable_syscall_ret, .iret = native_iret, + .swapgs = native_swapgs, .set_iopl_mask = native_set_iopl_mask, .io_delay = native_io_delay, @@ -408,8 +367,10 @@ struct pv_apic_ops pv_apic_ops = { }; struct pv_mmu_ops pv_mmu_ops = { +#ifndef CONFIG_X86_64 .pagetable_setup_start = native_pagetable_setup_start, .pagetable_setup_done = native_pagetable_setup_done, +#endif .read_cr2 = native_read_cr2, .write_cr2 = native_write_cr2, @@ -437,16 +398,23 @@ struct pv_mmu_ops pv_mmu_ops = { .kmap_atomic_pte = kmap_atomic, #endif +#if PAGETABLE_LEVELS >= 3 #ifdef CONFIG_X86_PAE .set_pte_atomic = native_set_pte_atomic, .set_pte_present = native_set_pte_present, - .set_pud = native_set_pud, .pte_clear = native_pte_clear, .pmd_clear = native_pmd_clear, - +#endif + .set_pud = native_set_pud, .pmd_val = native_pmd_val, .make_pmd = native_make_pmd, + +#if PAGETABLE_LEVELS == 4 + .pud_val = native_pud_val, + .make_pud = native_make_pud, + .set_pgd = native_set_pgd, #endif +#endif /* PAGETABLE_LEVELS >= 3 */ .pte_val = native_pte_val, .pgd_val = native_pgd_val, diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c new file mode 100644 index 000000000000..82fc5fcab4f4 --- /dev/null +++ b/arch/x86/kernel/paravirt_patch_32.c @@ -0,0 +1,49 @@ +#include <asm/paravirt.h> + +DEF_NATIVE(pv_irq_ops, irq_disable, "cli"); +DEF_NATIVE(pv_irq_ops, irq_enable, "sti"); +DEF_NATIVE(pv_irq_ops, restore_fl, "push %eax; popf"); +DEF_NATIVE(pv_irq_ops, save_fl, "pushf; pop %eax"); +DEF_NATIVE(pv_cpu_ops, iret, "iret"); +DEF_NATIVE(pv_cpu_ops, irq_enable_syscall_ret, "sti; sysexit"); +DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax"); +DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3"); +DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax"); +DEF_NATIVE(pv_cpu_ops, clts, "clts"); +DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc"); + +unsigned native_patch(u8 type, u16 clobbers, void *ibuf, + unsigned long addr, unsigned len) +{ + const unsigned char *start, *end; + unsigned ret; + +#define PATCH_SITE(ops, x) \ + case PARAVIRT_PATCH(ops.x): \ + start = start_##ops##_##x; \ + end = end_##ops##_##x; \ + goto patch_site + switch(type) { + PATCH_SITE(pv_irq_ops, irq_disable); + PATCH_SITE(pv_irq_ops, irq_enable); + PATCH_SITE(pv_irq_ops, restore_fl); + PATCH_SITE(pv_irq_ops, save_fl); + PATCH_SITE(pv_cpu_ops, iret); + PATCH_SITE(pv_cpu_ops, irq_enable_syscall_ret); + PATCH_SITE(pv_mmu_ops, read_cr2); + PATCH_SITE(pv_mmu_ops, read_cr3); + PATCH_SITE(pv_mmu_ops, write_cr3); + PATCH_SITE(pv_cpu_ops, clts); + PATCH_SITE(pv_cpu_ops, read_tsc); + + patch_site: + ret = paravirt_patch_insns(ibuf, len, start, end); + break; + + default: + ret = paravirt_patch_default(type, clobbers, ibuf, addr, len); + break; + } +#undef PATCH_SITE + return ret; +} diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c new file mode 100644 index 000000000000..7d904e138d7e --- /dev/null +++ b/arch/x86/kernel/paravirt_patch_64.c @@ -0,0 +1,57 @@ +#include <asm/paravirt.h> +#include <asm/asm-offsets.h> +#include <linux/stringify.h> + +DEF_NATIVE(pv_irq_ops, irq_disable, "cli"); +DEF_NATIVE(pv_irq_ops, irq_enable, "sti"); +DEF_NATIVE(pv_irq_ops, restore_fl, "pushq %rdi; popfq"); +DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax"); +DEF_NATIVE(pv_cpu_ops, iret, "iretq"); +DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax"); 
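The two new paravirt_patch_32.c / paravirt_patch_64.c files share one pattern: DEF_NATIVE() captures the raw bytes of a native instruction sequence between start_*/end_* labels, and native_patch() dispatches on the patch type and, for the listed ops, copies those bytes over the call site via paravirt_patch_insns(), falling back to paravirt_patch_default() for everything else. A rough standalone model of the idea (simplified; patch_native_insns() is an invented name, not the kernel's helper):

#include <string.h>

/*
 * Toy model, not kernel code: this is roughly what
 * DEF_NATIVE(pv_irq_ops, irq_disable, "cli") expands to, i.e. the bytes
 * of "cli" bracketed by two labels whose addresses we can take.
 */
extern const char start_pv_irq_ops_irq_disable[];
extern const char end_pv_irq_ops_irq_disable[];
asm("start_pv_irq_ops_irq_disable: cli; end_pv_irq_ops_irq_disable:");

/*
 * Patch one call site: overwrite it with the captured native bytes when
 * they fit, otherwise keep the indirect call.  (Invented helper; the real
 * native_patch() first dispatches on the PARAVIRT_PATCH() type.)
 */
static unsigned int patch_native_insns(void *site, unsigned int site_len,
				       const char *start, const char *end)
{
	unsigned int insn_len = end - start;

	if (insn_len > site_len)
		return 0;		/* caller leaves the indirect call alone */
	memcpy(site, start, insn_len);
	return insn_len;		/* caller pads the rest with NOPs */
}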
+DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax"); +DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3"); +DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)"); +DEF_NATIVE(pv_cpu_ops, clts, "clts"); +DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd"); + +/* the three commands give us more control to how to return from a syscall */ +DEF_NATIVE(pv_cpu_ops, irq_enable_syscall_ret, "movq %gs:" __stringify(pda_oldrsp) ", %rsp; swapgs; sysretq;"); +DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs"); + +unsigned native_patch(u8 type, u16 clobbers, void *ibuf, + unsigned long addr, unsigned len) +{ + const unsigned char *start, *end; + unsigned ret; + +#define PATCH_SITE(ops, x) \ + case PARAVIRT_PATCH(ops.x): \ + start = start_##ops##_##x; \ + end = end_##ops##_##x; \ + goto patch_site + switch(type) { + PATCH_SITE(pv_irq_ops, restore_fl); + PATCH_SITE(pv_irq_ops, save_fl); + PATCH_SITE(pv_irq_ops, irq_enable); + PATCH_SITE(pv_irq_ops, irq_disable); + PATCH_SITE(pv_cpu_ops, iret); + PATCH_SITE(pv_cpu_ops, irq_enable_syscall_ret); + PATCH_SITE(pv_cpu_ops, swapgs); + PATCH_SITE(pv_mmu_ops, read_cr2); + PATCH_SITE(pv_mmu_ops, read_cr3); + PATCH_SITE(pv_mmu_ops, write_cr3); + PATCH_SITE(pv_cpu_ops, clts); + PATCH_SITE(pv_mmu_ops, flush_tlb_single); + PATCH_SITE(pv_cpu_ops, wbinvd); + + patch_site: + ret = paravirt_patch_insns(ibuf, len, start, end); + break; + + default: + ret = paravirt_patch_default(type, clobbers, ibuf, addr, len); + break; + } +#undef PATCH_SITE + return ret; +} diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 6bf1f716909d..21f34db2c03c 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -30,7 +30,6 @@ #include <linux/spinlock.h> #include <linux/string.h> #include <linux/dma-mapping.h> -#include <linux/init.h> #include <linux/bitops.h> #include <linux/pci_ids.h> #include <linux/pci.h> @@ -183,7 +182,7 @@ static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, }; /* enable this to stress test the chip's TCE cache */ #ifdef CONFIG_IOMMU_DEBUG -int debugging __read_mostly = 1; +static int debugging = 1; static inline unsigned long verify_bit_range(unsigned long* bitmap, int expected, unsigned long start, unsigned long end) @@ -202,7 +201,7 @@ static inline unsigned long verify_bit_range(unsigned long* bitmap, return ~0UL; } #else /* debugging is disabled */ -int debugging __read_mostly = 0; +static int debugging; static inline unsigned long verify_bit_range(unsigned long* bitmap, int expected, unsigned long start, unsigned long end) diff --git a/arch/x86/kernel/pci-dma_64.c b/arch/x86/kernel/pci-dma_64.c index 5552d23d23c2..a82473d192a3 100644 --- a/arch/x86/kernel/pci-dma_64.c +++ b/arch/x86/kernel/pci-dma_64.c @@ -13,7 +13,6 @@ #include <asm/calgary.h> int iommu_merge __read_mostly = 0; -EXPORT_SYMBOL(iommu_merge); dma_addr_t bad_dma_address __read_mostly; EXPORT_SYMBOL(bad_dma_address); @@ -230,7 +229,7 @@ EXPORT_SYMBOL(dma_set_mask); * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter * documentation. */ -__init int iommu_setup(char *p) +static __init int iommu_setup(char *p) { iommu_merge = 1; diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 06bcba536045..4d5cc7181982 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -1,12 +1,12 @@ /* * Dynamic DMA mapping support for AMD Hammer. - * + * * Use the integrated AGP GART in the Hammer northbridge as an IOMMU for PCI. 
* This allows to use PCI devices that only support 32bit addresses on systems - * with more than 4GB. + * with more than 4GB. * * See Documentation/DMA-mapping.txt for the interface specification. - * + * * Copyright 2002 Andi Kleen, SuSE Labs. * Subject to the GNU General Public License v2 only. */ @@ -37,23 +37,26 @@ #include <asm/k8.h> static unsigned long iommu_bus_base; /* GART remapping area (physical) */ -static unsigned long iommu_size; /* size of remapping area bytes */ +static unsigned long iommu_size; /* size of remapping area bytes */ static unsigned long iommu_pages; /* .. and in pages */ -static u32 *iommu_gatt_base; /* Remapping table */ +static u32 *iommu_gatt_base; /* Remapping table */ -/* If this is disabled the IOMMU will use an optimized flushing strategy - of only flushing when an mapping is reused. With it true the GART is flushed - for every mapping. Problem is that doing the lazy flush seems to trigger - bugs with some popular PCI cards, in particular 3ware (but has been also - also seen with Qlogic at least). */ +/* + * If this is disabled the IOMMU will use an optimized flushing strategy + * of only flushing when an mapping is reused. With it true the GART is + * flushed for every mapping. Problem is that doing the lazy flush seems + * to trigger bugs with some popular PCI cards, in particular 3ware (but + * has been also also seen with Qlogic at least). + */ int iommu_fullflush = 1; -/* Allocation bitmap for the remapping area */ +/* Allocation bitmap for the remapping area: */ static DEFINE_SPINLOCK(iommu_bitmap_lock); -static unsigned long *iommu_gart_bitmap; /* guarded by iommu_bitmap_lock */ +/* Guarded by iommu_bitmap_lock: */ +static unsigned long *iommu_gart_bitmap; -static u32 gart_unmapped_entry; +static u32 gart_unmapped_entry; #define GPTE_VALID 1 #define GPTE_COHERENT 2 @@ -61,10 +64,10 @@ static u32 gart_unmapped_entry; (((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT) #define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28)) -#define to_pages(addr,size) \ +#define to_pages(addr, size) \ (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT) -#define EMERGENCY_PAGES 32 /* = 128KB */ +#define EMERGENCY_PAGES 32 /* = 128KB */ #ifdef CONFIG_AGP #define AGPEXTERN extern @@ -77,130 +80,152 @@ AGPEXTERN int agp_memory_reserved; AGPEXTERN __u32 *agp_gatt_table; static unsigned long next_bit; /* protected by iommu_bitmap_lock */ -static int need_flush; /* global flush state. set for each gart wrap */ +static int need_flush; /* global flush state. 
set for each gart wrap */ -static unsigned long alloc_iommu(int size) -{ +static unsigned long alloc_iommu(int size) +{ unsigned long offset, flags; - spin_lock_irqsave(&iommu_bitmap_lock, flags); - offset = find_next_zero_string(iommu_gart_bitmap,next_bit,iommu_pages,size); + spin_lock_irqsave(&iommu_bitmap_lock, flags); + offset = find_next_zero_string(iommu_gart_bitmap, next_bit, + iommu_pages, size); if (offset == -1) { need_flush = 1; - offset = find_next_zero_string(iommu_gart_bitmap,0,iommu_pages,size); + offset = find_next_zero_string(iommu_gart_bitmap, 0, + iommu_pages, size); } - if (offset != -1) { - set_bit_string(iommu_gart_bitmap, offset, size); - next_bit = offset+size; - if (next_bit >= iommu_pages) { + if (offset != -1) { + set_bit_string(iommu_gart_bitmap, offset, size); + next_bit = offset+size; + if (next_bit >= iommu_pages) { next_bit = 0; need_flush = 1; - } - } + } + } if (iommu_fullflush) need_flush = 1; - spin_unlock_irqrestore(&iommu_bitmap_lock, flags); + spin_unlock_irqrestore(&iommu_bitmap_lock, flags); + return offset; -} +} static void free_iommu(unsigned long offset, int size) -{ +{ unsigned long flags; + spin_lock_irqsave(&iommu_bitmap_lock, flags); __clear_bit_string(iommu_gart_bitmap, offset, size); spin_unlock_irqrestore(&iommu_bitmap_lock, flags); -} +} -/* +/* * Use global flush state to avoid races with multiple flushers. */ static void flush_gart(void) -{ +{ unsigned long flags; + spin_lock_irqsave(&iommu_bitmap_lock, flags); if (need_flush) { k8_flush_garts(); need_flush = 0; - } + } spin_unlock_irqrestore(&iommu_bitmap_lock, flags); -} +} #ifdef CONFIG_IOMMU_LEAK -#define SET_LEAK(x) if (iommu_leak_tab) \ - iommu_leak_tab[x] = __builtin_return_address(0); -#define CLEAR_LEAK(x) if (iommu_leak_tab) \ - iommu_leak_tab[x] = NULL; +#define SET_LEAK(x) \ + do { \ + if (iommu_leak_tab) \ + iommu_leak_tab[x] = __builtin_return_address(0);\ + } while (0) + +#define CLEAR_LEAK(x) \ + do { \ + if (iommu_leak_tab) \ + iommu_leak_tab[x] = NULL; \ + } while (0) /* Debugging aid for drivers that don't free their IOMMU tables */ -static void **iommu_leak_tab; +static void **iommu_leak_tab; static int leak_trace; static int iommu_leak_pages = 20; + static void dump_leak(void) { int i; - static int dump; - if (dump || !iommu_leak_tab) return; + static int dump; + + if (dump || !iommu_leak_tab) + return; dump = 1; - show_stack(NULL,NULL); - /* Very crude. dump some from the end of the table too */ - printk("Dumping %d pages from end of IOMMU:\n", iommu_leak_pages); - for (i = 0; i < iommu_leak_pages; i+=2) { - printk("%lu: ", iommu_pages-i); - printk_address((unsigned long) iommu_leak_tab[iommu_pages-i]); - printk("%c", (i+1)%2 == 0 ? '\n' : ' '); - } - printk("\n"); + show_stack(NULL, NULL); + + /* Very crude. dump some from the end of the table too */ + printk(KERN_DEBUG "Dumping %d pages from end of IOMMU:\n", + iommu_leak_pages); + for (i = 0; i < iommu_leak_pages; i += 2) { + printk(KERN_DEBUG "%lu: ", iommu_pages-i); + printk_address((unsigned long) iommu_leak_tab[iommu_pages-i], 0); + printk(KERN_CONT "%c", (i+1)%2 == 0 ? '\n' : ' '); + } + printk(KERN_DEBUG "\n"); } #else -#define SET_LEAK(x) -#define CLEAR_LEAK(x) +# define SET_LEAK(x) +# define CLEAR_LEAK(x) #endif static void iommu_full(struct device *dev, size_t size, int dir) { - /* + /* * Ran out of IOMMU space for this operation. This is very bad. * Unfortunately the drivers cannot handle this operation properly. 
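The GPTE_ENCODE()/to_pages() macros a little further up pack a physical address (up to 40 bits) into a 32-bit GART PTE, keeping bits 31:12 in place, moving bits 39:32 down to bits 11:4 and setting the valid/coherent flags, and count how many GART pages a buffer spans. A standalone restatement with made-up example values, just to make the bit shuffling concrete:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PAGE_MASK	(~(PAGE_SIZE - 1))
/* power-of-two round_up, as in the kernel */
#define round_up(x, y)	(((x) + (y) - 1) & ~((y) - 1))

#define GPTE_VALID	1
#define GPTE_COHERENT	2
#define GPTE_ENCODE(x) \
	(((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT)
#define to_pages(addr, size) \
	(round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT)

int main(void)
{
	uint64_t phys = 0x123456000ULL;		/* a page that lives above 4GB */

	/* 0x23456000 (bits 31:12) | 0x10 (bit 32 -> bit 4) | valid | coherent */
	assert(GPTE_ENCODE(phys) == 0x23456013ULL);

	/* a 0x20-byte buffer starting 0xff0 into a page spills into a second page */
	assert(to_pages(0x7ff0UL, 0x20UL) == 2);

	printf("pte=0x%08llx pages=%lu\n",
	       (unsigned long long)GPTE_ENCODE(phys),
	       (unsigned long)to_pages(0x7ff0UL, 0x20UL));
	return 0;
}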
- * Return some non mapped prereserved space in the aperture and + * Return some non mapped prereserved space in the aperture and * let the Northbridge deal with it. This will result in garbage * in the IO operation. When the size exceeds the prereserved space - * memory corruption will occur or random memory will be DMAed + * memory corruption will occur or random memory will be DMAed * out. Hopefully no network devices use single mappings that big. - */ - - printk(KERN_ERR - "PCI-DMA: Out of IOMMU space for %lu bytes at device %s\n", - size, dev->bus_id); + */ + + printk(KERN_ERR + "PCI-DMA: Out of IOMMU space for %lu bytes at device %s\n", + size, dev->bus_id); if (size > PAGE_SIZE*EMERGENCY_PAGES) { if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL) panic("PCI-DMA: Memory would be corrupted\n"); - if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL) - panic(KERN_ERR "PCI-DMA: Random memory would be DMAed\n"); - } - + if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL) + panic(KERN_ERR + "PCI-DMA: Random memory would be DMAed\n"); + } #ifdef CONFIG_IOMMU_LEAK - dump_leak(); + dump_leak(); #endif -} +} -static inline int need_iommu(struct device *dev, unsigned long addr, size_t size) -{ +static inline int +need_iommu(struct device *dev, unsigned long addr, size_t size) +{ u64 mask = *dev->dma_mask; int high = addr + size > mask; int mmu = high; - if (force_iommu) - mmu = 1; - return mmu; + + if (force_iommu) + mmu = 1; + + return mmu; } -static inline int nonforced_iommu(struct device *dev, unsigned long addr, size_t size) -{ +static inline int +nonforced_iommu(struct device *dev, unsigned long addr, size_t size) +{ u64 mask = *dev->dma_mask; int high = addr + size > mask; int mmu = high; - return mmu; + + return mmu; } /* Map a single continuous physical area into the IOMMU. @@ -208,13 +233,14 @@ static inline int nonforced_iommu(struct device *dev, unsigned long addr, size_t */ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, size_t size, int dir) -{ +{ unsigned long npages = to_pages(phys_mem, size); unsigned long iommu_page = alloc_iommu(npages); int i; + if (iommu_page == -1) { if (!nonforced_iommu(dev, phys_mem, size)) - return phys_mem; + return phys_mem; if (panic_on_overflow) panic("dma_map_area overflow %lu bytes\n", size); iommu_full(dev, size, dir); @@ -229,35 +255,39 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK); } -static dma_addr_t gart_map_simple(struct device *dev, char *buf, - size_t size, int dir) +static dma_addr_t +gart_map_simple(struct device *dev, char *buf, size_t size, int dir) { dma_addr_t map = dma_map_area(dev, virt_to_bus(buf), size, dir); + flush_gart(); + return map; } /* Map a single area into the IOMMU */ -static dma_addr_t gart_map_single(struct device *dev, void *addr, size_t size, int dir) +static dma_addr_t +gart_map_single(struct device *dev, void *addr, size_t size, int dir) { unsigned long phys_mem, bus; if (!dev) dev = &fallback_dev; - phys_mem = virt_to_phys(addr); + phys_mem = virt_to_phys(addr); if (!need_iommu(dev, phys_mem, size)) - return phys_mem; + return phys_mem; bus = gart_map_simple(dev, addr, size, dir); - return bus; + + return bus; } /* * Free a DMA mapping. 
*/ static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr, - size_t size, int direction) + size_t size, int direction) { unsigned long iommu_page; int npages; @@ -266,6 +296,7 @@ static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr, if (dma_addr < iommu_bus_base + EMERGENCY_PAGES*PAGE_SIZE || dma_addr >= iommu_bus_base + iommu_size) return; + iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT; npages = to_pages(dma_addr, size); for (i = 0; i < npages; i++) { @@ -278,7 +309,8 @@ static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr, /* * Wrapper for pci_unmap_single working with scatterlists. */ -static void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) +static void +gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) { struct scatterlist *s; int i; @@ -303,12 +335,13 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg, for_each_sg(sg, s, nents, i) { unsigned long addr = sg_phys(s); - if (nonforced_iommu(dev, addr, s->length)) { + + if (nonforced_iommu(dev, addr, s->length)) { addr = dma_map_area(dev, addr, s->length, dir); - if (addr == bad_dma_address) { - if (i > 0) + if (addr == bad_dma_address) { + if (i > 0) gart_unmap_sg(dev, sg, i, dir); - nents = 0; + nents = 0; sg[0].dma_length = 0; break; } @@ -317,15 +350,16 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg, s->dma_length = s->length; } flush_gart(); + return nents; } /* Map multiple scatterlist entries continuous into the first. */ static int __dma_map_cont(struct scatterlist *start, int nelems, - struct scatterlist *sout, unsigned long pages) + struct scatterlist *sout, unsigned long pages) { unsigned long iommu_start = alloc_iommu(pages); - unsigned long iommu_page = iommu_start; + unsigned long iommu_page = iommu_start; struct scatterlist *s; int i; @@ -335,32 +369,33 @@ static int __dma_map_cont(struct scatterlist *start, int nelems, for_each_sg(start, s, nelems, i) { unsigned long pages, addr; unsigned long phys_addr = s->dma_address; - + BUG_ON(s != start && s->offset); if (s == start) { sout->dma_address = iommu_bus_base; sout->dma_address += iommu_page*PAGE_SIZE + s->offset; sout->dma_length = s->length; - } else { - sout->dma_length += s->length; + } else { + sout->dma_length += s->length; } addr = phys_addr; - pages = to_pages(s->offset, s->length); - while (pages--) { - iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); + pages = to_pages(s->offset, s->length); + while (pages--) { + iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); SET_LEAK(iommu_page); addr += PAGE_SIZE; iommu_page++; } - } - BUG_ON(iommu_page - iommu_start != pages); + } + BUG_ON(iommu_page - iommu_start != pages); + return 0; } -static inline int dma_map_cont(struct scatterlist *start, int nelems, - struct scatterlist *sout, - unsigned long pages, int need) +static inline int +dma_map_cont(struct scatterlist *start, int nelems, struct scatterlist *sout, + unsigned long pages, int need) { if (!need) { BUG_ON(nelems != 1); @@ -370,22 +405,19 @@ static inline int dma_map_cont(struct scatterlist *start, int nelems, } return __dma_map_cont(start, nelems, sout, pages); } - + /* * DMA map all entries in a scatterlist. - * Merge chunks that have page aligned sizes into a continuous mapping. + * Merge chunks that have page aligned sizes into a continuous mapping. 
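gart_map_sg(), which follows, tries to fold adjacent scatterlist entries into one contiguous GART mapping, and its merge test is worth spelling out. A compressed restatement as a predicate (the helper name and the merge_allowed parameter are mine; in the kernel the global iommu_merge and the need/nextneed flags play those roles):

#include <linux/mm.h>
#include <linux/scatterlist.h>

/*
 * Sketch of the condition gart_map_sg() applies before merging entry "cur"
 * into the mapping started at "prev": merging must be enabled, both entries
 * must actually need the IOMMU, the previous chunk must end exactly on a
 * page boundary, and the new chunk must start at offset 0.
 */
static bool can_merge_sg(const struct scatterlist *prev,
			 const struct scatterlist *cur,
			 bool merge_allowed, bool prev_need, bool cur_need)
{
	if (!merge_allowed || !prev_need || !cur_need)
		return false;
	if (cur->offset)
		return false;
	return ((prev->offset + prev->length) % PAGE_SIZE) == 0;
}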
*/ -static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, - int dir) +static int +gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) { - int i; - int out; - int start; - unsigned long pages = 0; - int need = 0, nextneed; struct scatterlist *s, *ps, *start_sg, *sgmap; + int need = 0, nextneed, i, out, start; + unsigned long pages = 0; - if (nents == 0) + if (nents == 0) return 0; if (!dev) @@ -397,15 +429,19 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, ps = NULL; /* shut up gcc */ for_each_sg(sg, s, nents, i) { dma_addr_t addr = sg_phys(s); + s->dma_address = addr; - BUG_ON(s->length == 0); + BUG_ON(s->length == 0); - nextneed = need_iommu(dev, addr, s->length); + nextneed = need_iommu(dev, addr, s->length); /* Handle the previous not yet processed entries */ if (i > start) { - /* Can only merge when the last chunk ends on a page - boundary and the new one doesn't have an offset. */ + /* + * Can only merge when the last chunk ends on a + * page boundary and the new one doesn't have an + * offset. + */ if (!iommu_merge || !nextneed || !need || s->offset || (ps->offset + ps->length) % PAGE_SIZE) { if (dma_map_cont(start_sg, i - start, sgmap, @@ -436,6 +472,7 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, error: flush_gart(); gart_unmap_sg(dev, sg, out, dir); + /* When it was forced or merged try again in a dumb way */ if (force_iommu || iommu_merge) { out = dma_map_sg_nonforce(dev, sg, nents, dir); @@ -444,64 +481,68 @@ error: } if (panic_on_overflow) panic("dma_map_sg: overflow on %lu pages\n", pages); + iommu_full(dev, pages << PAGE_SHIFT, dir); for_each_sg(sg, s, nents, i) s->dma_address = bad_dma_address; return 0; -} +} static int no_agp; static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) -{ - unsigned long a; - if (!iommu_size) { - iommu_size = aper_size; - if (!no_agp) - iommu_size /= 2; - } - - a = aper + iommu_size; +{ + unsigned long a; + + if (!iommu_size) { + iommu_size = aper_size; + if (!no_agp) + iommu_size /= 2; + } + + a = aper + iommu_size; iommu_size -= round_up(a, LARGE_PAGE_SIZE) - a; - if (iommu_size < 64*1024*1024) + if (iommu_size < 64*1024*1024) { printk(KERN_WARNING - "PCI-DMA: Warning: Small IOMMU %luMB. Consider increasing the AGP aperture in BIOS\n",iommu_size>>20); - + "PCI-DMA: Warning: Small IOMMU %luMB." + " Consider increasing the AGP aperture in BIOS\n", + iommu_size >> 20); + } + return iommu_size; -} +} -static __init unsigned read_aperture(struct pci_dev *dev, u32 *size) -{ - unsigned aper_size = 0, aper_base_32; +static __init unsigned read_aperture(struct pci_dev *dev, u32 *size) +{ + unsigned aper_size = 0, aper_base_32, aper_order; u64 aper_base; - unsigned aper_order; - pci_read_config_dword(dev, 0x94, &aper_base_32); + pci_read_config_dword(dev, 0x94, &aper_base_32); pci_read_config_dword(dev, 0x90, &aper_order); - aper_order = (aper_order >> 1) & 7; + aper_order = (aper_order >> 1) & 7; - aper_base = aper_base_32 & 0x7fff; + aper_base = aper_base_32 & 0x7fff; aper_base <<= 25; - aper_size = (32 * 1024 * 1024) << aper_order; - if (aper_base + aper_size > 0x100000000UL || !aper_size) + aper_size = (32 * 1024 * 1024) << aper_order; + if (aper_base + aper_size > 0x100000000UL || !aper_size) aper_base = 0; *size = aper_size; return aper_base; -} +} -/* +/* * Private Northbridge GATT initialization in case we cannot use the - * AGP driver for some reason. + * AGP driver for some reason. 
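read_aperture() just above recovers the GART aperture from northbridge config space: bits 3:1 of the dword at register 0x90 give the aperture order, register 0x94 gives the base in 32 MB units, and the aperture size is 32 MB shifted left by the order. A quick standalone check with invented register values (only the decoding mirrors the code above; the sample numbers are made up):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t reg90 = 0x0006;	/* invented value of the aperture control reg */
	uint32_t reg94 = 0x0040;	/* invented value of the aperture base reg */

	unsigned int order = (reg90 >> 1) & 7;
	uint64_t base = (uint64_t)(reg94 & 0x7fff) << 25;	/* 32 MB units */
	uint64_t size = (32ULL << 20) << order;

	assert(order == 3);
	assert(base == 0x80000000ULL);		/* aperture sits at 2 GB */
	assert(size == 256ULL << 20);		/* 32 MB << 3 = 256 MB */

	printf("aperture base 0x%llx, size %llu MB\n",
	       (unsigned long long)base, (unsigned long long)(size >> 20));
	return 0;
}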
*/ static __init int init_k8_gatt(struct agp_kern_info *info) -{ +{ + unsigned aper_size, gatt_size, new_aper_size; + unsigned aper_base, new_aper_base; struct pci_dev *dev; void *gatt; - unsigned aper_base, new_aper_base; - unsigned aper_size, gatt_size, new_aper_size; int i; printk(KERN_INFO "PCI-DMA: Disabling AGP.\n"); @@ -509,75 +550,75 @@ static __init int init_k8_gatt(struct agp_kern_info *info) dev = NULL; for (i = 0; i < num_k8_northbridges; i++) { dev = k8_northbridges[i]; - new_aper_base = read_aperture(dev, &new_aper_size); - if (!new_aper_base) - goto nommu; - - if (!aper_base) { + new_aper_base = read_aperture(dev, &new_aper_size); + if (!new_aper_base) + goto nommu; + + if (!aper_base) { aper_size = new_aper_size; aper_base = new_aper_base; - } - if (aper_size != new_aper_size || aper_base != new_aper_base) + } + if (aper_size != new_aper_size || aper_base != new_aper_base) goto nommu; } if (!aper_base) - goto nommu; + goto nommu; info->aper_base = aper_base; - info->aper_size = aper_size>>20; + info->aper_size = aper_size >> 20; - gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32); - gatt = (void *)__get_free_pages(GFP_KERNEL, get_order(gatt_size)); - if (!gatt) + gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32); + gatt = (void *)__get_free_pages(GFP_KERNEL, get_order(gatt_size)); + if (!gatt) panic("Cannot allocate GATT table"); - if (change_page_attr_addr((unsigned long)gatt, gatt_size >> PAGE_SHIFT, PAGE_KERNEL_NOCACHE)) + if (set_memory_uc((unsigned long)gatt, gatt_size >> PAGE_SHIFT)) panic("Could not set GART PTEs to uncacheable pages"); - global_flush_tlb(); - memset(gatt, 0, gatt_size); + memset(gatt, 0, gatt_size); agp_gatt_table = gatt; for (i = 0; i < num_k8_northbridges; i++) { - u32 ctl; - u32 gatt_reg; + u32 gatt_reg; + u32 ctl; dev = k8_northbridges[i]; - gatt_reg = __pa(gatt) >> 12; - gatt_reg <<= 4; + gatt_reg = __pa(gatt) >> 12; + gatt_reg <<= 4; pci_write_config_dword(dev, 0x98, gatt_reg); - pci_read_config_dword(dev, 0x90, &ctl); + pci_read_config_dword(dev, 0x90, &ctl); ctl |= 1; ctl &= ~((1<<4) | (1<<5)); - pci_write_config_dword(dev, 0x90, ctl); + pci_write_config_dword(dev, 0x90, ctl); } flush_gart(); - - printk("PCI-DMA: aperture base @ %x size %u KB\n",aper_base, aper_size>>10); + + printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n", + aper_base, aper_size>>10); return 0; nommu: - /* Should not happen anymore */ + /* Should not happen anymore */ printk(KERN_ERR "PCI-DMA: More than 4GB of RAM and no IOMMU\n" KERN_ERR "PCI-DMA: 32bit PCI IO may malfunction.\n"); - return -1; -} + return -1; +} extern int agp_amd64_init(void); static const struct dma_mapping_ops gart_dma_ops = { - .mapping_error = NULL, - .map_single = gart_map_single, - .map_simple = gart_map_simple, - .unmap_single = gart_unmap_single, - .sync_single_for_cpu = NULL, - .sync_single_for_device = NULL, - .sync_single_range_for_cpu = NULL, - .sync_single_range_for_device = NULL, - .sync_sg_for_cpu = NULL, - .sync_sg_for_device = NULL, - .map_sg = gart_map_sg, - .unmap_sg = gart_unmap_sg, + .mapping_error = NULL, + .map_single = gart_map_single, + .map_simple = gart_map_simple, + .unmap_single = gart_unmap_single, + .sync_single_for_cpu = NULL, + .sync_single_for_device = NULL, + .sync_single_range_for_cpu = NULL, + .sync_single_range_for_device = NULL, + .sync_sg_for_cpu = NULL, + .sync_sg_for_device = NULL, + .map_sg = gart_map_sg, + .unmap_sg = gart_unmap_sg, }; void gart_iommu_shutdown(void) @@ -588,23 +629,23 @@ void gart_iommu_shutdown(void) if (no_agp && (dma_ops != 
&gart_dma_ops)) return; - for (i = 0; i < num_k8_northbridges; i++) { - u32 ctl; + for (i = 0; i < num_k8_northbridges; i++) { + u32 ctl; - dev = k8_northbridges[i]; - pci_read_config_dword(dev, 0x90, &ctl); + dev = k8_northbridges[i]; + pci_read_config_dword(dev, 0x90, &ctl); - ctl &= ~1; + ctl &= ~1; - pci_write_config_dword(dev, 0x90, ctl); - } + pci_write_config_dword(dev, 0x90, ctl); + } } void __init gart_iommu_init(void) -{ +{ struct agp_kern_info info; - unsigned long aper_size; unsigned long iommu_start; + unsigned long aper_size; unsigned long scratch; long i; @@ -614,14 +655,14 @@ void __init gart_iommu_init(void) } #ifndef CONFIG_AGP_AMD64 - no_agp = 1; + no_agp = 1; #else /* Makefile puts PCI initialization via subsys_initcall first. */ /* Add other K8 AGP bridge drivers here */ - no_agp = no_agp || - (agp_amd64_init() < 0) || + no_agp = no_agp || + (agp_amd64_init() < 0) || (agp_copy_info(agp_bridge, &info) < 0); -#endif +#endif if (swiotlb) return; @@ -643,77 +684,78 @@ void __init gart_iommu_init(void) } printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n"); - aper_size = info.aper_size * 1024 * 1024; - iommu_size = check_iommu_size(info.aper_base, aper_size); - iommu_pages = iommu_size >> PAGE_SHIFT; - - iommu_gart_bitmap = (void*)__get_free_pages(GFP_KERNEL, - get_order(iommu_pages/8)); - if (!iommu_gart_bitmap) - panic("Cannot allocate iommu bitmap\n"); + aper_size = info.aper_size * 1024 * 1024; + iommu_size = check_iommu_size(info.aper_base, aper_size); + iommu_pages = iommu_size >> PAGE_SHIFT; + + iommu_gart_bitmap = (void *) __get_free_pages(GFP_KERNEL, + get_order(iommu_pages/8)); + if (!iommu_gart_bitmap) + panic("Cannot allocate iommu bitmap\n"); memset(iommu_gart_bitmap, 0, iommu_pages/8); #ifdef CONFIG_IOMMU_LEAK - if (leak_trace) { - iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL, + if (leak_trace) { + iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL, get_order(iommu_pages*sizeof(void *))); - if (iommu_leak_tab) - memset(iommu_leak_tab, 0, iommu_pages * 8); + if (iommu_leak_tab) + memset(iommu_leak_tab, 0, iommu_pages * 8); else - printk("PCI-DMA: Cannot allocate leak trace area\n"); - } + printk(KERN_DEBUG + "PCI-DMA: Cannot allocate leak trace area\n"); + } #endif - /* + /* * Out of IOMMU space handling. - * Reserve some invalid pages at the beginning of the GART. - */ - set_bit_string(iommu_gart_bitmap, 0, EMERGENCY_PAGES); + * Reserve some invalid pages at the beginning of the GART. + */ + set_bit_string(iommu_gart_bitmap, 0, EMERGENCY_PAGES); - agp_memory_reserved = iommu_size; + agp_memory_reserved = iommu_size; printk(KERN_INFO "PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n", - iommu_size>>20); + iommu_size >> 20); - iommu_start = aper_size - iommu_size; - iommu_bus_base = info.aper_base + iommu_start; + iommu_start = aper_size - iommu_size; + iommu_bus_base = info.aper_base + iommu_start; bad_dma_address = iommu_bus_base; iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT); - /* + /* * Unmap the IOMMU part of the GART. The alias of the page is * always mapped with cache enabled and there is no full cache * coherency across the GART remapping. The unmapping avoids * automatic prefetches from the CPU allocating cache lines in * there. All CPU accesses are done via the direct mapping to * the backing memory. The GART address is only used by PCI - * devices. + * devices. 
*/ clear_kernel_mapping((unsigned long)__va(iommu_bus_base), iommu_size); - /* - * Try to workaround a bug (thanks to BenH) - * Set unmapped entries to a scratch page instead of 0. + /* + * Try to workaround a bug (thanks to BenH) + * Set unmapped entries to a scratch page instead of 0. * Any prefetches that hit unmapped entries won't get an bus abort * then. */ - scratch = get_zeroed_page(GFP_KERNEL); - if (!scratch) + scratch = get_zeroed_page(GFP_KERNEL); + if (!scratch) panic("Cannot allocate iommu scratch page"); gart_unmapped_entry = GPTE_ENCODE(__pa(scratch)); - for (i = EMERGENCY_PAGES; i < iommu_pages; i++) + for (i = EMERGENCY_PAGES; i < iommu_pages; i++) iommu_gatt_base[i] = gart_unmapped_entry; flush_gart(); dma_ops = &gart_dma_ops; -} +} void __init gart_parse_options(char *p) { int arg; #ifdef CONFIG_IOMMU_LEAK - if (!strncmp(p,"leak",4)) { + if (!strncmp(p, "leak", 4)) { leak_trace = 1; p += 4; if (*p == '=') ++p; @@ -723,18 +765,18 @@ void __init gart_parse_options(char *p) #endif if (isdigit(*p) && get_option(&p, &arg)) iommu_size = arg; - if (!strncmp(p, "fullflush",8)) + if (!strncmp(p, "fullflush", 8)) iommu_fullflush = 1; - if (!strncmp(p, "nofullflush",11)) + if (!strncmp(p, "nofullflush", 11)) iommu_fullflush = 0; - if (!strncmp(p,"noagp",5)) + if (!strncmp(p, "noagp", 5)) no_agp = 1; - if (!strncmp(p, "noaperture",10)) + if (!strncmp(p, "noaperture", 10)) fix_aperture = 0; /* duplicated from pci-dma.c */ - if (!strncmp(p,"force",5)) + if (!strncmp(p, "force", 5)) gart_iommu_aperture_allowed = 1; - if (!strncmp(p,"allowed",7)) + if (!strncmp(p, "allowed", 7)) gart_iommu_aperture_allowed = 1; if (!strncmp(p, "memaper", 7)) { fallback_aper_force = 1; diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c index 102866d729a5..82a0a674a003 100644 --- a/arch/x86/kernel/pci-swiotlb_64.c +++ b/arch/x86/kernel/pci-swiotlb_64.c @@ -10,7 +10,6 @@ #include <asm/dma.h> int swiotlb __read_mostly; -EXPORT_SYMBOL(swiotlb); const struct dma_mapping_ops swiotlb_dma_ops = { .mapping_error = swiotlb_dma_mapping_error, diff --git a/arch/x86/kernel/pmtimer_64.c b/arch/x86/kernel/pmtimer_64.c index ae8f91214f15..b112406f1996 100644 --- a/arch/x86/kernel/pmtimer_64.c +++ b/arch/x86/kernel/pmtimer_64.c @@ -19,13 +19,13 @@ #include <linux/time.h> #include <linux/init.h> #include <linux/cpumask.h> +#include <linux/acpi_pmtmr.h> + #include <asm/io.h> #include <asm/proto.h> #include <asm/msr.h> #include <asm/vsyscall.h> -#define ACPI_PM_MASK 0xFFFFFF /* limit it to 24 bits */ - static inline u32 cyc2us(u32 cycles) { /* The Power Management Timer ticks at 3.579545 ticks per microsecond. 
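The comment in the pmtimer_64.c hunk above gives the key number: the ACPI PM timer ticks at 3.579545 MHz, so one tick is about 0.2794 us. One common divide-free way to do that conversion is a multiply-and-shift, since 286/1024 matches 1/3.579545 to within roughly 0.024%; the sketch below illustrates that approximation only and is not the kernel's actual function:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Illustration only: convert 3.579545 MHz PM-timer ticks to microseconds
 * without a divide.  The multiply is widened to 64 bits to avoid overflow
 * for large deltas.
 */
static inline uint32_t pm_cycles_to_us(uint32_t cycles)
{
	return (uint32_t)(((uint64_t)cycles * 286) >> 10);
}

int main(void)
{
	uint32_t us = pm_cycles_to_us(3579545);	/* exactly one second of ticks */

	assert(us > 999500 && us < 1000500);	/* lands within 0.05% of 1e6 */
	printf("3579545 PM-timer ticks ~= %u us\n", us);
	return 0;
}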
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 46d391d49de8..968371ab223a 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -55,6 +55,7 @@ #include <asm/tlbflush.h> #include <asm/cpu.h> +#include <asm/kdebug.h> asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); @@ -74,7 +75,7 @@ EXPORT_PER_CPU_SYMBOL(cpu_number); */ unsigned long thread_saved_pc(struct task_struct *tsk) { - return ((unsigned long *)tsk->thread.esp)[3]; + return ((unsigned long *)tsk->thread.sp)[3]; } /* @@ -113,10 +114,19 @@ void default_idle(void) smp_mb(); local_irq_disable(); - if (!need_resched()) + if (!need_resched()) { + ktime_t t0, t1; + u64 t0n, t1n; + + t0 = ktime_get(); + t0n = ktime_to_ns(t0); safe_halt(); /* enables interrupts racelessly */ - else - local_irq_enable(); + local_irq_disable(); + t1 = ktime_get(); + t1n = ktime_to_ns(t1); + sched_clock_idle_wakeup_event(t1n - t0n); + } + local_irq_enable(); current_thread_info()->status |= TS_POLLING; } else { /* loop is done by the caller */ @@ -132,7 +142,7 @@ EXPORT_SYMBOL(default_idle); * to poll the ->work.need_resched flag instead of waiting for the * cross-CPU IPI to arrive. Use this option with caution. */ -static void poll_idle (void) +static void poll_idle(void) { cpu_relax(); } @@ -188,6 +198,9 @@ void cpu_idle(void) rmb(); idle = pm_idle; + if (rcu_pending(cpu)) + rcu_check_callbacks(cpu, 0); + if (!idle) idle = default_idle; @@ -255,13 +268,13 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait); * New with Core Duo processors, MWAIT can take some hints based on CPU * capability. */ -void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) +void mwait_idle_with_hints(unsigned long ax, unsigned long cx) { if (!need_resched()) { __monitor((void *)¤t_thread_info()->flags, 0, 0); smp_mb(); if (!need_resched()) - __mwait(eax, ecx); + __mwait(ax, cx); } } @@ -272,19 +285,37 @@ static void mwait_idle(void) mwait_idle_with_hints(0, 0); } +static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c) +{ + if (force_mwait) + return 1; + /* Any C1 states supported? */ + return c->cpuid_level >= 5 && ((cpuid_edx(5) >> 4) & 0xf) > 0; +} + void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) { - if (cpu_has(c, X86_FEATURE_MWAIT)) { - printk("monitor/mwait feature present.\n"); + static int selected; + + if (selected) + return; +#ifdef CONFIG_X86_SMP + if (pm_idle == poll_idle && smp_num_siblings > 1) { + printk(KERN_WARNING "WARNING: polling idle and HT enabled," + " performance may degrade.\n"); + } +#endif + if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) { /* * Skip, if setup has overridden idle. 
* One CPU supports mwait => All CPUs supports mwait */ if (!pm_idle) { - printk("using mwait in idle threads.\n"); + printk(KERN_INFO "using mwait in idle threads.\n"); pm_idle = mwait_idle; } } + selected = 1; } static int __init idle_setup(char *str) @@ -292,10 +323,6 @@ static int __init idle_setup(char *str) if (!strcmp(str, "poll")) { printk("using polling idle threads.\n"); pm_idle = poll_idle; -#ifdef CONFIG_X86_SMP - if (smp_num_siblings > 1) - printk("WARNING: polling idle and HT enabled, performance may degrade.\n"); -#endif } else if (!strcmp(str, "mwait")) force_mwait = 1; else @@ -310,15 +337,15 @@ void __show_registers(struct pt_regs *regs, int all) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; unsigned long d0, d1, d2, d3, d6, d7; - unsigned long esp; + unsigned long sp; unsigned short ss, gs; if (user_mode_vm(regs)) { - esp = regs->esp; - ss = regs->xss & 0xffff; + sp = regs->sp; + ss = regs->ss & 0xffff; savesegment(gs, gs); } else { - esp = (unsigned long) (®s->esp); + sp = (unsigned long) (®s->sp); savesegment(ss, ss); savesegment(gs, gs); } @@ -331,17 +358,17 @@ void __show_registers(struct pt_regs *regs, int all) init_utsname()->version); printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n", - 0xffff & regs->xcs, regs->eip, regs->eflags, + 0xffff & regs->cs, regs->ip, regs->flags, smp_processor_id()); - print_symbol("EIP is at %s\n", regs->eip); + print_symbol("EIP is at %s\n", regs->ip); printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", - regs->eax, regs->ebx, regs->ecx, regs->edx); + regs->ax, regs->bx, regs->cx, regs->dx); printk("ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n", - regs->esi, regs->edi, regs->ebp, esp); + regs->si, regs->di, regs->bp, sp); printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n", - regs->xds & 0xffff, regs->xes & 0xffff, - regs->xfs & 0xffff, gs, ss); + regs->ds & 0xffff, regs->es & 0xffff, + regs->fs & 0xffff, gs, ss); if (!all) return; @@ -369,12 +396,12 @@ void __show_registers(struct pt_regs *regs, int all) void show_regs(struct pt_regs *regs) { __show_registers(regs, 1); - show_trace(NULL, regs, ®s->esp); + show_trace(NULL, regs, ®s->sp, regs->bp); } /* - * This gets run with %ebx containing the - * function to call, and %edx containing + * This gets run with %bx containing the + * function to call, and %dx containing * the "args". */ extern void kernel_thread_helper(void); @@ -388,16 +415,16 @@ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) memset(®s, 0, sizeof(regs)); - regs.ebx = (unsigned long) fn; - regs.edx = (unsigned long) arg; + regs.bx = (unsigned long) fn; + regs.dx = (unsigned long) arg; - regs.xds = __USER_DS; - regs.xes = __USER_DS; - regs.xfs = __KERNEL_PERCPU; - regs.orig_eax = -1; - regs.eip = (unsigned long) kernel_thread_helper; - regs.xcs = __KERNEL_CS | get_kernel_rpl(); - regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2; + regs.ds = __USER_DS; + regs.es = __USER_DS; + regs.fs = __KERNEL_PERCPU; + regs.orig_ax = -1; + regs.ip = (unsigned long) kernel_thread_helper; + regs.cs = __KERNEL_CS | get_kernel_rpl(); + regs.flags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2; /* Ok, create the new process.. 
*/ return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, ®s, 0, NULL, NULL); @@ -435,7 +462,12 @@ void flush_thread(void) { struct task_struct *tsk = current; - memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8); + tsk->thread.debugreg0 = 0; + tsk->thread.debugreg1 = 0; + tsk->thread.debugreg2 = 0; + tsk->thread.debugreg3 = 0; + tsk->thread.debugreg6 = 0; + tsk->thread.debugreg7 = 0; memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); clear_tsk_thread_flag(tsk, TIF_DEBUG); /* @@ -460,7 +492,7 @@ void prepare_to_copy(struct task_struct *tsk) unlazy_fpu(tsk); } -int copy_thread(int nr, unsigned long clone_flags, unsigned long esp, +int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, unsigned long unused, struct task_struct * p, struct pt_regs * regs) { @@ -470,15 +502,15 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long esp, childregs = task_pt_regs(p); *childregs = *regs; - childregs->eax = 0; - childregs->esp = esp; + childregs->ax = 0; + childregs->sp = sp; - p->thread.esp = (unsigned long) childregs; - p->thread.esp0 = (unsigned long) (childregs+1); + p->thread.sp = (unsigned long) childregs; + p->thread.sp0 = (unsigned long) (childregs+1); - p->thread.eip = (unsigned long) ret_from_fork; + p->thread.ip = (unsigned long) ret_from_fork; - savesegment(gs,p->thread.gs); + savesegment(gs, p->thread.gs); tsk = current; if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { @@ -491,32 +523,15 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long esp, set_tsk_thread_flag(p, TIF_IO_BITMAP); } + err = 0; + /* * Set a new TLS for the child thread? */ - if (clone_flags & CLONE_SETTLS) { - struct desc_struct *desc; - struct user_desc info; - int idx; - - err = -EFAULT; - if (copy_from_user(&info, (void __user *)childregs->esi, sizeof(info))) - goto out; - err = -EINVAL; - if (LDT_empty(&info)) - goto out; - - idx = info.entry_number; - if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) - goto out; - - desc = p->thread.tls_array + idx - GDT_ENTRY_TLS_MIN; - desc->a = LDT_entry_a(&info); - desc->b = LDT_entry_b(&info); - } + if (clone_flags & CLONE_SETTLS) + err = do_set_thread_area(p, -1, + (struct user_desc __user *)childregs->si, 0); - err = 0; - out: if (err && p->thread.io_bitmap_ptr) { kfree(p->thread.io_bitmap_ptr); p->thread.io_bitmap_max = 0; @@ -529,62 +544,52 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long esp, */ void dump_thread(struct pt_regs * regs, struct user * dump) { - int i; + u16 gs; /* changed the size calculations - should hopefully work better. 
lbt */ dump->magic = CMAGIC; dump->start_code = 0; - dump->start_stack = regs->esp & ~(PAGE_SIZE - 1); + dump->start_stack = regs->sp & ~(PAGE_SIZE - 1); dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT; dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT; dump->u_dsize -= dump->u_tsize; dump->u_ssize = 0; - for (i = 0; i < 8; i++) - dump->u_debugreg[i] = current->thread.debugreg[i]; + dump->u_debugreg[0] = current->thread.debugreg0; + dump->u_debugreg[1] = current->thread.debugreg1; + dump->u_debugreg[2] = current->thread.debugreg2; + dump->u_debugreg[3] = current->thread.debugreg3; + dump->u_debugreg[4] = 0; + dump->u_debugreg[5] = 0; + dump->u_debugreg[6] = current->thread.debugreg6; + dump->u_debugreg[7] = current->thread.debugreg7; if (dump->start_stack < TASK_SIZE) dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT; - dump->regs.ebx = regs->ebx; - dump->regs.ecx = regs->ecx; - dump->regs.edx = regs->edx; - dump->regs.esi = regs->esi; - dump->regs.edi = regs->edi; - dump->regs.ebp = regs->ebp; - dump->regs.eax = regs->eax; - dump->regs.ds = regs->xds; - dump->regs.es = regs->xes; - dump->regs.fs = regs->xfs; - savesegment(gs,dump->regs.gs); - dump->regs.orig_eax = regs->orig_eax; - dump->regs.eip = regs->eip; - dump->regs.cs = regs->xcs; - dump->regs.eflags = regs->eflags; - dump->regs.esp = regs->esp; - dump->regs.ss = regs->xss; + dump->regs.bx = regs->bx; + dump->regs.cx = regs->cx; + dump->regs.dx = regs->dx; + dump->regs.si = regs->si; + dump->regs.di = regs->di; + dump->regs.bp = regs->bp; + dump->regs.ax = regs->ax; + dump->regs.ds = (u16)regs->ds; + dump->regs.es = (u16)regs->es; + dump->regs.fs = (u16)regs->fs; + savesegment(gs,gs); + dump->regs.orig_ax = regs->orig_ax; + dump->regs.ip = regs->ip; + dump->regs.cs = (u16)regs->cs; + dump->regs.flags = regs->flags; + dump->regs.sp = regs->sp; + dump->regs.ss = (u16)regs->ss; dump->u_fpvalid = dump_fpu (regs, &dump->i387); } EXPORT_SYMBOL(dump_thread); -/* - * Capture the user space registers if the task is not running (in user space) - */ -int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs) -{ - struct pt_regs ptregs = *task_pt_regs(tsk); - ptregs.xcs &= 0xffff; - ptregs.xds &= 0xffff; - ptregs.xes &= 0xffff; - ptregs.xss &= 0xffff; - - elf_core_copy_regs(regs, &ptregs); - - return 1; -} - #ifdef CONFIG_SECCOMP -void hard_disable_TSC(void) +static void hard_disable_TSC(void) { write_cr4(read_cr4() | X86_CR4_TSD); } @@ -599,7 +604,7 @@ void disable_TSC(void) hard_disable_TSC(); preempt_enable(); } -void hard_enable_TSC(void) +static void hard_enable_TSC(void) { write_cr4(read_cr4() & ~X86_CR4_TSD); } @@ -609,18 +614,32 @@ static noinline void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, struct tss_struct *tss) { - struct thread_struct *next; + struct thread_struct *prev, *next; + unsigned long debugctl; + prev = &prev_p->thread; next = &next_p->thread; + debugctl = prev->debugctlmsr; + if (next->ds_area_msr != prev->ds_area_msr) { + /* we clear debugctl to make sure DS + * is not in use when we change it */ + debugctl = 0; + wrmsrl(MSR_IA32_DEBUGCTLMSR, 0); + wrmsr(MSR_IA32_DS_AREA, next->ds_area_msr, 0); + } + + if (next->debugctlmsr != debugctl) + wrmsr(MSR_IA32_DEBUGCTLMSR, next->debugctlmsr, 0); + if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { - set_debugreg(next->debugreg[0], 0); - set_debugreg(next->debugreg[1], 1); - set_debugreg(next->debugreg[2], 2); - set_debugreg(next->debugreg[3], 3); + 
set_debugreg(next->debugreg0, 0); + set_debugreg(next->debugreg1, 1); + set_debugreg(next->debugreg2, 2); + set_debugreg(next->debugreg3, 3); /* no 4 and 5 */ - set_debugreg(next->debugreg[6], 6); - set_debugreg(next->debugreg[7], 7); + set_debugreg(next->debugreg6, 6); + set_debugreg(next->debugreg7, 7); } #ifdef CONFIG_SECCOMP @@ -634,6 +653,13 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, } #endif + if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) + ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); + + if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) + ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); + + if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { /* * Disable the bitmap via an invalid offset. We still cache @@ -687,11 +713,11 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, * More important, however, is the fact that this allows us much * more flexibility. * - * The return value (in %eax) will be the "prev" task after + * The return value (in %ax) will be the "prev" task after * the task-switch, and shows up in ret_from_fork in entry.S, * for example. */ -struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) +struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread, *next = &next_p->thread; @@ -710,7 +736,7 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas /* * Reload esp0. */ - load_esp0(tss, next); + load_sp0(tss, next); /* * Save away %gs. No need to save %fs, as it was saved on the @@ -774,7 +800,7 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas asmlinkage int sys_fork(struct pt_regs regs) { - return do_fork(SIGCHLD, regs.esp, ®s, 0, NULL, NULL); + return do_fork(SIGCHLD, regs.sp, ®s, 0, NULL, NULL); } asmlinkage int sys_clone(struct pt_regs regs) @@ -783,12 +809,12 @@ asmlinkage int sys_clone(struct pt_regs regs) unsigned long newsp; int __user *parent_tidptr, *child_tidptr; - clone_flags = regs.ebx; - newsp = regs.ecx; - parent_tidptr = (int __user *)regs.edx; - child_tidptr = (int __user *)regs.edi; + clone_flags = regs.bx; + newsp = regs.cx; + parent_tidptr = (int __user *)regs.dx; + child_tidptr = (int __user *)regs.di; if (!newsp) - newsp = regs.esp; + newsp = regs.sp; return do_fork(clone_flags, newsp, ®s, 0, parent_tidptr, child_tidptr); } @@ -804,7 +830,7 @@ asmlinkage int sys_clone(struct pt_regs regs) */ asmlinkage int sys_vfork(struct pt_regs regs) { - return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, ®s, 0, NULL, NULL); + return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.sp, ®s, 0, NULL, NULL); } /* @@ -815,18 +841,15 @@ asmlinkage int sys_execve(struct pt_regs regs) int error; char * filename; - filename = getname((char __user *) regs.ebx); + filename = getname((char __user *) regs.bx); error = PTR_ERR(filename); if (IS_ERR(filename)) goto out; error = do_execve(filename, - (char __user * __user *) regs.ecx, - (char __user * __user *) regs.edx, + (char __user * __user *) regs.cx, + (char __user * __user *) regs.dx, ®s); if (error == 0) { - task_lock(current); - current->ptrace &= ~PT_DTRACE; - task_unlock(current); /* Make sure we don't return using sysenter.. 
*/ set_thread_flag(TIF_IRET); } @@ -840,145 +863,37 @@ out: unsigned long get_wchan(struct task_struct *p) { - unsigned long ebp, esp, eip; + unsigned long bp, sp, ip; unsigned long stack_page; int count = 0; if (!p || p == current || p->state == TASK_RUNNING) return 0; stack_page = (unsigned long)task_stack_page(p); - esp = p->thread.esp; - if (!stack_page || esp < stack_page || esp > top_esp+stack_page) + sp = p->thread.sp; + if (!stack_page || sp < stack_page || sp > top_esp+stack_page) return 0; - /* include/asm-i386/system.h:switch_to() pushes ebp last. */ - ebp = *(unsigned long *) esp; + /* include/asm-i386/system.h:switch_to() pushes bp last. */ + bp = *(unsigned long *) sp; do { - if (ebp < stack_page || ebp > top_ebp+stack_page) + if (bp < stack_page || bp > top_ebp+stack_page) return 0; - eip = *(unsigned long *) (ebp+4); - if (!in_sched_functions(eip)) - return eip; - ebp = *(unsigned long *) ebp; + ip = *(unsigned long *) (bp+4); + if (!in_sched_functions(ip)) + return ip; + bp = *(unsigned long *) bp; } while (count++ < 16); return 0; } -/* - * sys_alloc_thread_area: get a yet unused TLS descriptor index. - */ -static int get_free_idx(void) -{ - struct thread_struct *t = ¤t->thread; - int idx; - - for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++) - if (desc_empty(t->tls_array + idx)) - return idx + GDT_ENTRY_TLS_MIN; - return -ESRCH; -} - -/* - * Set a given TLS descriptor: - */ -asmlinkage int sys_set_thread_area(struct user_desc __user *u_info) -{ - struct thread_struct *t = ¤t->thread; - struct user_desc info; - struct desc_struct *desc; - int cpu, idx; - - if (copy_from_user(&info, u_info, sizeof(info))) - return -EFAULT; - idx = info.entry_number; - - /* - * index -1 means the kernel should try to find and - * allocate an empty descriptor: - */ - if (idx == -1) { - idx = get_free_idx(); - if (idx < 0) - return idx; - if (put_user(idx, &u_info->entry_number)) - return -EFAULT; - } - - if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) - return -EINVAL; - - desc = t->tls_array + idx - GDT_ENTRY_TLS_MIN; - - /* - * We must not get preempted while modifying the TLS. 
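get_wchan() above leans on the i386 frame layout left behind by switch_to(): each frame stores its caller's frame pointer at [bp] and the return address at [bp + 4], and the walk stops at the first return address outside the scheduler. A synthetic standalone walk over an invented two-frame stack (the addresses and the "not a sched function" test are stand-ins, and the word size is generalized to the host's):

#include <assert.h>
#include <stdio.h>

int main(void)
{
	unsigned long stack[8] = { 0 };
	unsigned long base = (unsigned long)stack;

	/* frame 0 (inside the scheduler) and frame 1 (the sleeping caller) */
	stack[0] = base + 4 * sizeof(unsigned long);	/* saved frame pointer */
	stack[1] = 0xc0105000UL;	/* pretend: return address in a sched function */
	stack[4] = 0UL;			/* end of the frame chain */
	stack[5] = 0xc01f2340UL;	/* pretend: return address we want to report */

	unsigned long bp = base;	/* start at the innermost frame */
	unsigned long ip = 0;
	int count = 0;

	do {
		ip = *(unsigned long *)(bp + sizeof(unsigned long));
		if (ip == 0xc01f2340UL)	/* stand-in for !in_sched_functions(ip) */
			break;
		bp = *(unsigned long *)bp;
	} while (count++ < 16);

	assert(ip == 0xc01f2340UL);
	printf("wchan would be reported as 0x%lx\n", ip);
	return 0;
}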
- */ - cpu = get_cpu(); - - if (LDT_empty(&info)) { - desc->a = 0; - desc->b = 0; - } else { - desc->a = LDT_entry_a(&info); - desc->b = LDT_entry_b(&info); - } - load_TLS(t, cpu); - - put_cpu(); - - return 0; -} - -/* - * Get the current Thread-Local Storage area: - */ - -#define GET_BASE(desc) ( \ - (((desc)->a >> 16) & 0x0000ffff) | \ - (((desc)->b << 16) & 0x00ff0000) | \ - ( (desc)->b & 0xff000000) ) - -#define GET_LIMIT(desc) ( \ - ((desc)->a & 0x0ffff) | \ - ((desc)->b & 0xf0000) ) - -#define GET_32BIT(desc) (((desc)->b >> 22) & 1) -#define GET_CONTENTS(desc) (((desc)->b >> 10) & 3) -#define GET_WRITABLE(desc) (((desc)->b >> 9) & 1) -#define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1) -#define GET_PRESENT(desc) (((desc)->b >> 15) & 1) -#define GET_USEABLE(desc) (((desc)->b >> 20) & 1) - -asmlinkage int sys_get_thread_area(struct user_desc __user *u_info) -{ - struct user_desc info; - struct desc_struct *desc; - int idx; - - if (get_user(idx, &u_info->entry_number)) - return -EFAULT; - if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) - return -EINVAL; - - memset(&info, 0, sizeof(info)); - - desc = current->thread.tls_array + idx - GDT_ENTRY_TLS_MIN; - - info.entry_number = idx; - info.base_addr = GET_BASE(desc); - info.limit = GET_LIMIT(desc); - info.seg_32bit = GET_32BIT(desc); - info.contents = GET_CONTENTS(desc); - info.read_exec_only = !GET_WRITABLE(desc); - info.limit_in_pages = GET_LIMIT_PAGES(desc); - info.seg_not_present = !GET_PRESENT(desc); - info.useable = GET_USEABLE(desc); - - if (copy_to_user(u_info, &info, sizeof(info))) - return -EFAULT; - return 0; -} - unsigned long arch_align_stack(unsigned long sp) { if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) sp -= get_random_int() % 8192; return sp & ~0xf; } + +unsigned long arch_randomize_brk(struct mm_struct *mm) +{ + unsigned long range_end = mm->brk + 0x02000000; + return randomize_range(mm->brk, range_end, 0) ? : mm->brk; +} diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index ab79e1dfa023..137a86171c39 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -3,7 +3,7 @@ * * Pentium III FXSR, SSE support * Gareth Hughes <gareth@valinux.com>, May 2000 - * + * * X86-64 port * Andi Kleen. * @@ -19,19 +19,19 @@ #include <linux/cpu.h> #include <linux/errno.h> #include <linux/sched.h> +#include <linux/fs.h> #include <linux/kernel.h> #include <linux/mm.h> -#include <linux/fs.h> #include <linux/elfcore.h> #include <linux/smp.h> #include <linux/slab.h> #include <linux/user.h> -#include <linux/module.h> #include <linux/a.out.h> #include <linux/interrupt.h> +#include <linux/utsname.h> #include <linux/delay.h> +#include <linux/module.h> #include <linux/ptrace.h> -#include <linux/utsname.h> #include <linux/random.h> #include <linux/notifier.h> #include <linux/kprobes.h> @@ -72,13 +72,6 @@ void idle_notifier_register(struct notifier_block *n) { atomic_notifier_chain_register(&idle_notifier, n); } -EXPORT_SYMBOL_GPL(idle_notifier_register); - -void idle_notifier_unregister(struct notifier_block *n) -{ - atomic_notifier_chain_unregister(&idle_notifier, n); -} -EXPORT_SYMBOL(idle_notifier_unregister); void enter_idle(void) { @@ -106,7 +99,7 @@ void exit_idle(void) * We use this if we don't have any better * idle routine.. 
*/ -static void default_idle(void) +void default_idle(void) { current_thread_info()->status &= ~TS_POLLING; /* @@ -116,11 +109,18 @@ static void default_idle(void) smp_mb(); local_irq_disable(); if (!need_resched()) { - /* Enables interrupts one instruction before HLT. - x86 special cases this so there is no race. */ - safe_halt(); - } else - local_irq_enable(); + ktime_t t0, t1; + u64 t0n, t1n; + + t0 = ktime_get(); + t0n = ktime_to_ns(t0); + safe_halt(); /* enables interrupts racelessly */ + local_irq_disable(); + t1 = ktime_get(); + t1n = ktime_to_ns(t1); + sched_clock_idle_wakeup_event(t1n - t0n); + } + local_irq_enable(); current_thread_info()->status |= TS_POLLING; } @@ -129,54 +129,12 @@ static void default_idle(void) * to poll the ->need_resched flag instead of waiting for the * cross-CPU IPI to arrive. Use this option with caution. */ -static void poll_idle (void) +static void poll_idle(void) { local_irq_enable(); cpu_relax(); } -static void do_nothing(void *unused) -{ -} - -void cpu_idle_wait(void) -{ - unsigned int cpu, this_cpu = get_cpu(); - cpumask_t map, tmp = current->cpus_allowed; - - set_cpus_allowed(current, cpumask_of_cpu(this_cpu)); - put_cpu(); - - cpus_clear(map); - for_each_online_cpu(cpu) { - per_cpu(cpu_idle_state, cpu) = 1; - cpu_set(cpu, map); - } - - __get_cpu_var(cpu_idle_state) = 0; - - wmb(); - do { - ssleep(1); - for_each_online_cpu(cpu) { - if (cpu_isset(cpu, map) && - !per_cpu(cpu_idle_state, cpu)) - cpu_clear(cpu, map); - } - cpus_and(map, map, cpu_online_map); - /* - * We waited 1 sec, if a CPU still did not call idle - * it may be because it is in idle and not waking up - * because it has nothing to do. - * Give all the remaining CPUS a kick. - */ - smp_call_function_mask(map, do_nothing, 0, 0); - } while (!cpus_empty(map)); - - set_cpus_allowed(current, tmp); -} -EXPORT_SYMBOL_GPL(cpu_idle_wait); - #ifdef CONFIG_HOTPLUG_CPU DECLARE_PER_CPU(int, cpu_state); @@ -207,19 +165,18 @@ static inline void play_dead(void) * low exit latency (ie sit in a loop waiting for * somebody to say that they'd like to reschedule) */ -void cpu_idle (void) +void cpu_idle(void) { current_thread_info()->status |= TS_POLLING; /* endless idle loop with no priority at all */ while (1) { + tick_nohz_stop_sched_tick(); while (!need_resched()) { void (*idle)(void); if (__get_cpu_var(cpu_idle_state)) __get_cpu_var(cpu_idle_state) = 0; - tick_nohz_stop_sched_tick(); - rmb(); idle = pm_idle; if (!idle) @@ -247,6 +204,47 @@ void cpu_idle (void) } } +static void do_nothing(void *unused) +{ +} + +void cpu_idle_wait(void) +{ + unsigned int cpu, this_cpu = get_cpu(); + cpumask_t map, tmp = current->cpus_allowed; + + set_cpus_allowed(current, cpumask_of_cpu(this_cpu)); + put_cpu(); + + cpus_clear(map); + for_each_online_cpu(cpu) { + per_cpu(cpu_idle_state, cpu) = 1; + cpu_set(cpu, map); + } + + __get_cpu_var(cpu_idle_state) = 0; + + wmb(); + do { + ssleep(1); + for_each_online_cpu(cpu) { + if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu)) + cpu_clear(cpu, map); + } + cpus_and(map, map, cpu_online_map); + /* + * We waited 1 sec, if a CPU still did not call idle + * it may be because it is in idle and not waking up + * because it has nothing to do. + * Give all the remaining CPUS a kick. + */ + smp_call_function_mask(map, do_nothing, 0, 0); + } while (!cpus_empty(map)); + + set_cpus_allowed(current, tmp); +} +EXPORT_SYMBOL_GPL(cpu_idle_wait); + /* * This uses new MONITOR/MWAIT instructions on P4 processors with PNI, * which can obviate IPI to trigger checking of need_resched. 
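The reworked default_idle() above brackets the halt with ktime_get() so that sched_clock_idle_wakeup_event() can be told how long the CPU actually slept. A small user-space sketch of the same bracketing pattern, with clock_gettime() standing in for ktime_get() and a sleep standing in for HLT (illustrative only):

#include <stdio.h>
#include <time.h>
#include <unistd.h>

/* Measure how long a blocking call actually took, in nanoseconds,
 * mirroring the t0/t1 bracketing around safe_halt() above. */
static long long blocked_ns(void)
{
    struct timespec t0, t1;

    clock_gettime(CLOCK_MONOTONIC, &t0);
    usleep(10 * 1000);                  /* stand-in for the HLT instruction */
    clock_gettime(CLOCK_MONOTONIC, &t1);

    return (t1.tv_sec - t0.tv_sec) * 1000000000LL +
           (t1.tv_nsec - t0.tv_nsec);
}

int main(void)
{
    printf("slept for %lld ns\n", blocked_ns());
    return 0;
}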
@@ -257,13 +255,13 @@ void cpu_idle (void) * New with Core Duo processors, MWAIT can take some hints based on CPU * capability. */ -void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) +void mwait_idle_with_hints(unsigned long ax, unsigned long cx) { if (!need_resched()) { __monitor((void *)¤t_thread_info()->flags, 0, 0); smp_mb(); if (!need_resched()) - __mwait(eax, ecx); + __mwait(ax, cx); } } @@ -282,25 +280,41 @@ static void mwait_idle(void) } } + +static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c) +{ + if (force_mwait) + return 1; + /* Any C1 states supported? */ + return c->cpuid_level >= 5 && ((cpuid_edx(5) >> 4) & 0xf) > 0; +} + void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) { - static int printed; - if (cpu_has(c, X86_FEATURE_MWAIT)) { + static int selected; + + if (selected) + return; +#ifdef CONFIG_X86_SMP + if (pm_idle == poll_idle && smp_num_siblings > 1) { + printk(KERN_WARNING "WARNING: polling idle and HT enabled," + " performance may degrade.\n"); + } +#endif + if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) { /* * Skip, if setup has overridden idle. * One CPU supports mwait => All CPUs supports mwait */ if (!pm_idle) { - if (!printed) { - printk(KERN_INFO "using mwait in idle threads.\n"); - printed = 1; - } + printk(KERN_INFO "using mwait in idle threads.\n"); pm_idle = mwait_idle; } } + selected = 1; } -static int __init idle_setup (char *str) +static int __init idle_setup(char *str) { if (!strcmp(str, "poll")) { printk("using polling idle threads.\n"); @@ -315,13 +329,13 @@ static int __init idle_setup (char *str) } early_param("idle", idle_setup); -/* Prints also some state that isn't saved in the pt_regs */ +/* Prints also some state that isn't saved in the pt_regs */ void __show_regs(struct pt_regs * regs) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; unsigned long d0, d1, d2, d3, d6, d7; - unsigned int fsindex,gsindex; - unsigned int ds,cs,es; + unsigned int fsindex, gsindex; + unsigned int ds, cs, es; printk("\n"); print_modules(); @@ -330,16 +344,16 @@ void __show_regs(struct pt_regs * regs) init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version); - printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip); - printk_address(regs->rip); - printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, - regs->eflags); + printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip); + printk_address(regs->ip, 1); + printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->sp, + regs->flags); printk("RAX: %016lx RBX: %016lx RCX: %016lx\n", - regs->rax, regs->rbx, regs->rcx); + regs->ax, regs->bx, regs->cx); printk("RDX: %016lx RSI: %016lx RDI: %016lx\n", - regs->rdx, regs->rsi, regs->rdi); + regs->dx, regs->si, regs->di); printk("RBP: %016lx R08: %016lx R09: %016lx\n", - regs->rbp, regs->r8, regs->r9); + regs->bp, regs->r8, regs->r9); printk("R10: %016lx R11: %016lx R12: %016lx\n", regs->r10, regs->r11, regs->r12); printk("R13: %016lx R14: %016lx R15: %016lx\n", @@ -379,7 +393,7 @@ void show_regs(struct pt_regs *regs) { printk("CPU %d:", smp_processor_id()); __show_regs(regs); - show_trace(NULL, regs, (void *)(regs + 1)); + show_trace(NULL, regs, (void *)(regs + 1), regs->bp); } /* @@ -390,7 +404,7 @@ void exit_thread(void) struct task_struct *me = current; struct thread_struct *t = &me->thread; - if (me->thread.io_bitmap_ptr) { + if (me->thread.io_bitmap_ptr) { struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); kfree(t->io_bitmap_ptr); @@ 
-426,7 +440,7 @@ void flush_thread(void) tsk->thread.debugreg3 = 0; tsk->thread.debugreg6 = 0; tsk->thread.debugreg7 = 0; - memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); + memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); /* * Forget coprocessor state.. */ @@ -449,26 +463,21 @@ void release_thread(struct task_struct *dead_task) static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr) { - struct user_desc ud = { + struct user_desc ud = { .base_addr = addr, .limit = 0xfffff, .seg_32bit = 1, .limit_in_pages = 1, .useable = 1, }; - struct n_desc_struct *desc = (void *)t->thread.tls_array; + struct desc_struct *desc = t->thread.tls_array; desc += tls; - desc->a = LDT_entry_a(&ud); - desc->b = LDT_entry_b(&ud); + fill_ldt(desc, &ud); } static inline u32 read_32bit_tls(struct task_struct *t, int tls) { - struct desc_struct *desc = (void *)t->thread.tls_array; - desc += tls; - return desc->base0 | - (((u32)desc->base1) << 16) | - (((u32)desc->base2) << 24); + return get_desc_base(&t->thread.tls_array[tls]); } /* @@ -480,7 +489,7 @@ void prepare_to_copy(struct task_struct *tsk) unlazy_fpu(tsk); } -int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp, +int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, unsigned long unused, struct task_struct * p, struct pt_regs * regs) { @@ -492,14 +501,14 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp, (THREAD_SIZE + task_stack_page(p))) - 1; *childregs = *regs; - childregs->rax = 0; - childregs->rsp = rsp; - if (rsp == ~0UL) - childregs->rsp = (unsigned long)childregs; + childregs->ax = 0; + childregs->sp = sp; + if (sp == ~0UL) + childregs->sp = (unsigned long)childregs; - p->thread.rsp = (unsigned long) childregs; - p->thread.rsp0 = (unsigned long) (childregs+1); - p->thread.userrsp = me->thread.userrsp; + p->thread.sp = (unsigned long) childregs; + p->thread.sp0 = (unsigned long) (childregs+1); + p->thread.usersp = me->thread.usersp; set_tsk_thread_flag(p, TIF_FORK); @@ -520,7 +529,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp, memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, IO_BITMAP_BYTES); set_tsk_thread_flag(p, TIF_IO_BITMAP); - } + } /* * Set a new TLS for the child thread? 
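The old read_32bit_tls() above reassembled the 32-bit segment base by hand from the descriptor's split base fields; the new code hides that behind get_desc_base(). A standalone sketch of that reassembly (the struct layout here is illustrative, not the kernel's desc_struct):

#include <stdint.h>
#include <stdio.h>

/* Legacy segment descriptor: the 32-bit base is scattered across three
 * fields of the 8-byte descriptor. */
struct legacy_desc {
    uint16_t limit0;
    uint16_t base0;         /* base bits  0..15 */
    uint8_t  base1;         /* base bits 16..23 */
    uint8_t  type_attrs;
    uint8_t  limit1_flags;
    uint8_t  base2;         /* base bits 24..31 */
};

static uint32_t desc_base(const struct legacy_desc *d)
{
    return d->base0 | ((uint32_t)d->base1 << 16) | ((uint32_t)d->base2 << 24);
}

int main(void)
{
    struct legacy_desc d = { .base0 = 0x5678, .base1 = 0x34, .base2 = 0x12 };

    printf("reassembled base = %#x\n", desc_base(&d));  /* prints 0x12345678 */
    return 0;
}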
@@ -528,7 +537,8 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp, if (clone_flags & CLONE_SETTLS) { #ifdef CONFIG_IA32_EMULATION if (test_thread_flag(TIF_IA32)) - err = ia32_child_tls(p, childregs); + err = do_set_thread_area(p, -1, + (struct user_desc __user *)childregs->si, 0); else #endif err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8); @@ -547,17 +557,30 @@ out: /* * This special macro can be used to load a debugging register */ -#define loaddebug(thread,r) set_debugreg(thread->debugreg ## r, r) +#define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r) static inline void __switch_to_xtra(struct task_struct *prev_p, - struct task_struct *next_p, - struct tss_struct *tss) + struct task_struct *next_p, + struct tss_struct *tss) { struct thread_struct *prev, *next; + unsigned long debugctl; prev = &prev_p->thread, next = &next_p->thread; + debugctl = prev->debugctlmsr; + if (next->ds_area_msr != prev->ds_area_msr) { + /* we clear debugctl to make sure DS + * is not in use when we change it */ + debugctl = 0; + wrmsrl(MSR_IA32_DEBUGCTLMSR, 0); + wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr); + } + + if (next->debugctlmsr != debugctl) + wrmsrl(MSR_IA32_DEBUGCTLMSR, next->debugctlmsr); + if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { loaddebug(next, 0); loaddebug(next, 1); @@ -581,12 +604,18 @@ static inline void __switch_to_xtra(struct task_struct *prev_p, */ memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); } + + if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) + ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); + + if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) + ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); } /* * switch_to(x,y) should switch tasks from x to y. * - * This could still be optimized: + * This could still be optimized: * - fold all the options into a flag word and test it with a single test. * - could test fs/gs bitsliced * @@ -597,7 +626,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread, *next = &next_p->thread; - int cpu = smp_processor_id(); + int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(init_tss, cpu); /* we're going to use this soon, after a few expensive things */ @@ -607,7 +636,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* * Reload esp0, LDT and the page table pointer: */ - tss->rsp0 = next->rsp0; + load_sp0(tss, next); /* * Switch DS and ES. @@ -666,8 +695,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* * Switch the PDA and FPU contexts. */ - prev->userrsp = read_pda(oldrsp); - write_pda(oldrsp, next->userrsp); + prev->usersp = read_pda(oldrsp); + write_pda(oldrsp, next->usersp); write_pda(pcurrent, next_p); write_pda(kernelstack, @@ -684,8 +713,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* * Now maybe reload the debug registers and handle I/O bitmaps */ - if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW)) - || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) + if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT || + task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV)) __switch_to_xtra(prev_p, next_p, tss); /* If the task has used fpu the last 5 timeslices, just do a full @@ -700,7 +729,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* * sys_execve() executes a new program. 
*/ -asmlinkage +asmlinkage long sys_execve(char __user *name, char __user * __user *argv, char __user * __user *envp, struct pt_regs regs) { @@ -712,11 +741,6 @@ long sys_execve(char __user *name, char __user * __user *argv, if (IS_ERR(filename)) return error; error = do_execve(filename, argv, envp, ®s); - if (error == 0) { - task_lock(current); - current->ptrace &= ~PT_DTRACE; - task_unlock(current); - } putname(filename); return error; } @@ -726,18 +750,18 @@ void set_personality_64bit(void) /* inherit personality from parent */ /* Make sure to be in 64bit mode */ - clear_thread_flag(TIF_IA32); + clear_thread_flag(TIF_IA32); /* TBD: overwrites user setup. Should have two bits. But 64bit processes have always behaved this way, so it's not too bad. The main problem is just that - 32bit childs are affected again. */ + 32bit childs are affected again. */ current->personality &= ~READ_IMPLIES_EXEC; } asmlinkage long sys_fork(struct pt_regs *regs) { - return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL); + return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL); } asmlinkage long @@ -745,7 +769,7 @@ sys_clone(unsigned long clone_flags, unsigned long newsp, void __user *parent_tid, void __user *child_tid, struct pt_regs *regs) { if (!newsp) - newsp = regs->rsp; + newsp = regs->sp; return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); } @@ -761,29 +785,29 @@ sys_clone(unsigned long clone_flags, unsigned long newsp, */ asmlinkage long sys_vfork(struct pt_regs *regs) { - return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0, + return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0, NULL, NULL); } unsigned long get_wchan(struct task_struct *p) { unsigned long stack; - u64 fp,rip; + u64 fp,ip; int count = 0; if (!p || p == current || p->state==TASK_RUNNING) return 0; stack = (unsigned long)task_stack_page(p); - if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE) + if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE) return 0; - fp = *(u64 *)(p->thread.rsp); + fp = *(u64 *)(p->thread.sp); do { if (fp < (unsigned long)stack || fp > (unsigned long)stack+THREAD_SIZE) return 0; - rip = *(u64 *)(fp+8); - if (!in_sched_functions(rip)) - return rip; + ip = *(u64 *)(fp+8); + if (!in_sched_functions(ip)) + return ip; fp = *(u64 *)fp; } while (count++ < 16); return 0; @@ -824,19 +848,19 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) /* Not strictly needed for fs, but do it for symmetry with gs */ if (addr >= TASK_SIZE_OF(task)) - return -EPERM; + return -EPERM; cpu = get_cpu(); - /* handle small bases via the GDT because that's faster to + /* handle small bases via the GDT because that's faster to switch. 
*/ - if (addr <= 0xffffffff) { + if (addr <= 0xffffffff) { set_32bit_tls(task, FS_TLS, addr); - if (doit) { - load_TLS(&task->thread, cpu); + if (doit) { + load_TLS(&task->thread, cpu); asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL)); } task->thread.fsindex = FS_TLS_SEL; task->thread.fs = 0; - } else { + } else { task->thread.fsindex = 0; task->thread.fs = addr; if (doit) { @@ -848,24 +872,24 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) } put_cpu(); break; - case ARCH_GET_FS: { - unsigned long base; + case ARCH_GET_FS: { + unsigned long base; if (task->thread.fsindex == FS_TLS_SEL) base = read_32bit_tls(task, FS_TLS); else if (doit) rdmsrl(MSR_FS_BASE, base); else base = task->thread.fs; - ret = put_user(base, (unsigned long __user *)addr); - break; + ret = put_user(base, (unsigned long __user *)addr); + break; } - case ARCH_GET_GS: { + case ARCH_GET_GS: { unsigned long base; unsigned gsindex; if (task->thread.gsindex == GS_TLS_SEL) base = read_32bit_tls(task, GS_TLS); else if (doit) { - asm("movl %%gs,%0" : "=r" (gsindex)); + asm("movl %%gs,%0" : "=r" (gsindex)); if (gsindex) rdmsrl(MSR_KERNEL_GS_BASE, base); else @@ -873,39 +897,21 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) } else base = task->thread.gs; - ret = put_user(base, (unsigned long __user *)addr); + ret = put_user(base, (unsigned long __user *)addr); break; } default: ret = -EINVAL; break; - } + } - return ret; -} + return ret; +} long sys_arch_prctl(int code, unsigned long addr) { return do_arch_prctl(current, code, addr); -} - -/* - * Capture the user space registers if the task is not running (in user space) - */ -int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs) -{ - struct pt_regs *pp, ptregs; - - pp = task_pt_regs(tsk); - - ptregs = *pp; - ptregs.cs &= 0xffff; - ptregs.ss &= 0xffff; - - elf_core_copy_regs(regs, &ptregs); - - return 1; } unsigned long arch_align_stack(unsigned long sp) @@ -914,3 +920,9 @@ unsigned long arch_align_stack(unsigned long sp) sp -= get_random_int() % 8192; return sp & ~0xf; } + +unsigned long arch_randomize_brk(struct mm_struct *mm) +{ + unsigned long range_end = mm->brk + 0x02000000; + return randomize_range(mm->brk, range_end, 0) ? : mm->brk; +} diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c new file mode 100644 index 000000000000..96286df1bb81 --- /dev/null +++ b/arch/x86/kernel/ptrace.c @@ -0,0 +1,1545 @@ +/* By Ross Biro 1/23/92 */ +/* + * Pentium III FXSR, SSE support + * Gareth Hughes <gareth@valinux.com>, May 2000 + * + * BTS tracing + * Markus Metzger <markus.t.metzger@intel.com>, Dec 2007 + */ + +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/smp.h> +#include <linux/errno.h> +#include <linux/ptrace.h> +#include <linux/regset.h> +#include <linux/user.h> +#include <linux/elf.h> +#include <linux/security.h> +#include <linux/audit.h> +#include <linux/seccomp.h> +#include <linux/signal.h> + +#include <asm/uaccess.h> +#include <asm/pgtable.h> +#include <asm/system.h> +#include <asm/processor.h> +#include <asm/i387.h> +#include <asm/debugreg.h> +#include <asm/ldt.h> +#include <asm/desc.h> +#include <asm/prctl.h> +#include <asm/proto.h> +#include <asm/ds.h> + +#include "tls.h" + +enum x86_regset { + REGSET_GENERAL, + REGSET_FP, + REGSET_XFP, + REGSET_TLS, +}; + +/* + * does not yet catch signals sent when the child dies. + * in exit.c or in signal.c. + */ + +/* + * Determines which flags the user has access to [1 = access, 0 = no access]. 
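arch_randomize_brk(), added above for both 32-bit and 64-bit, picks the heap start from a 32 MiB (0x02000000) window above the unrandomized brk via randomize_range(), falling back to the original brk if that fails. A rough user-space sketch of the same policy; rand() stands in for the kernel's entropy source and the alignment handling is simplified:

#include <stdio.h>
#include <stdlib.h>

#define BRK_WINDOW 0x02000000UL         /* 32 MiB randomization window */
#define PAGE_MASK  (~0xfffUL)           /* keep the offset page aligned */

static unsigned long randomize_brk(unsigned long brk)
{
    unsigned long offset = ((unsigned long)rand() % BRK_WINDOW) & PAGE_MASK;
    return brk + offset;
}

int main(void)
{
    unsigned long brk = 0x555555600000UL;   /* example unrandomized brk */

    printf("randomized brk: %#lx\n", randomize_brk(brk));
    return 0;
}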
+ */ +#define FLAG_MASK_32 ((unsigned long) \ + (X86_EFLAGS_CF | X86_EFLAGS_PF | \ + X86_EFLAGS_AF | X86_EFLAGS_ZF | \ + X86_EFLAGS_SF | X86_EFLAGS_TF | \ + X86_EFLAGS_DF | X86_EFLAGS_OF | \ + X86_EFLAGS_RF | X86_EFLAGS_AC)) + +/* + * Determines whether a value may be installed in a segment register. + */ +static inline bool invalid_selector(u16 value) +{ + return unlikely(value != 0 && (value & SEGMENT_RPL_MASK) != USER_RPL); +} + +#ifdef CONFIG_X86_32 + +#define FLAG_MASK FLAG_MASK_32 + +static long *pt_regs_access(struct pt_regs *regs, unsigned long regno) +{ + BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0); + regno >>= 2; + if (regno > FS) + --regno; + return ®s->bx + regno; +} + +static u16 get_segment_reg(struct task_struct *task, unsigned long offset) +{ + /* + * Returning the value truncates it to 16 bits. + */ + unsigned int retval; + if (offset != offsetof(struct user_regs_struct, gs)) + retval = *pt_regs_access(task_pt_regs(task), offset); + else { + retval = task->thread.gs; + if (task == current) + savesegment(gs, retval); + } + return retval; +} + +static int set_segment_reg(struct task_struct *task, + unsigned long offset, u16 value) +{ + /* + * The value argument was already truncated to 16 bits. + */ + if (invalid_selector(value)) + return -EIO; + + if (offset != offsetof(struct user_regs_struct, gs)) + *pt_regs_access(task_pt_regs(task), offset) = value; + else { + task->thread.gs = value; + if (task == current) + /* + * The user-mode %gs is not affected by + * kernel entry, so we must update the CPU. + */ + loadsegment(gs, value); + } + + return 0; +} + +static unsigned long debugreg_addr_limit(struct task_struct *task) +{ + return TASK_SIZE - 3; +} + +#else /* CONFIG_X86_64 */ + +#define FLAG_MASK (FLAG_MASK_32 | X86_EFLAGS_NT) + +static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long offset) +{ + BUILD_BUG_ON(offsetof(struct pt_regs, r15) != 0); + return ®s->r15 + (offset / sizeof(regs->r15)); +} + +static u16 get_segment_reg(struct task_struct *task, unsigned long offset) +{ + /* + * Returning the value truncates it to 16 bits. + */ + unsigned int seg; + + switch (offset) { + case offsetof(struct user_regs_struct, fs): + if (task == current) { + /* Older gas can't assemble movq %?s,%r?? */ + asm("movl %%fs,%0" : "=r" (seg)); + return seg; + } + return task->thread.fsindex; + case offsetof(struct user_regs_struct, gs): + if (task == current) { + asm("movl %%gs,%0" : "=r" (seg)); + return seg; + } + return task->thread.gsindex; + case offsetof(struct user_regs_struct, ds): + if (task == current) { + asm("movl %%ds,%0" : "=r" (seg)); + return seg; + } + return task->thread.ds; + case offsetof(struct user_regs_struct, es): + if (task == current) { + asm("movl %%es,%0" : "=r" (seg)); + return seg; + } + return task->thread.es; + + case offsetof(struct user_regs_struct, cs): + case offsetof(struct user_regs_struct, ss): + break; + } + return *pt_regs_access(task_pt_regs(task), offset); +} + +static int set_segment_reg(struct task_struct *task, + unsigned long offset, u16 value) +{ + /* + * The value argument was already truncated to 16 bits. + */ + if (invalid_selector(value)) + return -EIO; + + switch (offset) { + case offsetof(struct user_regs_struct,fs): + /* + * If this is setting fs as for normal 64-bit use but + * setting fs_base has implicitly changed it, leave it. 
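invalid_selector() above encodes what a debugger may install in a segment register: the null selector is always acceptable, and anything else must request privilege level 3 (user mode) in its low two bits. A tiny standalone check of the same rule; the mask constants are written out here for illustration (SEGMENT_RPL_MASK and USER_RPL are both 3 on x86):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define RPL_MASK 0x3
#define USER_RPL 0x3

static bool invalid_selector(uint16_t value)
{
    return value != 0 && (value & RPL_MASK) != USER_RPL;
}

int main(void)
{
    printf("0x0000 invalid? %d\n", invalid_selector(0x0000)); /* 0: null is fine   */
    printf("0x0023 invalid? %d\n", invalid_selector(0x0023)); /* 0: RPL 3 is fine  */
    printf("0x0010 invalid? %d\n", invalid_selector(0x0010)); /* 1: RPL 0 rejected */
    return 0;
}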
+ */ + if ((value == FS_TLS_SEL && task->thread.fsindex == 0 && + task->thread.fs != 0) || + (value == 0 && task->thread.fsindex == FS_TLS_SEL && + task->thread.fs == 0)) + break; + task->thread.fsindex = value; + if (task == current) + loadsegment(fs, task->thread.fsindex); + break; + case offsetof(struct user_regs_struct,gs): + /* + * If this is setting gs as for normal 64-bit use but + * setting gs_base has implicitly changed it, leave it. + */ + if ((value == GS_TLS_SEL && task->thread.gsindex == 0 && + task->thread.gs != 0) || + (value == 0 && task->thread.gsindex == GS_TLS_SEL && + task->thread.gs == 0)) + break; + task->thread.gsindex = value; + if (task == current) + load_gs_index(task->thread.gsindex); + break; + case offsetof(struct user_regs_struct,ds): + task->thread.ds = value; + if (task == current) + loadsegment(ds, task->thread.ds); + break; + case offsetof(struct user_regs_struct,es): + task->thread.es = value; + if (task == current) + loadsegment(es, task->thread.es); + break; + + /* + * Can't actually change these in 64-bit mode. + */ + case offsetof(struct user_regs_struct,cs): +#ifdef CONFIG_IA32_EMULATION + if (test_tsk_thread_flag(task, TIF_IA32)) + task_pt_regs(task)->cs = value; +#endif + break; + case offsetof(struct user_regs_struct,ss): +#ifdef CONFIG_IA32_EMULATION + if (test_tsk_thread_flag(task, TIF_IA32)) + task_pt_regs(task)->ss = value; +#endif + break; + } + + return 0; +} + +static unsigned long debugreg_addr_limit(struct task_struct *task) +{ +#ifdef CONFIG_IA32_EMULATION + if (test_tsk_thread_flag(task, TIF_IA32)) + return IA32_PAGE_OFFSET - 3; +#endif + return TASK_SIZE64 - 7; +} + +#endif /* CONFIG_X86_32 */ + +static unsigned long get_flags(struct task_struct *task) +{ + unsigned long retval = task_pt_regs(task)->flags; + + /* + * If the debugger set TF, hide it from the readout. + */ + if (test_tsk_thread_flag(task, TIF_FORCED_TF)) + retval &= ~X86_EFLAGS_TF; + + return retval; +} + +static int set_flags(struct task_struct *task, unsigned long value) +{ + struct pt_regs *regs = task_pt_regs(task); + + /* + * If the user value contains TF, mark that + * it was not "us" (the debugger) that set it. + * If not, make sure it stays set if we had. + */ + if (value & X86_EFLAGS_TF) + clear_tsk_thread_flag(task, TIF_FORCED_TF); + else if (test_tsk_thread_flag(task, TIF_FORCED_TF)) + value |= X86_EFLAGS_TF; + + regs->flags = (regs->flags & ~FLAG_MASK) | (value & FLAG_MASK); + + return 0; +} + +static int putreg(struct task_struct *child, + unsigned long offset, unsigned long value) +{ + switch (offset) { + case offsetof(struct user_regs_struct, cs): + case offsetof(struct user_regs_struct, ds): + case offsetof(struct user_regs_struct, es): + case offsetof(struct user_regs_struct, fs): + case offsetof(struct user_regs_struct, gs): + case offsetof(struct user_regs_struct, ss): + return set_segment_reg(child, offset, value); + + case offsetof(struct user_regs_struct, flags): + return set_flags(child, value); + +#ifdef CONFIG_X86_64 + case offsetof(struct user_regs_struct,fs_base): + if (value >= TASK_SIZE_OF(child)) + return -EIO; + /* + * When changing the segment base, use do_arch_prctl + * to set either thread.fs or thread.fsindex and the + * corresponding GDT slot. + */ + if (child->thread.fs != value) + return do_arch_prctl(child, ARCH_SET_FS, value); + return 0; + case offsetof(struct user_regs_struct,gs_base): + /* + * Exactly the same here as the %fs handling above. 
+ */ + if (value >= TASK_SIZE_OF(child)) + return -EIO; + if (child->thread.gs != value) + return do_arch_prctl(child, ARCH_SET_GS, value); + return 0; +#endif + } + + *pt_regs_access(task_pt_regs(child), offset) = value; + return 0; +} + +static unsigned long getreg(struct task_struct *task, unsigned long offset) +{ + switch (offset) { + case offsetof(struct user_regs_struct, cs): + case offsetof(struct user_regs_struct, ds): + case offsetof(struct user_regs_struct, es): + case offsetof(struct user_regs_struct, fs): + case offsetof(struct user_regs_struct, gs): + case offsetof(struct user_regs_struct, ss): + return get_segment_reg(task, offset); + + case offsetof(struct user_regs_struct, flags): + return get_flags(task); + +#ifdef CONFIG_X86_64 + case offsetof(struct user_regs_struct, fs_base): { + /* + * do_arch_prctl may have used a GDT slot instead of + * the MSR. To userland, it appears the same either + * way, except the %fs segment selector might not be 0. + */ + unsigned int seg = task->thread.fsindex; + if (task->thread.fs != 0) + return task->thread.fs; + if (task == current) + asm("movl %%fs,%0" : "=r" (seg)); + if (seg != FS_TLS_SEL) + return 0; + return get_desc_base(&task->thread.tls_array[FS_TLS]); + } + case offsetof(struct user_regs_struct, gs_base): { + /* + * Exactly the same here as the %fs handling above. + */ + unsigned int seg = task->thread.gsindex; + if (task->thread.gs != 0) + return task->thread.gs; + if (task == current) + asm("movl %%gs,%0" : "=r" (seg)); + if (seg != GS_TLS_SEL) + return 0; + return get_desc_base(&task->thread.tls_array[GS_TLS]); + } +#endif + } + + return *pt_regs_access(task_pt_regs(task), offset); +} + +static int genregs_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + if (kbuf) { + unsigned long *k = kbuf; + while (count > 0) { + *k++ = getreg(target, pos); + count -= sizeof(*k); + pos += sizeof(*k); + } + } else { + unsigned long __user *u = ubuf; + while (count > 0) { + if (__put_user(getreg(target, pos), u++)) + return -EFAULT; + count -= sizeof(*u); + pos += sizeof(*u); + } + } + + return 0; +} + +static int genregs_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret = 0; + if (kbuf) { + const unsigned long *k = kbuf; + while (count > 0 && !ret) { + ret = putreg(target, pos, *k++); + count -= sizeof(*k); + pos += sizeof(*k); + } + } else { + const unsigned long __user *u = ubuf; + while (count > 0 && !ret) { + unsigned long word; + ret = __get_user(word, u++); + if (ret) + break; + ret = putreg(target, pos, word); + count -= sizeof(*u); + pos += sizeof(*u); + } + } + return ret; +} + +/* + * This function is trivial and will be inlined by the compiler. + * Having it separates the implementation details of debug + * registers from the interface details of ptrace. 
+ */ +static unsigned long ptrace_get_debugreg(struct task_struct *child, int n) +{ + switch (n) { + case 0: return child->thread.debugreg0; + case 1: return child->thread.debugreg1; + case 2: return child->thread.debugreg2; + case 3: return child->thread.debugreg3; + case 6: return child->thread.debugreg6; + case 7: return child->thread.debugreg7; + } + return 0; +} + +static int ptrace_set_debugreg(struct task_struct *child, + int n, unsigned long data) +{ + int i; + + if (unlikely(n == 4 || n == 5)) + return -EIO; + + if (n < 4 && unlikely(data >= debugreg_addr_limit(child))) + return -EIO; + + switch (n) { + case 0: child->thread.debugreg0 = data; break; + case 1: child->thread.debugreg1 = data; break; + case 2: child->thread.debugreg2 = data; break; + case 3: child->thread.debugreg3 = data; break; + + case 6: + if ((data & ~0xffffffffUL) != 0) + return -EIO; + child->thread.debugreg6 = data; + break; + + case 7: + /* + * Sanity-check data. Take one half-byte at once with + * check = (val >> (16 + 4*i)) & 0xf. It contains the + * R/Wi and LENi bits; bits 0 and 1 are R/Wi, and bits + * 2 and 3 are LENi. Given a list of invalid values, + * we do mask |= 1 << invalid_value, so that + * (mask >> check) & 1 is a correct test for invalid + * values. + * + * R/Wi contains the type of the breakpoint / + * watchpoint, LENi contains the length of the watched + * data in the watchpoint case. + * + * The invalid values are: + * - LENi == 0x10 (undefined), so mask |= 0x0f00. [32-bit] + * - R/Wi == 0x10 (break on I/O reads or writes), so + * mask |= 0x4444. + * - R/Wi == 0x00 && LENi != 0x00, so we have mask |= + * 0x1110. + * + * Finally, mask = 0x0f00 | 0x4444 | 0x1110 == 0x5f54. + * + * See the Intel Manual "System Programming Guide", + * 15.2.4 + * + * Note that LENi == 0x10 is defined on x86_64 in long + * mode (i.e. even for 32-bit userspace software, but + * 64-bit kernel), so the x86_64 mask value is 0x5454. + * See the AMD manual no. 
24593 (AMD64 System Programming) + */ +#ifdef CONFIG_X86_32 +#define DR7_MASK 0x5f54 +#else +#define DR7_MASK 0x5554 +#endif + data &= ~DR_CONTROL_RESERVED; + for (i = 0; i < 4; i++) + if ((DR7_MASK >> ((data >> (16 + 4*i)) & 0xf)) & 1) + return -EIO; + child->thread.debugreg7 = data; + if (data) + set_tsk_thread_flag(child, TIF_DEBUG); + else + clear_tsk_thread_flag(child, TIF_DEBUG); + break; + } + + return 0; +} + +static int ptrace_bts_get_size(struct task_struct *child) +{ + if (!child->thread.ds_area_msr) + return -ENXIO; + + return ds_get_bts_index((void *)child->thread.ds_area_msr); +} + +static int ptrace_bts_read_record(struct task_struct *child, + long index, + struct bts_struct __user *out) +{ + struct bts_struct ret; + int retval; + int bts_end; + int bts_index; + + if (!child->thread.ds_area_msr) + return -ENXIO; + + if (index < 0) + return -EINVAL; + + bts_end = ds_get_bts_end((void *)child->thread.ds_area_msr); + if (bts_end <= index) + return -EINVAL; + + /* translate the ptrace bts index into the ds bts index */ + bts_index = ds_get_bts_index((void *)child->thread.ds_area_msr); + bts_index -= (index + 1); + if (bts_index < 0) + bts_index += bts_end; + + retval = ds_read_bts((void *)child->thread.ds_area_msr, + bts_index, &ret); + if (retval < 0) + return retval; + + if (copy_to_user(out, &ret, sizeof(ret))) + return -EFAULT; + + return sizeof(ret); +} + +static int ptrace_bts_write_record(struct task_struct *child, + const struct bts_struct *in) +{ + int retval; + + if (!child->thread.ds_area_msr) + return -ENXIO; + + retval = ds_write_bts((void *)child->thread.ds_area_msr, in); + if (retval) + return retval; + + return sizeof(*in); +} + +static int ptrace_bts_clear(struct task_struct *child) +{ + if (!child->thread.ds_area_msr) + return -ENXIO; + + return ds_clear((void *)child->thread.ds_area_msr); +} + +static int ptrace_bts_drain(struct task_struct *child, + long size, + struct bts_struct __user *out) +{ + int end, i; + void *ds = (void *)child->thread.ds_area_msr; + + if (!ds) + return -ENXIO; + + end = ds_get_bts_index(ds); + if (end <= 0) + return end; + + if (size < (end * sizeof(struct bts_struct))) + return -EIO; + + for (i = 0; i < end; i++, out++) { + struct bts_struct ret; + int retval; + + retval = ds_read_bts(ds, i, &ret); + if (retval < 0) + return retval; + + if (copy_to_user(out, &ret, sizeof(ret))) + return -EFAULT; + } + + ds_clear(ds); + + return end; +} + +static int ptrace_bts_realloc(struct task_struct *child, + int size, int reduce_size) +{ + unsigned long rlim, vm; + int ret, old_size; + + if (size < 0) + return -EINVAL; + + old_size = ds_get_bts_size((void *)child->thread.ds_area_msr); + if (old_size < 0) + return old_size; + + ret = ds_free((void **)&child->thread.ds_area_msr); + if (ret < 0) + goto out; + + size >>= PAGE_SHIFT; + old_size >>= PAGE_SHIFT; + + current->mm->total_vm -= old_size; + current->mm->locked_vm -= old_size; + + if (size == 0) + goto out; + + rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; + vm = current->mm->total_vm + size; + if (rlim < vm) { + ret = -ENOMEM; + + if (!reduce_size) + goto out; + + size = rlim - current->mm->total_vm; + if (size <= 0) + goto out; + } + + rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; + vm = current->mm->locked_vm + size; + if (rlim < vm) { + ret = -ENOMEM; + + if (!reduce_size) + goto out; + + size = rlim - current->mm->locked_vm; + if (size <= 0) + goto out; + } + + ret = ds_allocate((void **)&child->thread.ds_area_msr, + size << PAGE_SHIFT); + if 
(ret < 0) + goto out; + + current->mm->total_vm += size; + current->mm->locked_vm += size; + +out: + if (child->thread.ds_area_msr) + set_tsk_thread_flag(child, TIF_DS_AREA_MSR); + else + clear_tsk_thread_flag(child, TIF_DS_AREA_MSR); + + return ret; +} + +static int ptrace_bts_config(struct task_struct *child, + long cfg_size, + const struct ptrace_bts_config __user *ucfg) +{ + struct ptrace_bts_config cfg; + int bts_size, ret = 0; + void *ds; + + if (cfg_size < sizeof(cfg)) + return -EIO; + + if (copy_from_user(&cfg, ucfg, sizeof(cfg))) + return -EFAULT; + + if ((int)cfg.size < 0) + return -EINVAL; + + bts_size = 0; + ds = (void *)child->thread.ds_area_msr; + if (ds) { + bts_size = ds_get_bts_size(ds); + if (bts_size < 0) + return bts_size; + } + cfg.size = PAGE_ALIGN(cfg.size); + + if (bts_size != cfg.size) { + ret = ptrace_bts_realloc(child, cfg.size, + cfg.flags & PTRACE_BTS_O_CUT_SIZE); + if (ret < 0) + goto errout; + + ds = (void *)child->thread.ds_area_msr; + } + + if (cfg.flags & PTRACE_BTS_O_SIGNAL) + ret = ds_set_overflow(ds, DS_O_SIGNAL); + else + ret = ds_set_overflow(ds, DS_O_WRAP); + if (ret < 0) + goto errout; + + if (cfg.flags & PTRACE_BTS_O_TRACE) + child->thread.debugctlmsr |= ds_debugctl_mask(); + else + child->thread.debugctlmsr &= ~ds_debugctl_mask(); + + if (cfg.flags & PTRACE_BTS_O_SCHED) + set_tsk_thread_flag(child, TIF_BTS_TRACE_TS); + else + clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); + + ret = sizeof(cfg); + +out: + if (child->thread.debugctlmsr) + set_tsk_thread_flag(child, TIF_DEBUGCTLMSR); + else + clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); + + return ret; + +errout: + child->thread.debugctlmsr &= ~ds_debugctl_mask(); + clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); + goto out; +} + +static int ptrace_bts_status(struct task_struct *child, + long cfg_size, + struct ptrace_bts_config __user *ucfg) +{ + void *ds = (void *)child->thread.ds_area_msr; + struct ptrace_bts_config cfg; + + if (cfg_size < sizeof(cfg)) + return -EIO; + + memset(&cfg, 0, sizeof(cfg)); + + if (ds) { + cfg.size = ds_get_bts_size(ds); + + if (ds_get_overflow(ds) == DS_O_SIGNAL) + cfg.flags |= PTRACE_BTS_O_SIGNAL; + + if (test_tsk_thread_flag(child, TIF_DEBUGCTLMSR) && + child->thread.debugctlmsr & ds_debugctl_mask()) + cfg.flags |= PTRACE_BTS_O_TRACE; + + if (test_tsk_thread_flag(child, TIF_BTS_TRACE_TS)) + cfg.flags |= PTRACE_BTS_O_SCHED; + } + + cfg.bts_size = sizeof(struct bts_struct); + + if (copy_to_user(ucfg, &cfg, sizeof(cfg))) + return -EFAULT; + + return sizeof(cfg); +} + +void ptrace_bts_take_timestamp(struct task_struct *tsk, + enum bts_qualifier qualifier) +{ + struct bts_struct rec = { + .qualifier = qualifier, + .variant.jiffies = jiffies_64 + }; + + ptrace_bts_write_record(tsk, &rec); +} + +/* + * Called by kernel/ptrace.c when detaching.. + * + * Make sure the single step bit is not set. + */ +void ptrace_disable(struct task_struct *child) +{ + user_disable_single_step(child); +#ifdef TIF_SYSCALL_EMU + clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); +#endif + if (child->thread.ds_area_msr) { + ptrace_bts_realloc(child, 0, 0); + child->thread.debugctlmsr &= ~ds_debugctl_mask(); + if (!child->thread.debugctlmsr) + clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); + clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); + } +} + +#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION +static const struct user_regset_view user_x86_32_view; /* Initialized below. 
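Referring back to the DR7 sanity check in ptrace_set_debugreg() above: each breakpoint contributes one nibble holding R/Wi in its low two bits and LENi in its high two bits, and the mask gets bit n set when nibble value n is invalid. The arithmetic below reproduces 0x5f54 for 32-bit; for 64-bit, where an 8-byte LEN is legal, it gives 0x5554, which matches the DR7_MASK define above (the in-source comment's value of 0x5454 appears to be a typo):

#include <assert.h>
#include <stdio.h>

int main(void)
{
    unsigned mask32 = 0, mask64 = 0;

    for (unsigned nibble = 0; nibble < 16; nibble++) {
        unsigned rw  = nibble & 0x3;            /* R/Wi: breakpoint type */
        unsigned len = (nibble >> 2) & 0x3;     /* LENi: watched length  */

        if (rw == 2) {                          /* break on I/O: never allowed */
            mask32 |= 1u << nibble;
            mask64 |= 1u << nibble;
        }
        if (rw == 0 && len != 0) {              /* execute bp must use LEN 0   */
            mask32 |= 1u << nibble;
            mask64 |= 1u << nibble;
        }
        if (len == 2)                           /* 8-byte LEN: invalid only on 32-bit */
            mask32 |= 1u << nibble;
    }

    printf("32-bit mask %#x, 64-bit mask %#x\n", mask32, mask64);
    assert(mask32 == 0x5f54);                   /* DR7_MASK for CONFIG_X86_32 */
    assert(mask64 == 0x5554);                   /* DR7_MASK for CONFIG_X86_64 */
    return 0;
}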
*/ +#endif + +long arch_ptrace(struct task_struct *child, long request, long addr, long data) +{ + int ret; + unsigned long __user *datap = (unsigned long __user *)data; + + switch (request) { + /* read the word at location addr in the USER area. */ + case PTRACE_PEEKUSR: { + unsigned long tmp; + + ret = -EIO; + if ((addr & (sizeof(data) - 1)) || addr < 0 || + addr >= sizeof(struct user)) + break; + + tmp = 0; /* Default return condition */ + if (addr < sizeof(struct user_regs_struct)) + tmp = getreg(child, addr); + else if (addr >= offsetof(struct user, u_debugreg[0]) && + addr <= offsetof(struct user, u_debugreg[7])) { + addr -= offsetof(struct user, u_debugreg[0]); + tmp = ptrace_get_debugreg(child, addr / sizeof(data)); + } + ret = put_user(tmp, datap); + break; + } + + case PTRACE_POKEUSR: /* write the word at location addr in the USER area */ + ret = -EIO; + if ((addr & (sizeof(data) - 1)) || addr < 0 || + addr >= sizeof(struct user)) + break; + + if (addr < sizeof(struct user_regs_struct)) + ret = putreg(child, addr, data); + else if (addr >= offsetof(struct user, u_debugreg[0]) && + addr <= offsetof(struct user, u_debugreg[7])) { + addr -= offsetof(struct user, u_debugreg[0]); + ret = ptrace_set_debugreg(child, + addr / sizeof(data), data); + } + break; + + case PTRACE_GETREGS: /* Get all gp regs from the child. */ + return copy_regset_to_user(child, + task_user_regset_view(current), + REGSET_GENERAL, + 0, sizeof(struct user_regs_struct), + datap); + + case PTRACE_SETREGS: /* Set all gp regs in the child. */ + return copy_regset_from_user(child, + task_user_regset_view(current), + REGSET_GENERAL, + 0, sizeof(struct user_regs_struct), + datap); + + case PTRACE_GETFPREGS: /* Get the child FPU state. */ + return copy_regset_to_user(child, + task_user_regset_view(current), + REGSET_FP, + 0, sizeof(struct user_i387_struct), + datap); + + case PTRACE_SETFPREGS: /* Set the child FPU state. */ + return copy_regset_from_user(child, + task_user_regset_view(current), + REGSET_FP, + 0, sizeof(struct user_i387_struct), + datap); + +#ifdef CONFIG_X86_32 + case PTRACE_GETFPXREGS: /* Get the child extended FPU state. */ + return copy_regset_to_user(child, &user_x86_32_view, + REGSET_XFP, + 0, sizeof(struct user_fxsr_struct), + datap); + + case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */ + return copy_regset_from_user(child, &user_x86_32_view, + REGSET_XFP, + 0, sizeof(struct user_fxsr_struct), + datap); +#endif + +#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION + case PTRACE_GET_THREAD_AREA: + if (addr < 0) + return -EIO; + ret = do_get_thread_area(child, addr, + (struct user_desc __user *) data); + break; + + case PTRACE_SET_THREAD_AREA: + if (addr < 0) + return -EIO; + ret = do_set_thread_area(child, addr, + (struct user_desc __user *) data, 0); + break; +#endif + +#ifdef CONFIG_X86_64 + /* normal 64bit interface to access TLS data. + Works just like arch_prctl, except that the arguments + are reversed. 
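The arch_ptrace() switch above is the kernel side of requests like PTRACE_PEEKUSR, the regset-based GETREGS/SETREGS calls and the new BTS requests. For orientation, a minimal user-space tracer exercising the PTRACE_PEEKUSR path; register offsets come from <sys/user.h>, the x86-64 field name is used, and error handling is trimmed:

#include <stdio.h>
#include <stddef.h>
#include <signal.h>
#include <sys/ptrace.h>
#include <sys/user.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
    pid_t pid = fork();

    if (pid == 0) {                     /* child: ask to be traced, then stop */
        ptrace(PTRACE_TRACEME, 0, NULL, NULL);
        raise(SIGSTOP);
        _exit(0);
    }

    waitpid(pid, NULL, 0);              /* wait for the child to stop */

    long ip = ptrace(PTRACE_PEEKUSER, pid,
                     (void *)offsetof(struct user_regs_struct, rip), NULL);
    printf("child rip = %#lx\n", ip);

    ptrace(PTRACE_DETACH, pid, NULL, NULL);
    return 0;
}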
*/ + case PTRACE_ARCH_PRCTL: + ret = do_arch_prctl(child, data, addr); + break; +#endif + + case PTRACE_BTS_CONFIG: + ret = ptrace_bts_config + (child, data, (struct ptrace_bts_config __user *)addr); + break; + + case PTRACE_BTS_STATUS: + ret = ptrace_bts_status + (child, data, (struct ptrace_bts_config __user *)addr); + break; + + case PTRACE_BTS_SIZE: + ret = ptrace_bts_get_size(child); + break; + + case PTRACE_BTS_GET: + ret = ptrace_bts_read_record + (child, data, (struct bts_struct __user *) addr); + break; + + case PTRACE_BTS_CLEAR: + ret = ptrace_bts_clear(child); + break; + + case PTRACE_BTS_DRAIN: + ret = ptrace_bts_drain + (child, data, (struct bts_struct __user *) addr); + break; + + default: + ret = ptrace_request(child, request, addr, data); + break; + } + + return ret; +} + +#ifdef CONFIG_IA32_EMULATION + +#include <linux/compat.h> +#include <linux/syscalls.h> +#include <asm/ia32.h> +#include <asm/user32.h> + +#define R32(l,q) \ + case offsetof(struct user32, regs.l): \ + regs->q = value; break + +#define SEG32(rs) \ + case offsetof(struct user32, regs.rs): \ + return set_segment_reg(child, \ + offsetof(struct user_regs_struct, rs), \ + value); \ + break + +static int putreg32(struct task_struct *child, unsigned regno, u32 value) +{ + struct pt_regs *regs = task_pt_regs(child); + + switch (regno) { + + SEG32(cs); + SEG32(ds); + SEG32(es); + SEG32(fs); + SEG32(gs); + SEG32(ss); + + R32(ebx, bx); + R32(ecx, cx); + R32(edx, dx); + R32(edi, di); + R32(esi, si); + R32(ebp, bp); + R32(eax, ax); + R32(orig_eax, orig_ax); + R32(eip, ip); + R32(esp, sp); + + case offsetof(struct user32, regs.eflags): + return set_flags(child, value); + + case offsetof(struct user32, u_debugreg[0]) ... + offsetof(struct user32, u_debugreg[7]): + regno -= offsetof(struct user32, u_debugreg[0]); + return ptrace_set_debugreg(child, regno / 4, value); + + default: + if (regno > sizeof(struct user32) || (regno & 3)) + return -EIO; + + /* + * Other dummy fields in the virtual user structure + * are ignored + */ + break; + } + return 0; +} + +#undef R32 +#undef SEG32 + +#define R32(l,q) \ + case offsetof(struct user32, regs.l): \ + *val = regs->q; break + +#define SEG32(rs) \ + case offsetof(struct user32, regs.rs): \ + *val = get_segment_reg(child, \ + offsetof(struct user_regs_struct, rs)); \ + break + +static int getreg32(struct task_struct *child, unsigned regno, u32 *val) +{ + struct pt_regs *regs = task_pt_regs(child); + + switch (regno) { + + SEG32(ds); + SEG32(es); + SEG32(fs); + SEG32(gs); + + R32(cs, cs); + R32(ss, ss); + R32(ebx, bx); + R32(ecx, cx); + R32(edx, dx); + R32(edi, di); + R32(esi, si); + R32(ebp, bp); + R32(eax, ax); + R32(orig_eax, orig_ax); + R32(eip, ip); + R32(esp, sp); + + case offsetof(struct user32, regs.eflags): + *val = get_flags(child); + break; + + case offsetof(struct user32, u_debugreg[0]) ... 
+ offsetof(struct user32, u_debugreg[7]): + regno -= offsetof(struct user32, u_debugreg[0]); + *val = ptrace_get_debugreg(child, regno / 4); + break; + + default: + if (regno > sizeof(struct user32) || (regno & 3)) + return -EIO; + + /* + * Other dummy fields in the virtual user structure + * are ignored + */ + *val = 0; + break; + } + return 0; +} + +#undef R32 +#undef SEG32 + +static int genregs32_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + if (kbuf) { + compat_ulong_t *k = kbuf; + while (count > 0) { + getreg32(target, pos, k++); + count -= sizeof(*k); + pos += sizeof(*k); + } + } else { + compat_ulong_t __user *u = ubuf; + while (count > 0) { + compat_ulong_t word; + getreg32(target, pos, &word); + if (__put_user(word, u++)) + return -EFAULT; + count -= sizeof(*u); + pos += sizeof(*u); + } + } + + return 0; +} + +static int genregs32_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret = 0; + if (kbuf) { + const compat_ulong_t *k = kbuf; + while (count > 0 && !ret) { + ret = putreg(target, pos, *k++); + count -= sizeof(*k); + pos += sizeof(*k); + } + } else { + const compat_ulong_t __user *u = ubuf; + while (count > 0 && !ret) { + compat_ulong_t word; + ret = __get_user(word, u++); + if (ret) + break; + ret = putreg(target, pos, word); + count -= sizeof(*u); + pos += sizeof(*u); + } + } + return ret; +} + +static long ptrace32_siginfo(unsigned request, u32 pid, u32 addr, u32 data) +{ + siginfo_t __user *si = compat_alloc_user_space(sizeof(siginfo_t)); + compat_siginfo_t __user *si32 = compat_ptr(data); + siginfo_t ssi; + int ret; + + if (request == PTRACE_SETSIGINFO) { + memset(&ssi, 0, sizeof(siginfo_t)); + ret = copy_siginfo_from_user32(&ssi, si32); + if (ret) + return ret; + if (copy_to_user(si, &ssi, sizeof(siginfo_t))) + return -EFAULT; + } + ret = sys_ptrace(request, pid, addr, (unsigned long)si); + if (ret) + return ret; + if (request == PTRACE_GETSIGINFO) { + if (copy_from_user(&ssi, si, sizeof(siginfo_t))) + return -EFAULT; + ret = copy_siginfo_to_user32(si32, &ssi); + } + return ret; +} + +asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data) +{ + struct task_struct *child; + struct pt_regs *childregs; + void __user *datap = compat_ptr(data); + int ret; + __u32 val; + + switch (request) { + case PTRACE_TRACEME: + case PTRACE_ATTACH: + case PTRACE_KILL: + case PTRACE_CONT: + case PTRACE_SINGLESTEP: + case PTRACE_SINGLEBLOCK: + case PTRACE_DETACH: + case PTRACE_SYSCALL: + case PTRACE_OLDSETOPTIONS: + case PTRACE_SETOPTIONS: + case PTRACE_SET_THREAD_AREA: + case PTRACE_GET_THREAD_AREA: + case PTRACE_BTS_CONFIG: + case PTRACE_BTS_STATUS: + case PTRACE_BTS_SIZE: + case PTRACE_BTS_GET: + case PTRACE_BTS_CLEAR: + case PTRACE_BTS_DRAIN: + return sys_ptrace(request, pid, addr, data); + + default: + return -EINVAL; + + case PTRACE_PEEKTEXT: + case PTRACE_PEEKDATA: + case PTRACE_POKEDATA: + case PTRACE_POKETEXT: + case PTRACE_POKEUSR: + case PTRACE_PEEKUSR: + case PTRACE_GETREGS: + case PTRACE_SETREGS: + case PTRACE_SETFPREGS: + case PTRACE_GETFPREGS: + case PTRACE_SETFPXREGS: + case PTRACE_GETFPXREGS: + case PTRACE_GETEVENTMSG: + break; + + case PTRACE_SETSIGINFO: + case PTRACE_GETSIGINFO: + return ptrace32_siginfo(request, pid, addr, data); + } + + child = ptrace_get_task_struct(pid); + if (IS_ERR(child)) + return PTR_ERR(child); + + ret = 
ptrace_check_attach(child, request == PTRACE_KILL); + if (ret < 0) + goto out; + + childregs = task_pt_regs(child); + + switch (request) { + case PTRACE_PEEKUSR: + ret = getreg32(child, addr, &val); + if (ret == 0) + ret = put_user(val, (__u32 __user *)datap); + break; + + case PTRACE_POKEUSR: + ret = putreg32(child, addr, data); + break; + + case PTRACE_GETREGS: /* Get all gp regs from the child. */ + return copy_regset_to_user(child, &user_x86_32_view, + REGSET_GENERAL, + 0, sizeof(struct user_regs_struct32), + datap); + + case PTRACE_SETREGS: /* Set all gp regs in the child. */ + return copy_regset_from_user(child, &user_x86_32_view, + REGSET_GENERAL, 0, + sizeof(struct user_regs_struct32), + datap); + + case PTRACE_GETFPREGS: /* Get the child FPU state. */ + return copy_regset_to_user(child, &user_x86_32_view, + REGSET_FP, 0, + sizeof(struct user_i387_ia32_struct), + datap); + + case PTRACE_SETFPREGS: /* Set the child FPU state. */ + return copy_regset_from_user( + child, &user_x86_32_view, REGSET_FP, + 0, sizeof(struct user_i387_ia32_struct), datap); + + case PTRACE_GETFPXREGS: /* Get the child extended FPU state. */ + return copy_regset_to_user(child, &user_x86_32_view, + REGSET_XFP, 0, + sizeof(struct user32_fxsr_struct), + datap); + + case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */ + return copy_regset_from_user(child, &user_x86_32_view, + REGSET_XFP, 0, + sizeof(struct user32_fxsr_struct), + datap); + + default: + return compat_ptrace_request(child, request, addr, data); + } + + out: + put_task_struct(child); + return ret; +} + +#endif /* CONFIG_IA32_EMULATION */ + +#ifdef CONFIG_X86_64 + +static const struct user_regset x86_64_regsets[] = { + [REGSET_GENERAL] = { + .core_note_type = NT_PRSTATUS, + .n = sizeof(struct user_regs_struct) / sizeof(long), + .size = sizeof(long), .align = sizeof(long), + .get = genregs_get, .set = genregs_set + }, + [REGSET_FP] = { + .core_note_type = NT_PRFPREG, + .n = sizeof(struct user_i387_struct) / sizeof(long), + .size = sizeof(long), .align = sizeof(long), + .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set + }, +}; + +static const struct user_regset_view user_x86_64_view = { + .name = "x86_64", .e_machine = EM_X86_64, + .regsets = x86_64_regsets, .n = ARRAY_SIZE(x86_64_regsets) +}; + +#else /* CONFIG_X86_32 */ + +#define user_regs_struct32 user_regs_struct +#define genregs32_get genregs_get +#define genregs32_set genregs_set + +#endif /* CONFIG_X86_64 */ + +#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION +static const struct user_regset x86_32_regsets[] = { + [REGSET_GENERAL] = { + .core_note_type = NT_PRSTATUS, + .n = sizeof(struct user_regs_struct32) / sizeof(u32), + .size = sizeof(u32), .align = sizeof(u32), + .get = genregs32_get, .set = genregs32_set + }, + [REGSET_FP] = { + .core_note_type = NT_PRFPREG, + .n = sizeof(struct user_i387_struct) / sizeof(u32), + .size = sizeof(u32), .align = sizeof(u32), + .active = fpregs_active, .get = fpregs_get, .set = fpregs_set + }, + [REGSET_XFP] = { + .core_note_type = NT_PRXFPREG, + .n = sizeof(struct user_i387_struct) / sizeof(u32), + .size = sizeof(u32), .align = sizeof(u32), + .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set + }, + [REGSET_TLS] = { + .core_note_type = NT_386_TLS, + .n = GDT_ENTRY_TLS_ENTRIES, .bias = GDT_ENTRY_TLS_MIN, + .size = sizeof(struct user_desc), + .align = sizeof(struct user_desc), + .active = regset_tls_active, + .get = regset_tls_get, .set = regset_tls_set + }, +}; + +static const struct user_regset_view 
user_x86_32_view = { + .name = "i386", .e_machine = EM_386, + .regsets = x86_32_regsets, .n = ARRAY_SIZE(x86_32_regsets) +}; +#endif + +const struct user_regset_view *task_user_regset_view(struct task_struct *task) +{ +#ifdef CONFIG_IA32_EMULATION + if (test_tsk_thread_flag(task, TIF_IA32)) +#endif +#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION + return &user_x86_32_view; +#endif +#ifdef CONFIG_X86_64 + return &user_x86_64_view; +#endif +} + +#ifdef CONFIG_X86_32 + +void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code) +{ + struct siginfo info; + + tsk->thread.trap_no = 1; + tsk->thread.error_code = error_code; + + memset(&info, 0, sizeof(info)); + info.si_signo = SIGTRAP; + info.si_code = TRAP_BRKPT; + + /* User-mode ip? */ + info.si_addr = user_mode_vm(regs) ? (void __user *) regs->ip : NULL; + + /* Send us the fake SIGTRAP */ + force_sig_info(SIGTRAP, &info, tsk); +} + +/* notification of system call entry/exit + * - triggered by current->work.syscall_trace + */ +__attribute__((regparm(3))) +int do_syscall_trace(struct pt_regs *regs, int entryexit) +{ + int is_sysemu = test_thread_flag(TIF_SYSCALL_EMU); + /* + * With TIF_SYSCALL_EMU set we want to ignore TIF_SINGLESTEP for syscall + * interception + */ + int is_singlestep = !is_sysemu && test_thread_flag(TIF_SINGLESTEP); + int ret = 0; + + /* do the secure computing check first */ + if (!entryexit) + secure_computing(regs->orig_ax); + + if (unlikely(current->audit_context)) { + if (entryexit) + audit_syscall_exit(AUDITSC_RESULT(regs->ax), + regs->ax); + /* Debug traps, when using PTRACE_SINGLESTEP, must be sent only + * on the syscall exit path. Normally, when TIF_SYSCALL_AUDIT is + * not used, entry.S will call us only on syscall exit, not + * entry; so when TIF_SYSCALL_AUDIT is used we must avoid + * calling send_sigtrap() on syscall entry. + * + * Note that when PTRACE_SYSEMU_SINGLESTEP is used, + * is_singlestep is false, despite his name, so we will still do + * the correct thing. + */ + else if (is_singlestep) + goto out; + } + + if (!(current->ptrace & PT_PTRACED)) + goto out; + + /* If a process stops on the 1st tracepoint with SYSCALL_TRACE + * and then is resumed with SYSEMU_SINGLESTEP, it will come in + * here. We have to check this and return */ + if (is_sysemu && entryexit) + return 0; + + /* Fake a debug trap */ + if (is_singlestep) + send_sigtrap(current, regs, 0); + + if (!test_thread_flag(TIF_SYSCALL_TRACE) && !is_sysemu) + goto out; + + /* the 0x80 provides a way for the tracing parent to distinguish + between a syscall stop and SIGTRAP delivery */ + /* Note that the debugger could change the result of test_thread_flag!*/ + ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) ? 0x80:0)); + + /* + * this isn't the same as continuing with a signal, but it will do + * for normal use. strace only continues with a signal if the + * stopping signal is not SIGTRAP. 
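The ptrace_notify(SIGTRAP | 0x80) above is what lets a tracer that enabled PT_TRACESYSGOOD tell syscall stops apart from genuine SIGTRAP deliveries. A small tracer-side sketch of that distinction, using the standard ptrace(2) interface rather than kernel code:

#include <stdio.h>
#include <signal.h>
#include <sys/ptrace.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
    pid_t pid = fork();

    if (pid == 0) {                     /* child: get traced, then make a syscall */
        ptrace(PTRACE_TRACEME, 0, NULL, NULL);
        raise(SIGSTOP);
        write(1, "hello\n", 6);
        _exit(0);
    }

    int status;
    waitpid(pid, &status, 0);
    ptrace(PTRACE_SETOPTIONS, pid, NULL, (void *)(long)PTRACE_O_TRACESYSGOOD);

    for (;;) {
        ptrace(PTRACE_SYSCALL, pid, NULL, NULL);    /* run to the next syscall stop */
        waitpid(pid, &status, 0);
        if (WIFEXITED(status))
            break;
        if (WIFSTOPPED(status) && WSTOPSIG(status) == (SIGTRAP | 0x80))
            printf("syscall entry or exit stop\n");
    }
    return 0;
}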
-brl + */ + if (current->exit_code) { + send_sig(current->exit_code, current, 1); + current->exit_code = 0; + } + ret = is_sysemu; +out: + if (unlikely(current->audit_context) && !entryexit) + audit_syscall_entry(AUDIT_ARCH_I386, regs->orig_ax, + regs->bx, regs->cx, regs->dx, regs->si); + if (ret == 0) + return 0; + + regs->orig_ax = -1; /* force skip of syscall restarting */ + if (unlikely(current->audit_context)) + audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); + return 1; +} + +#else /* CONFIG_X86_64 */ + +static void syscall_trace(struct pt_regs *regs) +{ + +#if 0 + printk("trace %s ip %lx sp %lx ax %d origrax %d caller %lx tiflags %x ptrace %x\n", + current->comm, + regs->ip, regs->sp, regs->ax, regs->orig_ax, __builtin_return_address(0), + current_thread_info()->flags, current->ptrace); +#endif + + ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) + ? 0x80 : 0)); + /* + * this isn't the same as continuing with a signal, but it will do + * for normal use. strace only continues with a signal if the + * stopping signal is not SIGTRAP. -brl + */ + if (current->exit_code) { + send_sig(current->exit_code, current, 1); + current->exit_code = 0; + } +} + +asmlinkage void syscall_trace_enter(struct pt_regs *regs) +{ + /* do the secure computing check first */ + secure_computing(regs->orig_ax); + + if (test_thread_flag(TIF_SYSCALL_TRACE) + && (current->ptrace & PT_PTRACED)) + syscall_trace(regs); + + if (unlikely(current->audit_context)) { + if (test_thread_flag(TIF_IA32)) { + audit_syscall_entry(AUDIT_ARCH_I386, + regs->orig_ax, + regs->bx, regs->cx, + regs->dx, regs->si); + } else { + audit_syscall_entry(AUDIT_ARCH_X86_64, + regs->orig_ax, + regs->di, regs->si, + regs->dx, regs->r10); + } + } +} + +asmlinkage void syscall_trace_leave(struct pt_regs *regs) +{ + if (unlikely(current->audit_context)) + audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); + + if ((test_thread_flag(TIF_SYSCALL_TRACE) + || test_thread_flag(TIF_SINGLESTEP)) + && (current->ptrace & PT_PTRACED)) + syscall_trace(regs); +} + +#endif /* CONFIG_X86_32 */ diff --git a/arch/x86/kernel/ptrace_32.c b/arch/x86/kernel/ptrace_32.c deleted file mode 100644 index ff5431cc03ee..000000000000 --- a/arch/x86/kernel/ptrace_32.c +++ /dev/null @@ -1,717 +0,0 @@ -/* By Ross Biro 1/23/92 */ -/* - * Pentium III FXSR, SSE support - * Gareth Hughes <gareth@valinux.com>, May 2000 - */ - -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/mm.h> -#include <linux/smp.h> -#include <linux/errno.h> -#include <linux/ptrace.h> -#include <linux/user.h> -#include <linux/security.h> -#include <linux/audit.h> -#include <linux/seccomp.h> -#include <linux/signal.h> - -#include <asm/uaccess.h> -#include <asm/pgtable.h> -#include <asm/system.h> -#include <asm/processor.h> -#include <asm/i387.h> -#include <asm/debugreg.h> -#include <asm/ldt.h> -#include <asm/desc.h> - -/* - * does not yet catch signals sent when the child dies. - * in exit.c or in signal.c. - */ - -/* - * Determines which flags the user has access to [1 = access, 0 = no access]. - * Prohibits changing ID(21), VIP(20), VIF(19), VM(17), NT(14), IOPL(12-13), IF(9). - * Also masks reserved bits (31-22, 15, 5, 3, 1). - */ -#define FLAG_MASK 0x00050dd5 - -/* set's the trap flag. */ -#define TRAP_FLAG 0x100 - -/* - * Offset of eflags on child stack.. 
- */ -#define EFL_OFFSET offsetof(struct pt_regs, eflags) - -static inline struct pt_regs *get_child_regs(struct task_struct *task) -{ - void *stack_top = (void *)task->thread.esp0; - return stack_top - sizeof(struct pt_regs); -} - -/* - * This routine will get a word off of the processes privileged stack. - * the offset is bytes into the pt_regs structure on the stack. - * This routine assumes that all the privileged stacks are in our - * data space. - */ -static inline int get_stack_long(struct task_struct *task, int offset) -{ - unsigned char *stack; - - stack = (unsigned char *)task->thread.esp0 - sizeof(struct pt_regs); - stack += offset; - return (*((int *)stack)); -} - -/* - * This routine will put a word on the processes privileged stack. - * the offset is bytes into the pt_regs structure on the stack. - * This routine assumes that all the privileged stacks are in our - * data space. - */ -static inline int put_stack_long(struct task_struct *task, int offset, - unsigned long data) -{ - unsigned char * stack; - - stack = (unsigned char *)task->thread.esp0 - sizeof(struct pt_regs); - stack += offset; - *(unsigned long *) stack = data; - return 0; -} - -static int putreg(struct task_struct *child, - unsigned long regno, unsigned long value) -{ - switch (regno >> 2) { - case GS: - if (value && (value & 3) != 3) - return -EIO; - child->thread.gs = value; - return 0; - case DS: - case ES: - case FS: - if (value && (value & 3) != 3) - return -EIO; - value &= 0xffff; - break; - case SS: - case CS: - if ((value & 3) != 3) - return -EIO; - value &= 0xffff; - break; - case EFL: - value &= FLAG_MASK; - value |= get_stack_long(child, EFL_OFFSET) & ~FLAG_MASK; - break; - } - if (regno > FS*4) - regno -= 1*4; - put_stack_long(child, regno, value); - return 0; -} - -static unsigned long getreg(struct task_struct *child, - unsigned long regno) -{ - unsigned long retval = ~0UL; - - switch (regno >> 2) { - case GS: - retval = child->thread.gs; - break; - case DS: - case ES: - case FS: - case SS: - case CS: - retval = 0xffff; - /* fall through */ - default: - if (regno > FS*4) - regno -= 1*4; - retval &= get_stack_long(child, regno); - } - return retval; -} - -#define LDT_SEGMENT 4 - -static unsigned long convert_eip_to_linear(struct task_struct *child, struct pt_regs *regs) -{ - unsigned long addr, seg; - - addr = regs->eip; - seg = regs->xcs & 0xffff; - if (regs->eflags & VM_MASK) { - addr = (addr & 0xffff) + (seg << 4); - return addr; - } - - /* - * We'll assume that the code segments in the GDT - * are all zero-based. That is largely true: the - * TLS segments are used for data, and the PNPBIOS - * and APM bios ones we just ignore here. - */ - if (seg & LDT_SEGMENT) { - u32 *desc; - unsigned long base; - - seg &= ~7UL; - - mutex_lock(&child->mm->context.lock); - if (unlikely((seg >> 3) >= child->mm->context.size)) - addr = -1L; /* bogus selector, access would fault */ - else { - desc = child->mm->context.ldt + seg; - base = ((desc[0] >> 16) | - ((desc[1] & 0xff) << 16) | - (desc[1] & 0xff000000)); - - /* 16-bit code segment? 
*/ - if (!((desc[1] >> 22) & 1)) - addr &= 0xffff; - addr += base; - } - mutex_unlock(&child->mm->context.lock); - } - return addr; -} - -static inline int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs) -{ - int i, copied; - unsigned char opcode[15]; - unsigned long addr = convert_eip_to_linear(child, regs); - - copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0); - for (i = 0; i < copied; i++) { - switch (opcode[i]) { - /* popf and iret */ - case 0x9d: case 0xcf: - return 1; - /* opcode and address size prefixes */ - case 0x66: case 0x67: - continue; - /* irrelevant prefixes (segment overrides and repeats) */ - case 0x26: case 0x2e: - case 0x36: case 0x3e: - case 0x64: case 0x65: - case 0xf0: case 0xf2: case 0xf3: - continue; - - /* - * pushf: NOTE! We should probably not let - * the user see the TF bit being set. But - * it's more pain than it's worth to avoid - * it, and a debugger could emulate this - * all in user space if it _really_ cares. - */ - case 0x9c: - default: - return 0; - } - } - return 0; -} - -static void set_singlestep(struct task_struct *child) -{ - struct pt_regs *regs = get_child_regs(child); - - /* - * Always set TIF_SINGLESTEP - this guarantees that - * we single-step system calls etc.. This will also - * cause us to set TF when returning to user mode. - */ - set_tsk_thread_flag(child, TIF_SINGLESTEP); - - /* - * If TF was already set, don't do anything else - */ - if (regs->eflags & TRAP_FLAG) - return; - - /* Set TF on the kernel stack.. */ - regs->eflags |= TRAP_FLAG; - - /* - * ..but if TF is changed by the instruction we will trace, - * don't mark it as being "us" that set it, so that we - * won't clear it by hand later. - */ - if (is_setting_trap_flag(child, regs)) - return; - - child->ptrace |= PT_DTRACE; -} - -static void clear_singlestep(struct task_struct *child) -{ - /* Always clear TIF_SINGLESTEP... */ - clear_tsk_thread_flag(child, TIF_SINGLESTEP); - - /* But touch TF only if it was set by us.. */ - if (child->ptrace & PT_DTRACE) { - struct pt_regs *regs = get_child_regs(child); - regs->eflags &= ~TRAP_FLAG; - child->ptrace &= ~PT_DTRACE; - } -} - -/* - * Called by kernel/ptrace.c when detaching.. - * - * Make sure the single step bit is not set. - */ -void ptrace_disable(struct task_struct *child) -{ - clear_singlestep(child); - clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); -} - -/* - * Perform get_thread_area on behalf of the traced child. 
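set_singlestep() above works by turning on TRAP_FLAG (EFLAGS bit 8, value 0x100) in the child's saved register frame; once that frame is restored on return to user mode, the CPU raises a debug exception after the next instruction. A stand-alone user-space demonstration of the same flag (not from the patch; assumes a 32-bit x86 build, e.g. gcc -m32, hence plain pushf and %esp):

#include <signal.h>
#include <unistd.h>

static void trap(int sig)
{
        (void)sig;
        write(1, "SIGTRAP\n", 8);
        _exit(0);               /* TF is still set in the interrupted frame, so just leave */
}

int main(void)
{
        signal(SIGTRAP, trap);
        asm volatile("pushf\n\t"
                     "orl $0x100, (%%esp)\n\t"  /* set TF in the saved flags */
                     "popf\n\t"                 /* TF takes effect ... */
                     "nop"                      /* ... and the trap hits after this */
                     ::: "memory", "cc");
        return 0;
}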
- */ -static int -ptrace_get_thread_area(struct task_struct *child, - int idx, struct user_desc __user *user_desc) -{ - struct user_desc info; - struct desc_struct *desc; - -/* - * Get the current Thread-Local Storage area: - */ - -#define GET_BASE(desc) ( \ - (((desc)->a >> 16) & 0x0000ffff) | \ - (((desc)->b << 16) & 0x00ff0000) | \ - ( (desc)->b & 0xff000000) ) - -#define GET_LIMIT(desc) ( \ - ((desc)->a & 0x0ffff) | \ - ((desc)->b & 0xf0000) ) - -#define GET_32BIT(desc) (((desc)->b >> 22) & 1) -#define GET_CONTENTS(desc) (((desc)->b >> 10) & 3) -#define GET_WRITABLE(desc) (((desc)->b >> 9) & 1) -#define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1) -#define GET_PRESENT(desc) (((desc)->b >> 15) & 1) -#define GET_USEABLE(desc) (((desc)->b >> 20) & 1) - - if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) - return -EINVAL; - - desc = child->thread.tls_array + idx - GDT_ENTRY_TLS_MIN; - - info.entry_number = idx; - info.base_addr = GET_BASE(desc); - info.limit = GET_LIMIT(desc); - info.seg_32bit = GET_32BIT(desc); - info.contents = GET_CONTENTS(desc); - info.read_exec_only = !GET_WRITABLE(desc); - info.limit_in_pages = GET_LIMIT_PAGES(desc); - info.seg_not_present = !GET_PRESENT(desc); - info.useable = GET_USEABLE(desc); - - if (copy_to_user(user_desc, &info, sizeof(info))) - return -EFAULT; - - return 0; -} - -/* - * Perform set_thread_area on behalf of the traced child. - */ -static int -ptrace_set_thread_area(struct task_struct *child, - int idx, struct user_desc __user *user_desc) -{ - struct user_desc info; - struct desc_struct *desc; - - if (copy_from_user(&info, user_desc, sizeof(info))) - return -EFAULT; - - if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) - return -EINVAL; - - desc = child->thread.tls_array + idx - GDT_ENTRY_TLS_MIN; - if (LDT_empty(&info)) { - desc->a = 0; - desc->b = 0; - } else { - desc->a = LDT_entry_a(&info); - desc->b = LDT_entry_b(&info); - } - - return 0; -} - -long arch_ptrace(struct task_struct *child, long request, long addr, long data) -{ - struct user * dummy = NULL; - int i, ret; - unsigned long __user *datap = (unsigned long __user *)data; - - switch (request) { - /* when I and D space are separate, these will need to be fixed. */ - case PTRACE_PEEKTEXT: /* read word at location addr. */ - case PTRACE_PEEKDATA: - ret = generic_ptrace_peekdata(child, addr, data); - break; - - /* read the word at location addr in the USER area. */ - case PTRACE_PEEKUSR: { - unsigned long tmp; - - ret = -EIO; - if ((addr & 3) || addr < 0 || - addr > sizeof(struct user) - 3) - break; - - tmp = 0; /* Default return condition */ - if(addr < FRAME_SIZE*sizeof(long)) - tmp = getreg(child, addr); - if(addr >= (long) &dummy->u_debugreg[0] && - addr <= (long) &dummy->u_debugreg[7]){ - addr -= (long) &dummy->u_debugreg[0]; - addr = addr >> 2; - tmp = child->thread.debugreg[addr]; - } - ret = put_user(tmp, datap); - break; - } - - /* when I and D space are separate, this will have to be fixed. */ - case PTRACE_POKETEXT: /* write the word at location addr. */ - case PTRACE_POKEDATA: - ret = generic_ptrace_pokedata(child, addr, data); - break; - - case PTRACE_POKEUSR: /* write the word at location addr in the USER area */ - ret = -EIO; - if ((addr & 3) || addr < 0 || - addr > sizeof(struct user) - 3) - break; - - if (addr < FRAME_SIZE*sizeof(long)) { - ret = putreg(child, addr, data); - break; - } - /* We need to be very careful here. 
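The GET_BASE()/GET_LIMIT() helpers above undo the way a GDT descriptor scatters its base and limit across the two 32-bit words. A stand-alone illustration with made-up descriptor words (not from the patch):

#include <stdio.h>
#include <stdint.h>

static uint32_t get_base(uint32_t a, uint32_t b)
{
        return ((a >> 16) & 0x0000ffff) |       /* base 15..0  lives in a[31:16] */
               ((b << 16) & 0x00ff0000) |       /* base 23..16 lives in b[7:0]   */
               ( b        & 0xff000000);        /* base 31..24 lives in b[31:24] */
}

static uint32_t get_limit(uint32_t a, uint32_t b)
{
        return (a & 0x0ffff) |                  /* limit 15..0  lives in a[15:0]  */
               (b & 0xf0000);                   /* limit 19..16 lives in b[19:16] */
}

int main(void)
{
        /* hypothetical descriptor words encoding base 0x12345678, limit 0xfffff */
        uint32_t a = 0x5678ffff;
        uint32_t b = 0x12cf9a34;

        printf("base  = %#x\n", get_base(a, b));        /* prints 0x12345678 */
        printf("limit = %#x\n", get_limit(a, b));       /* prints 0xfffff */
        return 0;
}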
We implicitly - want to modify a portion of the task_struct, and we - have to be selective about what portions we allow someone - to modify. */ - - ret = -EIO; - if(addr >= (long) &dummy->u_debugreg[0] && - addr <= (long) &dummy->u_debugreg[7]){ - - if(addr == (long) &dummy->u_debugreg[4]) break; - if(addr == (long) &dummy->u_debugreg[5]) break; - if(addr < (long) &dummy->u_debugreg[4] && - ((unsigned long) data) >= TASK_SIZE-3) break; - - /* Sanity-check data. Take one half-byte at once with - * check = (val >> (16 + 4*i)) & 0xf. It contains the - * R/Wi and LENi bits; bits 0 and 1 are R/Wi, and bits - * 2 and 3 are LENi. Given a list of invalid values, - * we do mask |= 1 << invalid_value, so that - * (mask >> check) & 1 is a correct test for invalid - * values. - * - * R/Wi contains the type of the breakpoint / - * watchpoint, LENi contains the length of the watched - * data in the watchpoint case. - * - * The invalid values are: - * - LENi == 0x10 (undefined), so mask |= 0x0f00. - * - R/Wi == 0x10 (break on I/O reads or writes), so - * mask |= 0x4444. - * - R/Wi == 0x00 && LENi != 0x00, so we have mask |= - * 0x1110. - * - * Finally, mask = 0x0f00 | 0x4444 | 0x1110 == 0x5f54. - * - * See the Intel Manual "System Programming Guide", - * 15.2.4 - * - * Note that LENi == 0x10 is defined on x86_64 in long - * mode (i.e. even for 32-bit userspace software, but - * 64-bit kernel), so the x86_64 mask value is 0x5454. - * See the AMD manual no. 24593 (AMD64 System - * Programming)*/ - - if(addr == (long) &dummy->u_debugreg[7]) { - data &= ~DR_CONTROL_RESERVED; - for(i=0; i<4; i++) - if ((0x5f54 >> ((data >> (16 + 4*i)) & 0xf)) & 1) - goto out_tsk; - if (data) - set_tsk_thread_flag(child, TIF_DEBUG); - else - clear_tsk_thread_flag(child, TIF_DEBUG); - } - addr -= (long) &dummy->u_debugreg; - addr = addr >> 2; - child->thread.debugreg[addr] = data; - ret = 0; - } - break; - - case PTRACE_SYSEMU: /* continue and stop at next syscall, which will not be executed */ - case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */ - case PTRACE_CONT: /* restart after signal. */ - ret = -EIO; - if (!valid_signal(data)) - break; - if (request == PTRACE_SYSEMU) { - set_tsk_thread_flag(child, TIF_SYSCALL_EMU); - clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); - } else if (request == PTRACE_SYSCALL) { - set_tsk_thread_flag(child, TIF_SYSCALL_TRACE); - clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); - } else { - clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); - clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); - } - child->exit_code = data; - /* make sure the single step bit is not set. */ - clear_singlestep(child); - wake_up_process(child); - ret = 0; - break; - -/* - * make the child exit. Best I can do is send it a sigkill. - * perhaps it should be put in the status that it wants to - * exit. - */ - case PTRACE_KILL: - ret = 0; - if (child->exit_state == EXIT_ZOMBIE) /* already dead */ - break; - child->exit_code = SIGKILL; - /* make sure the single step bit is not set. */ - clear_singlestep(child); - wake_up_process(child); - break; - - case PTRACE_SYSEMU_SINGLESTEP: /* Same as SYSEMU, but singlestep if not syscall */ - case PTRACE_SINGLESTEP: /* set the trap flag. 
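The 0x5f54 mask derived in the comment above boils the DR7 sanity check down to one shift per breakpoint: each breakpoint contributes a nibble of R/Wi | LENi << 2 at bit 16 + 4*i, and a set bit in the mask marks an invalid combination. A stand-alone sketch of the same test with two worked values (not from the patch):

#include <stdio.h>

static int dr7_is_valid(unsigned long data)
{
        int i;

        for (i = 0; i < 4; i++)
                if ((0x5f54 >> ((data >> (16 + 4 * i)) & 0xf)) & 1)
                        return 0;       /* invalid R/W + LEN combination */
        return 1;
}

int main(void)
{
        /* bp0: R/W = 01 (write), LEN = 00 (1 byte) -> nibble 0x1, accepted */
        printf("%d\n", dr7_is_valid(0x00010000UL));     /* prints 1 */
        /* bp0: R/W = 10 (break on I/O) -> nibble 0x2, rejected */
        printf("%d\n", dr7_is_valid(0x00020000UL));     /* prints 0 */
        return 0;
}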
*/ - ret = -EIO; - if (!valid_signal(data)) - break; - - if (request == PTRACE_SYSEMU_SINGLESTEP) - set_tsk_thread_flag(child, TIF_SYSCALL_EMU); - else - clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); - - clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); - set_singlestep(child); - child->exit_code = data; - /* give it a chance to run. */ - wake_up_process(child); - ret = 0; - break; - - case PTRACE_GETREGS: { /* Get all gp regs from the child. */ - if (!access_ok(VERIFY_WRITE, datap, FRAME_SIZE*sizeof(long))) { - ret = -EIO; - break; - } - for ( i = 0; i < FRAME_SIZE*sizeof(long); i += sizeof(long) ) { - __put_user(getreg(child, i), datap); - datap++; - } - ret = 0; - break; - } - - case PTRACE_SETREGS: { /* Set all gp regs in the child. */ - unsigned long tmp; - if (!access_ok(VERIFY_READ, datap, FRAME_SIZE*sizeof(long))) { - ret = -EIO; - break; - } - for ( i = 0; i < FRAME_SIZE*sizeof(long); i += sizeof(long) ) { - __get_user(tmp, datap); - putreg(child, i, tmp); - datap++; - } - ret = 0; - break; - } - - case PTRACE_GETFPREGS: { /* Get the child FPU state. */ - if (!access_ok(VERIFY_WRITE, datap, - sizeof(struct user_i387_struct))) { - ret = -EIO; - break; - } - ret = 0; - if (!tsk_used_math(child)) - init_fpu(child); - get_fpregs((struct user_i387_struct __user *)data, child); - break; - } - - case PTRACE_SETFPREGS: { /* Set the child FPU state. */ - if (!access_ok(VERIFY_READ, datap, - sizeof(struct user_i387_struct))) { - ret = -EIO; - break; - } - set_stopped_child_used_math(child); - set_fpregs(child, (struct user_i387_struct __user *)data); - ret = 0; - break; - } - - case PTRACE_GETFPXREGS: { /* Get the child extended FPU state. */ - if (!access_ok(VERIFY_WRITE, datap, - sizeof(struct user_fxsr_struct))) { - ret = -EIO; - break; - } - if (!tsk_used_math(child)) - init_fpu(child); - ret = get_fpxregs((struct user_fxsr_struct __user *)data, child); - break; - } - - case PTRACE_SETFPXREGS: { /* Set the child extended FPU state. */ - if (!access_ok(VERIFY_READ, datap, - sizeof(struct user_fxsr_struct))) { - ret = -EIO; - break; - } - set_stopped_child_used_math(child); - ret = set_fpxregs(child, (struct user_fxsr_struct __user *)data); - break; - } - - case PTRACE_GET_THREAD_AREA: - ret = ptrace_get_thread_area(child, addr, - (struct user_desc __user *) data); - break; - - case PTRACE_SET_THREAD_AREA: - ret = ptrace_set_thread_area(child, addr, - (struct user_desc __user *) data); - break; - - default: - ret = ptrace_request(child, request, addr, data); - break; - } - out_tsk: - return ret; -} - -void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code) -{ - struct siginfo info; - - tsk->thread.trap_no = 1; - tsk->thread.error_code = error_code; - - memset(&info, 0, sizeof(info)); - info.si_signo = SIGTRAP; - info.si_code = TRAP_BRKPT; - - /* User-mode eip? */ - info.si_addr = user_mode_vm(regs) ? 
(void __user *) regs->eip : NULL; - - /* Send us the fake SIGTRAP */ - force_sig_info(SIGTRAP, &info, tsk); -} - -/* notification of system call entry/exit - * - triggered by current->work.syscall_trace - */ -__attribute__((regparm(3))) -int do_syscall_trace(struct pt_regs *regs, int entryexit) -{ - int is_sysemu = test_thread_flag(TIF_SYSCALL_EMU); - /* - * With TIF_SYSCALL_EMU set we want to ignore TIF_SINGLESTEP for syscall - * interception - */ - int is_singlestep = !is_sysemu && test_thread_flag(TIF_SINGLESTEP); - int ret = 0; - - /* do the secure computing check first */ - if (!entryexit) - secure_computing(regs->orig_eax); - - if (unlikely(current->audit_context)) { - if (entryexit) - audit_syscall_exit(AUDITSC_RESULT(regs->eax), - regs->eax); - /* Debug traps, when using PTRACE_SINGLESTEP, must be sent only - * on the syscall exit path. Normally, when TIF_SYSCALL_AUDIT is - * not used, entry.S will call us only on syscall exit, not - * entry; so when TIF_SYSCALL_AUDIT is used we must avoid - * calling send_sigtrap() on syscall entry. - * - * Note that when PTRACE_SYSEMU_SINGLESTEP is used, - * is_singlestep is false, despite his name, so we will still do - * the correct thing. - */ - else if (is_singlestep) - goto out; - } - - if (!(current->ptrace & PT_PTRACED)) - goto out; - - /* If a process stops on the 1st tracepoint with SYSCALL_TRACE - * and then is resumed with SYSEMU_SINGLESTEP, it will come in - * here. We have to check this and return */ - if (is_sysemu && entryexit) - return 0; - - /* Fake a debug trap */ - if (is_singlestep) - send_sigtrap(current, regs, 0); - - if (!test_thread_flag(TIF_SYSCALL_TRACE) && !is_sysemu) - goto out; - - /* the 0x80 provides a way for the tracing parent to distinguish - between a syscall stop and SIGTRAP delivery */ - /* Note that the debugger could change the result of test_thread_flag!*/ - ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) ? 0x80:0)); - - /* - * this isn't the same as continuing with a signal, but it will do - * for normal use. strace only continues with a signal if the - * stopping signal is not SIGTRAP. 
-brl - */ - if (current->exit_code) { - send_sig(current->exit_code, current, 1); - current->exit_code = 0; - } - ret = is_sysemu; -out: - if (unlikely(current->audit_context) && !entryexit) - audit_syscall_entry(AUDIT_ARCH_I386, regs->orig_eax, - regs->ebx, regs->ecx, regs->edx, regs->esi); - if (ret == 0) - return 0; - - regs->orig_eax = -1; /* force skip of syscall restarting */ - if (unlikely(current->audit_context)) - audit_syscall_exit(AUDITSC_RESULT(regs->eax), regs->eax); - return 1; -} diff --git a/arch/x86/kernel/ptrace_64.c b/arch/x86/kernel/ptrace_64.c deleted file mode 100644 index 607085f3f08a..000000000000 --- a/arch/x86/kernel/ptrace_64.c +++ /dev/null @@ -1,621 +0,0 @@ -/* By Ross Biro 1/23/92 */ -/* - * Pentium III FXSR, SSE support - * Gareth Hughes <gareth@valinux.com>, May 2000 - * - * x86-64 port 2000-2002 Andi Kleen - */ - -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/mm.h> -#include <linux/smp.h> -#include <linux/errno.h> -#include <linux/ptrace.h> -#include <linux/user.h> -#include <linux/security.h> -#include <linux/audit.h> -#include <linux/seccomp.h> -#include <linux/signal.h> - -#include <asm/uaccess.h> -#include <asm/pgtable.h> -#include <asm/system.h> -#include <asm/processor.h> -#include <asm/i387.h> -#include <asm/debugreg.h> -#include <asm/ldt.h> -#include <asm/desc.h> -#include <asm/proto.h> -#include <asm/ia32.h> - -/* - * does not yet catch signals sent when the child dies. - * in exit.c or in signal.c. - */ - -/* - * Determines which flags the user has access to [1 = access, 0 = no access]. - * Prohibits changing ID(21), VIP(20), VIF(19), VM(17), IOPL(12-13), IF(9). - * Also masks reserved bits (63-22, 15, 5, 3, 1). - */ -#define FLAG_MASK 0x54dd5UL - -/* set's the trap flag. */ -#define TRAP_FLAG 0x100UL - -/* - * eflags and offset of eflags on child stack.. - */ -#define EFLAGS offsetof(struct pt_regs, eflags) -#define EFL_OFFSET ((int)(EFLAGS-sizeof(struct pt_regs))) - -/* - * this routine will get a word off of the processes privileged stack. - * the offset is how far from the base addr as stored in the TSS. - * this routine assumes that all the privileged stacks are in our - * data space. - */ -static inline unsigned long get_stack_long(struct task_struct *task, int offset) -{ - unsigned char *stack; - - stack = (unsigned char *)task->thread.rsp0; - stack += offset; - return (*((unsigned long *)stack)); -} - -/* - * this routine will put a word on the processes privileged stack. - * the offset is how far from the base addr as stored in the TSS. - * this routine assumes that all the privileged stacks are in our - * data space. - */ -static inline long put_stack_long(struct task_struct *task, int offset, - unsigned long data) -{ - unsigned char * stack; - - stack = (unsigned char *) task->thread.rsp0; - stack += offset; - *(unsigned long *) stack = data; - return 0; -} - -#define LDT_SEGMENT 4 - -unsigned long convert_rip_to_linear(struct task_struct *child, struct pt_regs *regs) -{ - unsigned long addr, seg; - - addr = regs->rip; - seg = regs->cs & 0xffff; - - /* - * We'll assume that the code segments in the GDT - * are all zero-based. That is largely true: the - * TLS segments are used for data, and the PNPBIOS - * and APM bios ones we just ignore here. 
- */ - if (seg & LDT_SEGMENT) { - u32 *desc; - unsigned long base; - - seg &= ~7UL; - - mutex_lock(&child->mm->context.lock); - if (unlikely((seg >> 3) >= child->mm->context.size)) - addr = -1L; /* bogus selector, access would fault */ - else { - desc = child->mm->context.ldt + seg; - base = ((desc[0] >> 16) | - ((desc[1] & 0xff) << 16) | - (desc[1] & 0xff000000)); - - /* 16-bit code segment? */ - if (!((desc[1] >> 22) & 1)) - addr &= 0xffff; - addr += base; - } - mutex_unlock(&child->mm->context.lock); - } - - return addr; -} - -static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs) -{ - int i, copied; - unsigned char opcode[15]; - unsigned long addr = convert_rip_to_linear(child, regs); - - copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0); - for (i = 0; i < copied; i++) { - switch (opcode[i]) { - /* popf and iret */ - case 0x9d: case 0xcf: - return 1; - - /* CHECKME: 64 65 */ - - /* opcode and address size prefixes */ - case 0x66: case 0x67: - continue; - /* irrelevant prefixes (segment overrides and repeats) */ - case 0x26: case 0x2e: - case 0x36: case 0x3e: - case 0x64: case 0x65: - case 0xf2: case 0xf3: - continue; - - case 0x40 ... 0x4f: - if (regs->cs != __USER_CS) - /* 32-bit mode: register increment */ - return 0; - /* 64-bit mode: REX prefix */ - continue; - - /* CHECKME: f2, f3 */ - - /* - * pushf: NOTE! We should probably not let - * the user see the TF bit being set. But - * it's more pain than it's worth to avoid - * it, and a debugger could emulate this - * all in user space if it _really_ cares. - */ - case 0x9c: - default: - return 0; - } - } - return 0; -} - -static void set_singlestep(struct task_struct *child) -{ - struct pt_regs *regs = task_pt_regs(child); - - /* - * Always set TIF_SINGLESTEP - this guarantees that - * we single-step system calls etc.. This will also - * cause us to set TF when returning to user mode. - */ - set_tsk_thread_flag(child, TIF_SINGLESTEP); - - /* - * If TF was already set, don't do anything else - */ - if (regs->eflags & TRAP_FLAG) - return; - - /* Set TF on the kernel stack.. */ - regs->eflags |= TRAP_FLAG; - - /* - * ..but if TF is changed by the instruction we will trace, - * don't mark it as being "us" that set it, so that we - * won't clear it by hand later. - */ - if (is_setting_trap_flag(child, regs)) - return; - - child->ptrace |= PT_DTRACE; -} - -static void clear_singlestep(struct task_struct *child) -{ - /* Always clear TIF_SINGLESTEP... */ - clear_tsk_thread_flag(child, TIF_SINGLESTEP); - - /* But touch TF only if it was set by us.. */ - if (child->ptrace & PT_DTRACE) { - struct pt_regs *regs = task_pt_regs(child); - regs->eflags &= ~TRAP_FLAG; - child->ptrace &= ~PT_DTRACE; - } -} - -/* - * Called by kernel/ptrace.c when detaching.. - * - * Make sure the single step bit is not set. 
- */ -void ptrace_disable(struct task_struct *child) -{ - clear_singlestep(child); -} - -static int putreg(struct task_struct *child, - unsigned long regno, unsigned long value) -{ - unsigned long tmp; - - switch (regno) { - case offsetof(struct user_regs_struct,fs): - if (value && (value & 3) != 3) - return -EIO; - child->thread.fsindex = value & 0xffff; - return 0; - case offsetof(struct user_regs_struct,gs): - if (value && (value & 3) != 3) - return -EIO; - child->thread.gsindex = value & 0xffff; - return 0; - case offsetof(struct user_regs_struct,ds): - if (value && (value & 3) != 3) - return -EIO; - child->thread.ds = value & 0xffff; - return 0; - case offsetof(struct user_regs_struct,es): - if (value && (value & 3) != 3) - return -EIO; - child->thread.es = value & 0xffff; - return 0; - case offsetof(struct user_regs_struct,ss): - if ((value & 3) != 3) - return -EIO; - value &= 0xffff; - return 0; - case offsetof(struct user_regs_struct,fs_base): - if (value >= TASK_SIZE_OF(child)) - return -EIO; - child->thread.fs = value; - return 0; - case offsetof(struct user_regs_struct,gs_base): - if (value >= TASK_SIZE_OF(child)) - return -EIO; - child->thread.gs = value; - return 0; - case offsetof(struct user_regs_struct, eflags): - value &= FLAG_MASK; - tmp = get_stack_long(child, EFL_OFFSET); - tmp &= ~FLAG_MASK; - value |= tmp; - break; - case offsetof(struct user_regs_struct,cs): - if ((value & 3) != 3) - return -EIO; - value &= 0xffff; - break; - } - put_stack_long(child, regno - sizeof(struct pt_regs), value); - return 0; -} - -static unsigned long getreg(struct task_struct *child, unsigned long regno) -{ - unsigned long val; - switch (regno) { - case offsetof(struct user_regs_struct, fs): - return child->thread.fsindex; - case offsetof(struct user_regs_struct, gs): - return child->thread.gsindex; - case offsetof(struct user_regs_struct, ds): - return child->thread.ds; - case offsetof(struct user_regs_struct, es): - return child->thread.es; - case offsetof(struct user_regs_struct, fs_base): - return child->thread.fs; - case offsetof(struct user_regs_struct, gs_base): - return child->thread.gs; - default: - regno = regno - sizeof(struct pt_regs); - val = get_stack_long(child, regno); - if (test_tsk_thread_flag(child, TIF_IA32)) - val &= 0xffffffff; - return val; - } - -} - -long arch_ptrace(struct task_struct *child, long request, long addr, long data) -{ - long i, ret; - unsigned ui; - - switch (request) { - /* when I and D space are separate, these will need to be fixed. */ - case PTRACE_PEEKTEXT: /* read word at location addr. */ - case PTRACE_PEEKDATA: - ret = generic_ptrace_peekdata(child, addr, data); - break; - - /* read the word at location addr in the USER area. */ - case PTRACE_PEEKUSR: { - unsigned long tmp; - - ret = -EIO; - if ((addr & 7) || - addr > sizeof(struct user) - 7) - break; - - switch (addr) { - case 0 ... 
sizeof(struct user_regs_struct) - sizeof(long): - tmp = getreg(child, addr); - break; - case offsetof(struct user, u_debugreg[0]): - tmp = child->thread.debugreg0; - break; - case offsetof(struct user, u_debugreg[1]): - tmp = child->thread.debugreg1; - break; - case offsetof(struct user, u_debugreg[2]): - tmp = child->thread.debugreg2; - break; - case offsetof(struct user, u_debugreg[3]): - tmp = child->thread.debugreg3; - break; - case offsetof(struct user, u_debugreg[6]): - tmp = child->thread.debugreg6; - break; - case offsetof(struct user, u_debugreg[7]): - tmp = child->thread.debugreg7; - break; - default: - tmp = 0; - break; - } - ret = put_user(tmp,(unsigned long __user *) data); - break; - } - - /* when I and D space are separate, this will have to be fixed. */ - case PTRACE_POKETEXT: /* write the word at location addr. */ - case PTRACE_POKEDATA: - ret = generic_ptrace_pokedata(child, addr, data); - break; - - case PTRACE_POKEUSR: /* write the word at location addr in the USER area */ - { - int dsize = test_tsk_thread_flag(child, TIF_IA32) ? 3 : 7; - ret = -EIO; - if ((addr & 7) || - addr > sizeof(struct user) - 7) - break; - - switch (addr) { - case 0 ... sizeof(struct user_regs_struct) - sizeof(long): - ret = putreg(child, addr, data); - break; - /* Disallows to set a breakpoint into the vsyscall */ - case offsetof(struct user, u_debugreg[0]): - if (data >= TASK_SIZE_OF(child) - dsize) break; - child->thread.debugreg0 = data; - ret = 0; - break; - case offsetof(struct user, u_debugreg[1]): - if (data >= TASK_SIZE_OF(child) - dsize) break; - child->thread.debugreg1 = data; - ret = 0; - break; - case offsetof(struct user, u_debugreg[2]): - if (data >= TASK_SIZE_OF(child) - dsize) break; - child->thread.debugreg2 = data; - ret = 0; - break; - case offsetof(struct user, u_debugreg[3]): - if (data >= TASK_SIZE_OF(child) - dsize) break; - child->thread.debugreg3 = data; - ret = 0; - break; - case offsetof(struct user, u_debugreg[6]): - if (data >> 32) - break; - child->thread.debugreg6 = data; - ret = 0; - break; - case offsetof(struct user, u_debugreg[7]): - /* See arch/i386/kernel/ptrace.c for an explanation of - * this awkward check.*/ - data &= ~DR_CONTROL_RESERVED; - for(i=0; i<4; i++) - if ((0x5554 >> ((data >> (16 + 4*i)) & 0xf)) & 1) - break; - if (i == 4) { - child->thread.debugreg7 = data; - if (data) - set_tsk_thread_flag(child, TIF_DEBUG); - else - clear_tsk_thread_flag(child, TIF_DEBUG); - ret = 0; - } - break; - } - break; - } - case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */ - case PTRACE_CONT: /* restart after signal. */ - - ret = -EIO; - if (!valid_signal(data)) - break; - if (request == PTRACE_SYSCALL) - set_tsk_thread_flag(child,TIF_SYSCALL_TRACE); - else - clear_tsk_thread_flag(child,TIF_SYSCALL_TRACE); - clear_tsk_thread_flag(child, TIF_SINGLESTEP); - child->exit_code = data; - /* make sure the single step bit is not set. */ - clear_singlestep(child); - wake_up_process(child); - ret = 0; - break; - -#ifdef CONFIG_IA32_EMULATION - /* This makes only sense with 32bit programs. Allow a - 64bit debugger to fully examine them too. Better - don't use it against 64bit processes, use - PTRACE_ARCH_PRCTL instead. 
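The user-area offsets decoded by the PEEKUSR/POKEUSR switches above are exactly what a debugger passes in from the other side of the ptrace() call. A hypothetical tracer fragment reading DR7 of an already-stopped tracee (not from the patch; uses the glibc spelling PTRACE_PEEKUSER and the struct user layout from <sys/user.h>):

#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/user.h>

static long peek_dr7(pid_t pid)
{
        return ptrace(PTRACE_PEEKUSER, pid,
                      (void *)offsetof(struct user, u_debugreg[7]), NULL);
}

int main(int argc, char **argv)
{
        if (argc != 2)
                return 1;
        printf("dr7 = %#lx\n", peek_dr7((pid_t)atoi(argv[1])));
        return 0;
}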
*/ - case PTRACE_SET_THREAD_AREA: { - struct user_desc __user *p; - int old; - p = (struct user_desc __user *)data; - get_user(old, &p->entry_number); - put_user(addr, &p->entry_number); - ret = do_set_thread_area(&child->thread, p); - put_user(old, &p->entry_number); - break; - case PTRACE_GET_THREAD_AREA: - p = (struct user_desc __user *)data; - get_user(old, &p->entry_number); - put_user(addr, &p->entry_number); - ret = do_get_thread_area(&child->thread, p); - put_user(old, &p->entry_number); - break; - } -#endif - /* normal 64bit interface to access TLS data. - Works just like arch_prctl, except that the arguments - are reversed. */ - case PTRACE_ARCH_PRCTL: - ret = do_arch_prctl(child, data, addr); - break; - -/* - * make the child exit. Best I can do is send it a sigkill. - * perhaps it should be put in the status that it wants to - * exit. - */ - case PTRACE_KILL: - ret = 0; - if (child->exit_state == EXIT_ZOMBIE) /* already dead */ - break; - clear_tsk_thread_flag(child, TIF_SINGLESTEP); - child->exit_code = SIGKILL; - /* make sure the single step bit is not set. */ - clear_singlestep(child); - wake_up_process(child); - break; - - case PTRACE_SINGLESTEP: /* set the trap flag. */ - ret = -EIO; - if (!valid_signal(data)) - break; - clear_tsk_thread_flag(child,TIF_SYSCALL_TRACE); - set_singlestep(child); - child->exit_code = data; - /* give it a chance to run. */ - wake_up_process(child); - ret = 0; - break; - - case PTRACE_GETREGS: { /* Get all gp regs from the child. */ - if (!access_ok(VERIFY_WRITE, (unsigned __user *)data, - sizeof(struct user_regs_struct))) { - ret = -EIO; - break; - } - ret = 0; - for (ui = 0; ui < sizeof(struct user_regs_struct); ui += sizeof(long)) { - ret |= __put_user(getreg(child, ui),(unsigned long __user *) data); - data += sizeof(long); - } - break; - } - - case PTRACE_SETREGS: { /* Set all gp regs in the child. */ - unsigned long tmp; - if (!access_ok(VERIFY_READ, (unsigned __user *)data, - sizeof(struct user_regs_struct))) { - ret = -EIO; - break; - } - ret = 0; - for (ui = 0; ui < sizeof(struct user_regs_struct); ui += sizeof(long)) { - ret = __get_user(tmp, (unsigned long __user *) data); - if (ret) - break; - ret = putreg(child, ui, tmp); - if (ret) - break; - data += sizeof(long); - } - break; - } - - case PTRACE_GETFPREGS: { /* Get the child extended FPU state. */ - if (!access_ok(VERIFY_WRITE, (unsigned __user *)data, - sizeof(struct user_i387_struct))) { - ret = -EIO; - break; - } - ret = get_fpregs((struct user_i387_struct __user *)data, child); - break; - } - - case PTRACE_SETFPREGS: { /* Set the child extended FPU state. */ - if (!access_ok(VERIFY_READ, (unsigned __user *)data, - sizeof(struct user_i387_struct))) { - ret = -EIO; - break; - } - set_stopped_child_used_math(child); - ret = set_fpregs(child, (struct user_i387_struct __user *)data); - break; - } - - default: - ret = ptrace_request(child, request, addr, data); - break; - } - return ret; -} - -static void syscall_trace(struct pt_regs *regs) -{ - -#if 0 - printk("trace %s rip %lx rsp %lx rax %d origrax %d caller %lx tiflags %x ptrace %x\n", - current->comm, - regs->rip, regs->rsp, regs->rax, regs->orig_rax, __builtin_return_address(0), - current_thread_info()->flags, current->ptrace); -#endif - - ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) - ? 0x80 : 0)); - /* - * this isn't the same as continuing with a signal, but it will do - * for normal use. strace only continues with a signal if the - * stopping signal is not SIGTRAP. 
-brl - */ - if (current->exit_code) { - send_sig(current->exit_code, current, 1); - current->exit_code = 0; - } -} - -asmlinkage void syscall_trace_enter(struct pt_regs *regs) -{ - /* do the secure computing check first */ - secure_computing(regs->orig_rax); - - if (test_thread_flag(TIF_SYSCALL_TRACE) - && (current->ptrace & PT_PTRACED)) - syscall_trace(regs); - - if (unlikely(current->audit_context)) { - if (test_thread_flag(TIF_IA32)) { - audit_syscall_entry(AUDIT_ARCH_I386, - regs->orig_rax, - regs->rbx, regs->rcx, - regs->rdx, regs->rsi); - } else { - audit_syscall_entry(AUDIT_ARCH_X86_64, - regs->orig_rax, - regs->rdi, regs->rsi, - regs->rdx, regs->r10); - } - } -} - -asmlinkage void syscall_trace_leave(struct pt_regs *regs) -{ - if (unlikely(current->audit_context)) - audit_syscall_exit(AUDITSC_RESULT(regs->rax), regs->rax); - - if ((test_thread_flag(TIF_SYSCALL_TRACE) - || test_thread_flag(TIF_SINGLESTEP)) - && (current->ptrace & PT_PTRACED)) - syscall_trace(regs); -} diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index fab30e134836..150ba29a0d33 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -162,6 +162,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_31, ich_force_enable_hpet); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_1, ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_7, + ich_force_enable_hpet); static struct pci_dev *cached_dev; diff --git a/arch/x86/kernel/reboot_32.c b/arch/x86/kernel/reboot.c index bb1a0f889c5e..5818dc28167d 100644 --- a/arch/x86/kernel/reboot_32.c +++ b/arch/x86/kernel/reboot.c @@ -1,64 +1,94 @@ -#include <linux/mm.h> #include <linux/module.h> -#include <linux/delay.h> #include <linux/init.h> -#include <linux/interrupt.h> -#include <linux/mc146818rtc.h> -#include <linux/efi.h> -#include <linux/dmi.h> -#include <linux/ctype.h> -#include <linux/pm.h> #include <linux/reboot.h> -#include <asm/uaccess.h> +#include <linux/init.h> +#include <linux/pm.h> +#include <linux/efi.h> +#include <acpi/reboot.h> +#include <asm/io.h> #include <asm/apic.h> -#include <asm/hpet.h> #include <asm/desc.h> -#include "mach_reboot.h" +#include <asm/hpet.h> #include <asm/reboot_fixups.h> #include <asm/reboot.h> +#ifdef CONFIG_X86_32 +# include <linux/dmi.h> +# include <linux/ctype.h> +# include <linux/mc146818rtc.h> +# include <asm/pgtable.h> +#else +# include <asm/iommu.h> +#endif + /* * Power off function, if any */ void (*pm_power_off)(void); EXPORT_SYMBOL(pm_power_off); +static long no_idt[3]; static int reboot_mode; -static int reboot_thru_bios; +enum reboot_type reboot_type = BOOT_KBD; +int reboot_force; -#ifdef CONFIG_SMP +#if defined(CONFIG_X86_32) && defined(CONFIG_SMP) static int reboot_cpu = -1; #endif + +/* reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] + warm Don't set the cold reboot flag + cold Set the cold reboot flag + bios Reboot by jumping through the BIOS (only for X86_32) + smp Reboot by executing reset on BSP or other CPU (only for X86_32) + triple Force a triple fault (init) + kbd Use the keyboard controller. cold reset (default) + acpi Use the RESET_REG in the FADT + efi Use efi reset_system runtime service + force Avoid anything that could hang. 
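Given the first-character parsing in the reboot_setup() that follows, typical command lines look like reboot=acpi,force (use the FADT reset register and skip machine_shutdown() before restarting), reboot=warm (leave the warm-boot flag set so the BIOS skips its memory test), or, on 32-bit only, reboot=bios; since only the leading character of each comma-separated word is examined, reboot=a,f behaves the same as reboot=acpi,force.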
+ */ static int __init reboot_setup(char *str) { - while(1) { + for (;;) { switch (*str) { - case 'w': /* "warm" reboot (no memory testing etc) */ + case 'w': reboot_mode = 0x1234; break; - case 'c': /* "cold" reboot (with memory testing etc) */ - reboot_mode = 0x0; - break; - case 'b': /* "bios" reboot by jumping through the BIOS */ - reboot_thru_bios = 1; - break; - case 'h': /* "hard" reboot by toggling RESET and/or crashing the CPU */ - reboot_thru_bios = 0; + + case 'c': + reboot_mode = 0; break; + +#ifdef CONFIG_X86_32 #ifdef CONFIG_SMP - case 's': /* "smp" reboot by executing reset on BSP or other CPU*/ + case 's': if (isdigit(*(str+1))) { reboot_cpu = (int) (*(str+1) - '0'); if (isdigit(*(str+2))) reboot_cpu = reboot_cpu*10 + (int)(*(str+2) - '0'); } - /* we will leave sorting out the final value - when we are ready to reboot, since we might not - have set up boot_cpu_id or smp_num_cpu */ + /* we will leave sorting out the final value + when we are ready to reboot, since we might not + have set up boot_cpu_id or smp_num_cpu */ break; +#endif /* CONFIG_SMP */ + + case 'b': #endif + case 'a': + case 'k': + case 't': + case 'e': + reboot_type = *str; + break; + + case 'f': + reboot_force = 1; + break; } - if((str = strchr(str,',')) != NULL) + + str = strchr(str, ','); + if (str) str++; else break; @@ -68,18 +98,21 @@ static int __init reboot_setup(char *str) __setup("reboot=", reboot_setup); + +#ifdef CONFIG_X86_32 /* * Reboot options and system auto-detection code provided by * Dell Inc. so their systems "just work". :-) */ /* - * Some machines require the "reboot=b" commandline option, this quirk makes that automatic. + * Some machines require the "reboot=b" commandline option, + * this quirk makes that automatic. */ static int __init set_bios_reboot(const struct dmi_system_id *d) { - if (!reboot_thru_bios) { - reboot_thru_bios = 1; + if (reboot_type != BOOT_BIOS) { + reboot_type = BOOT_BIOS; printk(KERN_INFO "%s series board detected. Selecting BIOS-method for reboots.\n", d->ident); } return 0; @@ -143,7 +176,6 @@ static int __init reboot_init(void) dmi_check_system(reboot_dmi_table); return 0; } - core_initcall(reboot_init); /* The following code and data reboots the machine by switching to real @@ -152,7 +184,6 @@ core_initcall(reboot_init); controller to pulse the CPU reset line, which is more thorough, but doesn't work with at least one type of 486 motherboard. It is easy to stop this code working; hence the copious comments. */ - static unsigned long long real_mode_gdt_entries [3] = { @@ -161,11 +192,9 @@ real_mode_gdt_entries [3] = 0x000092000100ffffULL /* 16-bit real-mode 64k data at 0x00000100 */ }; -static struct Xgt_desc_struct +static struct desc_ptr real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, (long)real_mode_gdt_entries }, -real_mode_idt = { 0x3ff, 0 }, -no_idt = { 0, 0 }; - +real_mode_idt = { 0x3ff, 0 }; /* This is 16-bit protected mode code to disable paging and the cache, switch to real mode and jump to the BIOS reset code. @@ -185,7 +214,6 @@ no_idt = { 0, 0 }; More could be done here to set up the registers as if a CPU reset had occurred; hopefully real BIOSs don't assume much. */ - static unsigned char real_mode_switch [] = { 0x66, 0x0f, 0x20, 0xc0, /* movl %cr0,%eax */ @@ -223,7 +251,6 @@ void machine_real_restart(unsigned char *code, int length) `outb_p' is needed instead of just `outb'. Use it to be on the safe side. (Yes, CMOS_WRITE does outb_p's. - Paul G.) 
*/ - spin_lock(&rtc_lock); CMOS_WRITE(0x00, 0x8f); spin_unlock(&rtc_lock); @@ -231,9 +258,8 @@ void machine_real_restart(unsigned char *code, int length) /* Remap the kernel at virtual address zero, as well as offset zero from the kernel segment. This assumes the kernel segment starts at virtual address PAGE_OFFSET. */ - - memcpy (swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, - sizeof (swapper_pg_dir [0]) * KERNEL_PGD_PTRS); + memcpy(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, + sizeof(swapper_pg_dir [0]) * KERNEL_PGD_PTRS); /* * Use `swapper_pg_dir' as our page directory. @@ -245,7 +271,6 @@ void machine_real_restart(unsigned char *code, int length) boot)". This seems like a fairly standard thing that gets set by REBOOT.COM programs, and the previous reset routine did this too. */ - *((unsigned short *)0x472) = reboot_mode; /* For the switch to real mode, copy some code to low memory. It has @@ -253,19 +278,16 @@ void machine_real_restart(unsigned char *code, int length) has to have the same physical and virtual address, because it turns off paging. Copy it near the end of the first page, out of the way of BIOS variables. */ - - memcpy ((void *) (0x1000 - sizeof (real_mode_switch) - 100), + memcpy((void *)(0x1000 - sizeof(real_mode_switch) - 100), real_mode_switch, sizeof (real_mode_switch)); - memcpy ((void *) (0x1000 - 100), code, length); + memcpy((void *)(0x1000 - 100), code, length); /* Set up the IDT for real mode. */ - load_idt(&real_mode_idt); /* Set up a GDT from which we can load segment descriptors for real mode. The GDT is not used in real mode; it is just needed here to prepare the descriptors. */ - load_gdt(&real_mode_gdt); /* Load the data segment registers, and thus the descriptors ready for @@ -273,7 +295,6 @@ void machine_real_restart(unsigned char *code, int length) selector value being loaded here. This is so that the segment registers don't have to be reloaded after switching to real mode: the values are consistent for real mode operation already. */ - __asm__ __volatile__ ("movl $0x0010,%%eax\n" "\tmovl %%eax,%%ds\n" "\tmovl %%eax,%%es\n" @@ -284,130 +305,147 @@ void machine_real_restart(unsigned char *code, int length) /* Jump to the 16-bit code that we copied earlier. It disables paging and the cache, switches to real mode, and jumps to the BIOS reset entry point. 
*/ - __asm__ __volatile__ ("ljmp $0x0008,%0" : - : "i" ((void *) (0x1000 - sizeof (real_mode_switch) - 100))); + : "i" ((void *)(0x1000 - sizeof (real_mode_switch) - 100))); } #ifdef CONFIG_APM_MODULE EXPORT_SYMBOL(machine_real_restart); #endif -static void native_machine_shutdown(void) +#endif /* CONFIG_X86_32 */ + +static inline void kb_wait(void) +{ + int i; + + for (i = 0; i < 0x10000; i++) { + if ((inb(0x64) & 0x02) == 0) + break; + udelay(2); + } +} + +void machine_emergency_restart(void) +{ + int i; + + /* Tell the BIOS if we want cold or warm reboot */ + *((unsigned short *)__va(0x472)) = reboot_mode; + + for (;;) { + /* Could also try the reset bit in the Hammer NB */ + switch (reboot_type) { + case BOOT_KBD: + for (i = 0; i < 10; i++) { + kb_wait(); + udelay(50); + outb(0xfe, 0x64); /* pulse reset low */ + udelay(50); + } + + case BOOT_TRIPLE: + load_idt((const struct desc_ptr *)&no_idt); + __asm__ __volatile__("int3"); + + reboot_type = BOOT_KBD; + break; + +#ifdef CONFIG_X86_32 + case BOOT_BIOS: + machine_real_restart(jump_to_bios, sizeof(jump_to_bios)); + + reboot_type = BOOT_KBD; + break; +#endif + + case BOOT_ACPI: + acpi_reboot(); + reboot_type = BOOT_KBD; + break; + + + case BOOT_EFI: + if (efi_enabled) + efi.reset_system(reboot_mode ? EFI_RESET_WARM : EFI_RESET_COLD, + EFI_SUCCESS, 0, NULL); + + reboot_type = BOOT_KBD; + break; + } + } +} + +void machine_shutdown(void) { + /* Stop the cpus and apics */ #ifdef CONFIG_SMP int reboot_cpu_id; /* The boot cpu is always logical cpu 0 */ reboot_cpu_id = 0; +#ifdef CONFIG_X86_32 /* See if there has been given a command line override */ if ((reboot_cpu != -1) && (reboot_cpu < NR_CPUS) && - cpu_isset(reboot_cpu, cpu_online_map)) { + cpu_isset(reboot_cpu, cpu_online_map)) reboot_cpu_id = reboot_cpu; - } +#endif - /* Make certain the cpu I'm rebooting on is online */ - if (!cpu_isset(reboot_cpu_id, cpu_online_map)) { + /* Make certain the cpu I'm about to reboot on is online */ + if (!cpu_isset(reboot_cpu_id, cpu_online_map)) reboot_cpu_id = smp_processor_id(); - } /* Make certain I only run on the appropriate processor */ set_cpus_allowed(current, cpumask_of_cpu(reboot_cpu_id)); - /* O.K. Now that I'm on the appropriate processor, stop - * all of the others, and disable their local APICs. + /* O.K Now that I'm on the appropriate processor, + * stop all of the others. */ - smp_send_stop(); -#endif /* CONFIG_SMP */ +#endif lapic_shutdown(); #ifdef CONFIG_X86_IO_APIC disable_IO_APIC(); #endif + #ifdef CONFIG_HPET_TIMER hpet_disable(); #endif -} -void __attribute__((weak)) mach_reboot_fixups(void) -{ +#ifdef CONFIG_X86_64 + pci_iommu_shutdown(); +#endif } -static void native_machine_emergency_restart(void) +void machine_restart(char *__unused) { - if (!reboot_thru_bios) { - if (efi_enabled) { - efi.reset_system(EFI_RESET_COLD, EFI_SUCCESS, 0, NULL); - load_idt(&no_idt); - __asm__ __volatile__("int3"); - } - /* rebooting needs to touch the page at absolute addr 0 */ - *((unsigned short *)__va(0x472)) = reboot_mode; - for (;;) { - mach_reboot_fixups(); /* for board specific fixups */ - mach_reboot(); - /* That didn't work - force a triple fault.. 
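The BOOT_KBD branch above reboots by handing command 0xFE to the i8042 keyboard controller, which pulses the CPU reset line. A stand-alone user-space sketch of the same poke (not from the patch; needs root for ioperm(), and actually running it resets the machine):

#include <stdio.h>
#include <sys/io.h>

int main(void)
{
        if (ioperm(0x60, 16, 1) < 0) {          /* access to ports 0x60-0x6f */
                perror("ioperm");
                return 1;
        }
        while (inb(0x64) & 0x02)                /* wait for the input buffer to drain */
                ;
        outb(0xfe, 0x64);                       /* pulse reset low */
        return 0;                               /* normally never reached */
}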
*/ - load_idt(&no_idt); - __asm__ __volatile__("int3"); - } - } - if (efi_enabled) - efi.reset_system(EFI_RESET_WARM, EFI_SUCCESS, 0, NULL); + printk("machine restart\n"); - machine_real_restart(jump_to_bios, sizeof(jump_to_bios)); -} - -static void native_machine_restart(char * __unused) -{ - machine_shutdown(); + if (!reboot_force) + machine_shutdown(); machine_emergency_restart(); } -static void native_machine_halt(void) +void machine_halt(void) { } -static void native_machine_power_off(void) +void machine_power_off(void) { if (pm_power_off) { - machine_shutdown(); + if (!reboot_force) + machine_shutdown(); pm_power_off(); } } - struct machine_ops machine_ops = { - .power_off = native_machine_power_off, - .shutdown = native_machine_shutdown, - .emergency_restart = native_machine_emergency_restart, - .restart = native_machine_restart, - .halt = native_machine_halt, + .power_off = machine_power_off, + .shutdown = machine_shutdown, + .emergency_restart = machine_emergency_restart, + .restart = machine_restart, + .halt = machine_halt }; - -void machine_power_off(void) -{ - machine_ops.power_off(); -} - -void machine_shutdown(void) -{ - machine_ops.shutdown(); -} - -void machine_emergency_restart(void) -{ - machine_ops.emergency_restart(); -} - -void machine_restart(char *cmd) -{ - machine_ops.restart(cmd); -} - -void machine_halt(void) -{ - machine_ops.halt(); -} diff --git a/arch/x86/kernel/reboot_64.c b/arch/x86/kernel/reboot_64.c deleted file mode 100644 index 53620a92a8fd..000000000000 --- a/arch/x86/kernel/reboot_64.c +++ /dev/null @@ -1,176 +0,0 @@ -/* Various gunk just to reboot the machine. */ -#include <linux/module.h> -#include <linux/reboot.h> -#include <linux/init.h> -#include <linux/smp.h> -#include <linux/kernel.h> -#include <linux/ctype.h> -#include <linux/string.h> -#include <linux/pm.h> -#include <linux/kdebug.h> -#include <linux/sched.h> -#include <asm/io.h> -#include <asm/delay.h> -#include <asm/desc.h> -#include <asm/hw_irq.h> -#include <asm/system.h> -#include <asm/pgtable.h> -#include <asm/tlbflush.h> -#include <asm/apic.h> -#include <asm/hpet.h> -#include <asm/gart.h> - -/* - * Power off function, if any - */ -void (*pm_power_off)(void); -EXPORT_SYMBOL(pm_power_off); - -static long no_idt[3]; -static enum { - BOOT_TRIPLE = 't', - BOOT_KBD = 'k' -} reboot_type = BOOT_KBD; -static int reboot_mode = 0; -int reboot_force; - -/* reboot=t[riple] | k[bd] [, [w]arm | [c]old] - warm Don't set the cold reboot flag - cold Set the cold reboot flag - triple Force a triple fault (init) - kbd Use the keyboard controller. cold reset (default) - force Avoid anything that could hang. 
- */ -static int __init reboot_setup(char *str) -{ - for (;;) { - switch (*str) { - case 'w': - reboot_mode = 0x1234; - break; - - case 'c': - reboot_mode = 0; - break; - - case 't': - case 'b': - case 'k': - reboot_type = *str; - break; - case 'f': - reboot_force = 1; - break; - } - if((str = strchr(str,',')) != NULL) - str++; - else - break; - } - return 1; -} - -__setup("reboot=", reboot_setup); - -static inline void kb_wait(void) -{ - int i; - - for (i=0; i<0x10000; i++) - if ((inb_p(0x64) & 0x02) == 0) - break; -} - -void machine_shutdown(void) -{ - unsigned long flags; - - /* Stop the cpus and apics */ -#ifdef CONFIG_SMP - int reboot_cpu_id; - - /* The boot cpu is always logical cpu 0 */ - reboot_cpu_id = 0; - - /* Make certain the cpu I'm about to reboot on is online */ - if (!cpu_isset(reboot_cpu_id, cpu_online_map)) { - reboot_cpu_id = smp_processor_id(); - } - - /* Make certain I only run on the appropriate processor */ - set_cpus_allowed(current, cpumask_of_cpu(reboot_cpu_id)); - - /* O.K Now that I'm on the appropriate processor, - * stop all of the others. - */ - smp_send_stop(); -#endif - - local_irq_save(flags); - -#ifndef CONFIG_SMP - disable_local_APIC(); -#endif - - disable_IO_APIC(); - -#ifdef CONFIG_HPET_TIMER - hpet_disable(); -#endif - local_irq_restore(flags); - - pci_iommu_shutdown(); -} - -void machine_emergency_restart(void) -{ - int i; - - /* Tell the BIOS if we want cold or warm reboot */ - *((unsigned short *)__va(0x472)) = reboot_mode; - - for (;;) { - /* Could also try the reset bit in the Hammer NB */ - switch (reboot_type) { - case BOOT_KBD: - for (i=0; i<10; i++) { - kb_wait(); - udelay(50); - outb(0xfe,0x64); /* pulse reset low */ - udelay(50); - } - - case BOOT_TRIPLE: - load_idt((const struct desc_ptr *)&no_idt); - __asm__ __volatile__("int3"); - - reboot_type = BOOT_KBD; - break; - } - } -} - -void machine_restart(char * __unused) -{ - printk("machine restart\n"); - - if (!reboot_force) { - machine_shutdown(); - } - machine_emergency_restart(); -} - -void machine_halt(void) -{ -} - -void machine_power_off(void) -{ - if (pm_power_off) { - if (!reboot_force) { - machine_shutdown(); - } - pm_power_off(); - } -} - diff --git a/arch/x86/kernel/reboot_fixups_32.c b/arch/x86/kernel/reboot_fixups_32.c index f452726c0fe2..dec0b5ec25c2 100644 --- a/arch/x86/kernel/reboot_fixups_32.c +++ b/arch/x86/kernel/reboot_fixups_32.c @@ -30,6 +30,19 @@ static void cs5536_warm_reset(struct pci_dev *dev) udelay(50); /* shouldn't get here but be safe and spin a while */ } +static void rdc321x_reset(struct pci_dev *dev) +{ + unsigned i; + /* Voluntary reset the watchdog timer */ + outl(0x80003840, 0xCF8); + /* Generate a CPU reset on next tick */ + i = inl(0xCFC); + /* Use the minimum timer resolution */ + i |= 0x1600; + outl(i, 0xCFC); + outb(1, 0x92); +} + struct device_fixup { unsigned int vendor; unsigned int device; @@ -40,6 +53,7 @@ static struct device_fixup fixups_table[] = { { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, cs5530a_warm_reset }, { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, cs5536_warm_reset }, { PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE, cs5530a_warm_reset }, +{ PCI_VENDOR_ID_RDC, PCI_DEVICE_ID_RDC_R6030, rdc321x_reset }, }; /* diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c new file mode 100644 index 000000000000..eb9b1a198f5e --- /dev/null +++ b/arch/x86/kernel/rtc.c @@ -0,0 +1,204 @@ +/* + * RTC related functions + */ +#include <linux/acpi.h> +#include <linux/bcd.h> +#include <linux/mc146818rtc.h> + +#include 
<asm/time.h> +#include <asm/vsyscall.h> + +#ifdef CONFIG_X86_32 +# define CMOS_YEARS_OFFS 1900 +/* + * This is a special lock that is owned by the CPU and holds the index + * register we are working with. It is required for NMI access to the + * CMOS/RTC registers. See include/asm-i386/mc146818rtc.h for details. + */ +volatile unsigned long cmos_lock = 0; +EXPORT_SYMBOL(cmos_lock); +#else +/* + * x86-64 systems only exists since 2002. + * This will work up to Dec 31, 2100 + */ +# define CMOS_YEARS_OFFS 2000 +#endif + +DEFINE_SPINLOCK(rtc_lock); +EXPORT_SYMBOL(rtc_lock); + +/* + * In order to set the CMOS clock precisely, set_rtc_mmss has to be + * called 500 ms after the second nowtime has started, because when + * nowtime is written into the registers of the CMOS clock, it will + * jump to the next second precisely 500 ms later. Check the Motorola + * MC146818A or Dallas DS12887 data sheet for details. + * + * BUG: This routine does not handle hour overflow properly; it just + * sets the minutes. Usually you'll only notice that after reboot! + */ +int mach_set_rtc_mmss(unsigned long nowtime) +{ + int retval = 0; + int real_seconds, real_minutes, cmos_minutes; + unsigned char save_control, save_freq_select; + + /* tell the clock it's being set */ + save_control = CMOS_READ(RTC_CONTROL); + CMOS_WRITE((save_control|RTC_SET), RTC_CONTROL); + + /* stop and reset prescaler */ + save_freq_select = CMOS_READ(RTC_FREQ_SELECT); + CMOS_WRITE((save_freq_select|RTC_DIV_RESET2), RTC_FREQ_SELECT); + + cmos_minutes = CMOS_READ(RTC_MINUTES); + if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) + BCD_TO_BIN(cmos_minutes); + + /* + * since we're only adjusting minutes and seconds, + * don't interfere with hour overflow. This avoids + * messing with unknown time zones but requires your + * RTC not to be off by more than 15 minutes + */ + real_seconds = nowtime % 60; + real_minutes = nowtime / 60; + /* correct for half hour time zone */ + if (((abs(real_minutes - cmos_minutes) + 15)/30) & 1) + real_minutes += 30; + real_minutes %= 60; + + if (abs(real_minutes - cmos_minutes) < 30) { + if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) { + BIN_TO_BCD(real_seconds); + BIN_TO_BCD(real_minutes); + } + CMOS_WRITE(real_seconds,RTC_SECONDS); + CMOS_WRITE(real_minutes,RTC_MINUTES); + } else { + printk(KERN_WARNING + "set_rtc_mmss: can't update from %d to %d\n", + cmos_minutes, real_minutes); + retval = -1; + } + + /* The following flags have to be released exactly in this order, + * otherwise the DS12887 (popular MC146818A clone with integrated + * battery and quartz) will not reset the oscillator and will not + * update precisely 500 ms later. You won't find this mentioned in + * the Dallas Semiconductor data sheets, but who believes data + * sheets anyway ... -- Markus Kuhn + */ + CMOS_WRITE(save_control, RTC_CONTROL); + CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); + + return retval; +} + +unsigned long mach_get_cmos_time(void) +{ + unsigned int year, mon, day, hour, min, sec, century = 0; + + /* + * If UIP is clear, then we have >= 244 microseconds before + * RTC registers will be updated. Spec sheet says that this + * is the reliable way to read RTC - registers. If UIP is set + * then the register access might be invalid. 
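mach_set_rtc_mmss() above only rewrites the seconds and minutes registers, so it first converts the CMOS minutes from BCD and then nudges the new value by 30 when the RTC is running in a half-hour-offset time zone. A stand-alone sketch of those two steps with worked values (not from the patch):

#include <stdio.h>
#include <stdlib.h>

static unsigned bcd_to_bin(unsigned v)
{
        return (v & 0x0f) + (v >> 4) * 10;      /* the conversion BCD_TO_BIN() performs */
}

static int adjust_minutes(int real_minutes, int cmos_minutes)
{
        /* half-hour time zone correction, as in mach_set_rtc_mmss() */
        if (((abs(real_minutes - cmos_minutes) + 15) / 30) & 1)
                real_minutes += 30;
        return real_minutes % 60;
}

int main(void)
{
        printf("%u\n", bcd_to_bin(0x59));               /* BCD 0x59 -> 59 */
        printf("%d\n", adjust_minutes(5, 7));           /* same zone: stays 5 */
        printf("%d\n", adjust_minutes(5, 37));          /* RTC half an hour off: 35 */
        return 0;
}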
+ */ + while ((CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP)) + cpu_relax(); + + sec = CMOS_READ(RTC_SECONDS); + min = CMOS_READ(RTC_MINUTES); + hour = CMOS_READ(RTC_HOURS); + day = CMOS_READ(RTC_DAY_OF_MONTH); + mon = CMOS_READ(RTC_MONTH); + year = CMOS_READ(RTC_YEAR); + +#if defined(CONFIG_ACPI) && defined(CONFIG_X86_64) + /* CHECKME: Is this really 64bit only ??? */ + if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID && + acpi_gbl_FADT.century) + century = CMOS_READ(acpi_gbl_FADT.century); +#endif + + if (RTC_ALWAYS_BCD || !(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY)) { + BCD_TO_BIN(sec); + BCD_TO_BIN(min); + BCD_TO_BIN(hour); + BCD_TO_BIN(day); + BCD_TO_BIN(mon); + BCD_TO_BIN(year); + } + + if (century) { + BCD_TO_BIN(century); + year += century * 100; + printk(KERN_INFO "Extended CMOS year: %d\n", century * 100); + } else { + year += CMOS_YEARS_OFFS; + if (year < 1970) + year += 100; + } + + return mktime(year, mon, day, hour, min, sec); +} + +/* Routines for accessing the CMOS RAM/RTC. */ +unsigned char rtc_cmos_read(unsigned char addr) +{ + unsigned char val; + + lock_cmos_prefix(addr); + outb_p(addr, RTC_PORT(0)); + val = inb_p(RTC_PORT(1)); + lock_cmos_suffix(addr); + return val; +} +EXPORT_SYMBOL(rtc_cmos_read); + +void rtc_cmos_write(unsigned char val, unsigned char addr) +{ + lock_cmos_prefix(addr); + outb_p(addr, RTC_PORT(0)); + outb_p(val, RTC_PORT(1)); + lock_cmos_suffix(addr); +} +EXPORT_SYMBOL(rtc_cmos_write); + +static int set_rtc_mmss(unsigned long nowtime) +{ + int retval; + unsigned long flags; + + spin_lock_irqsave(&rtc_lock, flags); + retval = set_wallclock(nowtime); + spin_unlock_irqrestore(&rtc_lock, flags); + + return retval; +} + +/* not static: needed by APM */ +unsigned long read_persistent_clock(void) +{ + unsigned long retval, flags; + + spin_lock_irqsave(&rtc_lock, flags); + retval = get_wallclock(); + spin_unlock_irqrestore(&rtc_lock, flags); + + return retval; +} + +int update_persistent_clock(struct timespec now) +{ + return set_rtc_mmss(now.tv_sec); +} + +unsigned long long native_read_tsc(void) +{ + return __native_read_tsc(); +} +EXPORT_SYMBOL(native_read_tsc); + diff --git a/arch/x86/kernel/setup64.c b/arch/x86/kernel/setup64.c index 3558ac78c926..309366f8f603 100644 --- a/arch/x86/kernel/setup64.c +++ b/arch/x86/kernel/setup64.c @@ -24,7 +24,11 @@ #include <asm/sections.h> #include <asm/setup.h> +#ifndef CONFIG_DEBUG_BOOT_PARAMS struct boot_params __initdata boot_params; +#else +struct boot_params boot_params; +#endif cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; @@ -37,6 +41,8 @@ struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned"))); unsigned long __supported_pte_mask __read_mostly = ~0UL; +EXPORT_SYMBOL_GPL(__supported_pte_mask); + static int do_not_nx __cpuinitdata = 0; /* noexec=on|off @@ -80,6 +86,43 @@ static int __init nonx32_setup(char *str) __setup("noexec32=", nonx32_setup); /* + * Copy data used in early init routines from the initial arrays to the + * per cpu data areas. These arrays then become expendable and the + * *_early_ptr's are zeroed indicating that the static arrays are gone. 
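The rtc_cmos_read()/rtc_cmos_write() helpers added above follow the classic CMOS protocol: write the register index to port 0x70, then transfer the byte through port 0x71. A simplified user-space sketch of the read side (not from the patch; needs root for ioperm(), omits the I/O pause of outb_p(); register 0 is the RTC seconds byte, usually in BCD):

#include <stdio.h>
#include <sys/io.h>

static unsigned char cmos_read(unsigned char addr)
{
        outb(addr, 0x70);       /* select the index register */
        return inb(0x71);       /* read the data register */
}

int main(void)
{
        if (ioperm(0x70, 2, 1) < 0) {
                perror("ioperm");
                return 1;
        }
        printf("RTC seconds register: %#x\n", cmos_read(0x00));
        return 0;
}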
+ */ +static void __init setup_per_cpu_maps(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { +#ifdef CONFIG_SMP + if (per_cpu_offset(cpu)) { +#endif + per_cpu(x86_cpu_to_apicid, cpu) = + x86_cpu_to_apicid_init[cpu]; + per_cpu(x86_bios_cpu_apicid, cpu) = + x86_bios_cpu_apicid_init[cpu]; +#ifdef CONFIG_NUMA + per_cpu(x86_cpu_to_node_map, cpu) = + x86_cpu_to_node_map_init[cpu]; +#endif +#ifdef CONFIG_SMP + } + else + printk(KERN_NOTICE "per_cpu_offset zero for cpu %d\n", + cpu); +#endif + } + + /* indicate the early static arrays will soon be gone */ + x86_cpu_to_apicid_early_ptr = NULL; + x86_bios_cpu_apicid_early_ptr = NULL; +#ifdef CONFIG_NUMA + x86_cpu_to_node_map_early_ptr = NULL; +#endif +} + +/* * Great future plan: * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data. * Always point %gs to its beginning @@ -100,18 +143,21 @@ void __init setup_per_cpu_areas(void) for_each_cpu_mask (i, cpu_possible_map) { char *ptr; - if (!NODE_DATA(cpu_to_node(i))) { + if (!NODE_DATA(early_cpu_to_node(i))) { printk("cpu with no node %d, num_online_nodes %d\n", i, num_online_nodes()); ptr = alloc_bootmem_pages(size); } else { - ptr = alloc_bootmem_pages_node(NODE_DATA(cpu_to_node(i)), size); + ptr = alloc_bootmem_pages_node(NODE_DATA(early_cpu_to_node(i)), size); } if (!ptr) panic("Cannot allocate cpu data for CPU %d\n", i); cpu_pda(i)->data_offset = ptr - __per_cpu_start; memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); } + + /* setup percpu data maps early */ + setup_per_cpu_maps(); } void pda_init(int cpu) @@ -169,7 +215,8 @@ void syscall_init(void) #endif /* Flags to clear on syscall */ - wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000); + wrmsrl(MSR_SYSCALL_MASK, + X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL); } void __cpuinit check_efer(void) @@ -227,7 +274,7 @@ void __cpuinit cpu_init (void) * and set up the GDT descriptor: */ if (cpu) - memcpy(cpu_gdt(cpu), cpu_gdt_table, GDT_SIZE); + memcpy(get_cpu_gdt_table(cpu), cpu_gdt_table, GDT_SIZE); cpu_gdt_descr[cpu].size = GDT_SIZE; load_gdt((const struct desc_ptr *)&cpu_gdt_descr[cpu]); @@ -257,10 +304,10 @@ void __cpuinit cpu_init (void) v, cpu); } estacks += PAGE_SIZE << order[v]; - orig_ist->ist[v] = t->ist[v] = (unsigned long)estacks; + orig_ist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks; } - t->io_bitmap_base = offsetof(struct tss_struct, io_bitmap); + t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); /* * <= is required because the CPU will access up to * 8 bits beyond the end of the IO permission bitmap. diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index 9c24b45b513c..62adc5f20be5 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -44,9 +44,12 @@ #include <linux/crash_dump.h> #include <linux/dmi.h> #include <linux/pfn.h> +#include <linux/pci.h> +#include <linux/init_ohci1394_dma.h> #include <video/edid.h> +#include <asm/mtrr.h> #include <asm/apic.h> #include <asm/e820.h> #include <asm/mpspec.h> @@ -67,14 +70,83 @@ address, and must not be in the .bss segment! */ unsigned long init_pg_tables_end __initdata = ~0UL; -int disable_pse __cpuinitdata = 0; - /* * Machine setup.. 
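Aside: in the syscall_init() hunk above, the magic constant EF_TF|EF_DF|EF_IE|0x3000 is replaced by named EFLAGS bits. Assuming the standard EFLAGS layout (TF=0x100, IF=0x200, DF=0x400, IOPL=0x3000), the value written to MSR_SYSCALL_MASK is unchanged; a quick standalone check:

#include <assert.h>

#define X86_EFLAGS_TF	0x00000100	/* trap flag */
#define X86_EFLAGS_IF	0x00000200	/* interrupt enable flag */
#define X86_EFLAGS_DF	0x00000400	/* direction flag */
#define X86_EFLAGS_IOPL	0x00003000	/* both I/O privilege level bits */

int main(void)
{
	/* old mask: EF_TF|EF_DF|EF_IE|0x3000 == 0x100|0x400|0x200|0x3000 */
	assert((X86_EFLAGS_TF | X86_EFLAGS_DF | X86_EFLAGS_IF |
		X86_EFLAGS_IOPL) == 0x3700);
	return 0;
}

So the rewrite is cosmetic: SYSCALL still clears TF, IF, DF and both IOPL bits on kernel entry, it just says so in readable terms.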
*/ -extern struct resource code_resource; -extern struct resource data_resource; -extern struct resource bss_resource; +static struct resource data_resource = { + .name = "Kernel data", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; + +static struct resource code_resource = { + .name = "Kernel code", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; + +static struct resource bss_resource = { + .name = "Kernel bss", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; + +static struct resource video_ram_resource = { + .name = "Video RAM area", + .start = 0xa0000, + .end = 0xbffff, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; + +static struct resource standard_io_resources[] = { { + .name = "dma1", + .start = 0x0000, + .end = 0x001f, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "pic1", + .start = 0x0020, + .end = 0x0021, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "timer0", + .start = 0x0040, + .end = 0x0043, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "timer1", + .start = 0x0050, + .end = 0x0053, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "keyboard", + .start = 0x0060, + .end = 0x006f, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "dma page reg", + .start = 0x0080, + .end = 0x008f, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "pic2", + .start = 0x00a0, + .end = 0x00a1, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "dma2", + .start = 0x00c0, + .end = 0x00df, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "fpu", + .start = 0x00f0, + .end = 0x00ff, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +} }; /* cpu data as detected by the assembly code in head.S */ struct cpuinfo_x86 new_cpu_data __cpuinitdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 }; @@ -116,13 +188,17 @@ extern int root_mountflags; unsigned long saved_videomode; -#define RAMDISK_IMAGE_START_MASK 0x07FF +#define RAMDISK_IMAGE_START_MASK 0x07FF #define RAMDISK_PROMPT_FLAG 0x8000 -#define RAMDISK_LOAD_FLAG 0x4000 +#define RAMDISK_LOAD_FLAG 0x4000 static char __initdata command_line[COMMAND_LINE_SIZE]; +#ifndef CONFIG_DEBUG_BOOT_PARAMS struct boot_params __initdata boot_params; +#else +struct boot_params boot_params; +#endif #if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) struct edd edd; @@ -166,8 +242,7 @@ static int __init parse_mem(char *arg) return -EINVAL; if (strcmp(arg, "nopentium") == 0) { - clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability); - disable_pse = 1; + setup_clear_cpu_cap(X86_FEATURE_PSE); } else { /* If the user specifies memory size, we * limit the BIOS-provided memory map to @@ -176,7 +251,7 @@ static int __init parse_mem(char *arg) * trim the existing memory map. 
*/ unsigned long long mem_size; - + mem_size = memparse(arg, &arg); limit_regions(mem_size); user_defined_memmap = 1; @@ -315,7 +390,7 @@ static void __init reserve_ebda_region(void) unsigned int addr; addr = get_bios_ebda(); if (addr) - reserve_bootmem(addr, PAGE_SIZE); + reserve_bootmem(addr, PAGE_SIZE); } #ifndef CONFIG_NEED_MULTIPLE_NODES @@ -420,6 +495,100 @@ static inline void __init reserve_crashkernel(void) {} #endif +#ifdef CONFIG_BLK_DEV_INITRD + +static bool do_relocate_initrd = false; + +static void __init reserve_initrd(void) +{ + unsigned long ramdisk_image = boot_params.hdr.ramdisk_image; + unsigned long ramdisk_size = boot_params.hdr.ramdisk_size; + unsigned long ramdisk_end = ramdisk_image + ramdisk_size; + unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT; + unsigned long ramdisk_here; + + initrd_start = 0; + + if (!boot_params.hdr.type_of_loader || + !ramdisk_image || !ramdisk_size) + return; /* No initrd provided by bootloader */ + + if (ramdisk_end < ramdisk_image) { + printk(KERN_ERR "initrd wraps around end of memory, " + "disabling initrd\n"); + return; + } + if (ramdisk_size >= end_of_lowmem/2) { + printk(KERN_ERR "initrd too large to handle, " + "disabling initrd\n"); + return; + } + if (ramdisk_end <= end_of_lowmem) { + /* All in lowmem, easy case */ + reserve_bootmem(ramdisk_image, ramdisk_size); + initrd_start = ramdisk_image + PAGE_OFFSET; + initrd_end = initrd_start+ramdisk_size; + return; + } + + /* We need to move the initrd down into lowmem */ + ramdisk_here = (end_of_lowmem - ramdisk_size) & PAGE_MASK; + + /* Note: this includes all the lowmem currently occupied by + the initrd, we rely on that fact to keep the data intact. */ + reserve_bootmem(ramdisk_here, ramdisk_size); + initrd_start = ramdisk_here + PAGE_OFFSET; + initrd_end = initrd_start + ramdisk_size; + + do_relocate_initrd = true; +} + +#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) + +static void __init relocate_initrd(void) +{ + unsigned long ramdisk_image = boot_params.hdr.ramdisk_image; + unsigned long ramdisk_size = boot_params.hdr.ramdisk_size; + unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT; + unsigned long ramdisk_here; + unsigned long slop, clen, mapaddr; + char *p, *q; + + if (!do_relocate_initrd) + return; + + ramdisk_here = initrd_start - PAGE_OFFSET; + + q = (char *)initrd_start; + + /* Copy any lowmem portion of the initrd */ + if (ramdisk_image < end_of_lowmem) { + clen = end_of_lowmem - ramdisk_image; + p = (char *)__va(ramdisk_image); + memcpy(q, p, clen); + q += clen; + ramdisk_image += clen; + ramdisk_size -= clen; + } + + /* Copy the highmem portion of the initrd */ + while (ramdisk_size) { + slop = ramdisk_image & ~PAGE_MASK; + clen = ramdisk_size; + if (clen > MAX_MAP_CHUNK-slop) + clen = MAX_MAP_CHUNK-slop; + mapaddr = ramdisk_image & PAGE_MASK; + p = early_ioremap(mapaddr, clen+slop); + memcpy(q, p+slop, clen); + early_iounmap(p, clen+slop); + q += clen; + ramdisk_image += clen; + ramdisk_size -= clen; + } +} + +#endif /* CONFIG_BLK_DEV_INITRD */ + void __init setup_bootmem_allocator(void) { unsigned long bootmap_size; @@ -475,26 +644,10 @@ void __init setup_bootmem_allocator(void) */ find_smp_config(); #endif - numa_kva_reserve(); #ifdef CONFIG_BLK_DEV_INITRD - if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) { - unsigned long ramdisk_image = boot_params.hdr.ramdisk_image; - unsigned long ramdisk_size = boot_params.hdr.ramdisk_size; - unsigned long ramdisk_end = ramdisk_image + ramdisk_size; - unsigned long end_of_lowmem = 
max_low_pfn << PAGE_SHIFT; - - if (ramdisk_end <= end_of_lowmem) { - reserve_bootmem(ramdisk_image, ramdisk_size); - initrd_start = ramdisk_image + PAGE_OFFSET; - initrd_end = initrd_start+ramdisk_size; - } else { - printk(KERN_ERR "initrd extends beyond end of memory " - "(0x%08lx > 0x%08lx)\ndisabling initrd\n", - ramdisk_end, end_of_lowmem); - initrd_start = 0; - } - } + reserve_initrd(); #endif + numa_kva_reserve(); reserve_crashkernel(); } @@ -545,17 +698,11 @@ void __init setup_arch(char **cmdline_p) memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); pre_setup_arch_hook(); early_cpu_init(); + early_ioremap_init(); - /* - * FIXME: This isn't an official loader_type right - * now but does currently work with elilo. - * If we were configured as an EFI kernel, check to make - * sure that we were loaded correctly from elilo and that - * the system table is valid. If not, then initialize normally. - */ #ifdef CONFIG_EFI - if ((boot_params.hdr.type_of_loader == 0x50) && - boot_params.efi_info.efi_systab) + if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, + "EL32", 4)) efi_enabled = 1; #endif @@ -579,12 +726,9 @@ void __init setup_arch(char **cmdline_p) rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0); #endif ARCH_SETUP - if (efi_enabled) - efi_init(); - else { - printk(KERN_INFO "BIOS-provided physical RAM map:\n"); - print_memory_map(memory_setup()); - } + + printk(KERN_INFO "BIOS-provided physical RAM map:\n"); + print_memory_map(memory_setup()); copy_edd(); @@ -612,8 +756,16 @@ void __init setup_arch(char **cmdline_p) strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); *cmdline_p = command_line; + if (efi_enabled) + efi_init(); + max_low_pfn = setup_memory(); + /* update e820 for memory not covered by WB MTRRs */ + mtrr_bp_init(); + if (mtrr_trim_uncached_memory(max_pfn)) + max_low_pfn = setup_memory(); + #ifdef CONFIG_VMI /* * Must be after max_low_pfn is determined, and before kernel @@ -636,6 +788,16 @@ void __init setup_arch(char **cmdline_p) smp_alloc_memory(); /* AP processor realmode stacks in low memory*/ #endif paging_init(); + + /* + * NOTE: On x86-32, only from this point on, fixmaps are ready for use. + */ + +#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT + if (init_ohci1394_dma_early) + init_ohci1394_dma_on_all_controllers(); +#endif + remapped_pgdat_init(); sparse_init(); zone_sizes_init(); @@ -644,15 +806,19 @@ void __init setup_arch(char **cmdline_p) * NOTE: at this point the bootmem allocator is fully available. */ +#ifdef CONFIG_BLK_DEV_INITRD + relocate_initrd(); +#endif + paravirt_post_allocator_init(); dmi_scan_machine(); + io_delay_init(); + #ifdef CONFIG_X86_GENERICARCH generic_apic_probe(); -#endif - if (efi_enabled) - efi_map_memmap(); +#endif #ifdef CONFIG_ACPI /* @@ -661,9 +827,7 @@ void __init setup_arch(char **cmdline_p) acpi_boot_table_init(); #endif -#ifdef CONFIG_PCI early_quirks(); -#endif #ifdef CONFIG_ACPI acpi_boot_init(); @@ -692,3 +856,26 @@ void __init setup_arch(char **cmdline_p) #endif #endif } + +/* + * Request address space for all standard resources + * + * This is called just before pcibios_init(), which is also a + * subsys_initcall, but is linked in later (in arch/i386/pci/common.c). 
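Aside: the relocate_initrd() loop added above copies the highmem part of the image through early_ioremap() in bounded pieces, since only a handful of boot-time fixmap slots exist. A standalone sketch of the chunking arithmetic; the addresses, the size, and the 64-page chunk limit are all assumptions made up for illustration:

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PAGE_MASK	(~(PAGE_SIZE - 1))
#define MAX_MAP_CHUNK	(64UL << PAGE_SHIFT)	/* NR_FIX_BTMAPS pages, assumed 64 here */

int main(void)
{
	unsigned long ramdisk_image = 0x38000800UL;	/* not page aligned */
	unsigned long ramdisk_size  = 0x00500000UL;	/* 5 MB */
	unsigned long slop, clen, chunks = 0;

	while (ramdisk_size) {
		slop = ramdisk_image & ~PAGE_MASK;	/* offset within its page */
		clen = ramdisk_size;
		if (clen > MAX_MAP_CHUNK - slop)
			clen = MAX_MAP_CHUNK - slop;
		/* the kernel would early_ioremap(ramdisk_image & PAGE_MASK,
		 * clen + slop) here and memcpy clen bytes out of the mapping */
		ramdisk_image += clen;
		ramdisk_size  -= clen;
		chunks++;
	}
	printf("copied in %lu chunks\n", chunks);
	return 0;
}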
+ */ +static int __init request_standard_resources(void) +{ + int i; + + printk(KERN_INFO "Setting up standard PCI resources\n"); + init_iomem_resources(&code_resource, &data_resource, &bss_resource); + + request_resource(&iomem_resource, &video_ram_resource); + + /* request I/O space for devices used on all i[345]86 PCs */ + for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++) + request_resource(&ioport_resource, &standard_io_resources[i]); + return 0; +} + +subsys_initcall(request_standard_resources); diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index 30d94d1d5f5f..77fb87bf6e5a 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -30,6 +30,7 @@ #include <linux/crash_dump.h> #include <linux/root_dev.h> #include <linux/pci.h> +#include <linux/efi.h> #include <linux/acpi.h> #include <linux/kallsyms.h> #include <linux/edd.h> @@ -39,10 +40,13 @@ #include <linux/dmi.h> #include <linux/dma-mapping.h> #include <linux/ctype.h> +#include <linux/uaccess.h> +#include <linux/init_ohci1394_dma.h> #include <asm/mtrr.h> #include <asm/uaccess.h> #include <asm/system.h> +#include <asm/vsyscall.h> #include <asm/io.h> #include <asm/smp.h> #include <asm/msr.h> @@ -50,6 +54,7 @@ #include <video/edid.h> #include <asm/e820.h> #include <asm/dma.h> +#include <asm/gart.h> #include <asm/mpspec.h> #include <asm/mmu_context.h> #include <asm/proto.h> @@ -59,6 +64,15 @@ #include <asm/sections.h> #include <asm/dmi.h> #include <asm/cacheflush.h> +#include <asm/mce.h> +#include <asm/ds.h> +#include <asm/topology.h> + +#ifdef CONFIG_PARAVIRT +#include <asm/paravirt.h> +#else +#define ARCH_SETUP +#endif /* * Machine setup.. @@ -67,6 +81,8 @@ struct cpuinfo_x86 boot_cpu_data __read_mostly; EXPORT_SYMBOL(boot_cpu_data); +__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; + unsigned long mmu_cr4_features; /* Boot loader ID as an integer, for the benefit of proc_dointvec */ @@ -76,7 +92,7 @@ unsigned long saved_video_mode; int force_mwait __cpuinitdata; -/* +/* * Early DMI memory */ int dmi_alloc_index; @@ -122,25 +138,27 @@ struct resource standard_io_resources[] = { #define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM) -struct resource data_resource = { +static struct resource data_resource = { .name = "Kernel data", .start = 0, .end = 0, .flags = IORESOURCE_RAM, }; -struct resource code_resource = { +static struct resource code_resource = { .name = "Kernel code", .start = 0, .end = 0, .flags = IORESOURCE_RAM, }; -struct resource bss_resource = { +static struct resource bss_resource = { .name = "Kernel bss", .start = 0, .end = 0, .flags = IORESOURCE_RAM, }; +static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c); + #ifdef CONFIG_PROC_VMCORE /* elfcorehdr= specifies the location of elf core header * stored by the crashed kernel. 
This option will be passed @@ -166,12 +184,12 @@ contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn) bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT; bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size); if (bootmap == -1L) - panic("Cannot find bootmem map of size %ld\n",bootmap_size); + panic("Cannot find bootmem map of size %ld\n", bootmap_size); bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn); e820_register_active_regions(0, start_pfn, end_pfn); free_bootmem_with_active_regions(0, end_pfn); reserve_bootmem(bootmap, bootmap_size); -} +} #endif #if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) @@ -205,7 +223,8 @@ static void __init reserve_crashkernel(void) unsigned long long crash_size, crash_base; int ret; - free_mem = ((unsigned long long)max_low_pfn - min_low_pfn) << PAGE_SHIFT; + free_mem = + ((unsigned long long)max_low_pfn - min_low_pfn) << PAGE_SHIFT; ret = parse_crashkernel(boot_command_line, free_mem, &crash_size, &crash_base); @@ -229,33 +248,21 @@ static inline void __init reserve_crashkernel(void) {} #endif -#define EBDA_ADDR_POINTER 0x40E - -unsigned __initdata ebda_addr; -unsigned __initdata ebda_size; - -static void discover_ebda(void) +/* Overridden in paravirt.c if CONFIG_PARAVIRT */ +void __attribute__((weak)) __init memory_setup(void) { - /* - * there is a real-mode segmented pointer pointing to the - * 4K EBDA area at 0x40E - */ - ebda_addr = *(unsigned short *)__va(EBDA_ADDR_POINTER); - ebda_addr <<= 4; - - ebda_size = *(unsigned short *)__va(ebda_addr); - - /* Round EBDA up to pages */ - if (ebda_size == 0) - ebda_size = 1; - ebda_size <<= 10; - ebda_size = round_up(ebda_size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE); - if (ebda_size > 64*1024) - ebda_size = 64*1024; + machine_specific_memory_setup(); } +/* + * setup_arch - architecture-specific boot-time initializations + * + * Note: On x86_64, fixmaps are ready for use even before this is called. 
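Aside: memory_setup() above is given a weak default so that, with CONFIG_PARAVIRT, paravirt.c can supply its own strong definition without an #ifdef at the call site. A minimal illustration of the linker rule involved; the strings and the file split are hypothetical:

#include <stdio.h>

/* default, used only if no strong definition of memory_setup() is linked in */
void __attribute__((weak)) memory_setup(void)
{
	puts("native memory setup");
}

/* if another translation unit defines
 *	void memory_setup(void) { puts("paravirt memory setup"); }
 * the linker picks that strong definition and drops this weak one */
int main(void)
{
	memory_setup();
	return 0;
}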
+ */ void __init setup_arch(char **cmdline_p) { + unsigned i; + printk(KERN_INFO "Command line: %s\n", boot_command_line); ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev); @@ -269,7 +276,15 @@ void __init setup_arch(char **cmdline_p) rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0); rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0); #endif - setup_memory_region(); +#ifdef CONFIG_EFI + if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, + "EL64", 4)) + efi_enabled = 1; +#endif + + ARCH_SETUP + + memory_setup(); copy_edd(); if (!boot_params.hdr.root_flags) @@ -293,27 +308,47 @@ void __init setup_arch(char **cmdline_p) parse_early_param(); +#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT + if (init_ohci1394_dma_early) + init_ohci1394_dma_on_all_controllers(); +#endif + finish_e820_parsing(); + early_gart_iommu_check(); + e820_register_active_regions(0, 0, -1UL); /* * partially used pages are not usable - thus * we are rounding upwards: */ end_pfn = e820_end_of_ram(); + /* update e820 for memory not covered by WB MTRRs */ + mtrr_bp_init(); + if (mtrr_trim_uncached_memory(end_pfn)) { + e820_register_active_regions(0, 0, -1UL); + end_pfn = e820_end_of_ram(); + } + num_physpages = end_pfn; check_efer(); - discover_ebda(); - init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT)); + if (efi_enabled) + efi_init(); dmi_scan_machine(); + io_delay_init(); + #ifdef CONFIG_SMP - /* setup to use the static apicid table during kernel startup */ - x86_cpu_to_apicid_ptr = (void *)&x86_cpu_to_apicid_init; + /* setup to use the early static init tables during kernel startup */ + x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init; + x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init; +#ifdef CONFIG_NUMA + x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init; +#endif #endif #ifdef CONFIG_ACPI @@ -340,48 +375,26 @@ void __init setup_arch(char **cmdline_p) #endif #ifdef CONFIG_NUMA - numa_initmem_init(0, end_pfn); + numa_initmem_init(0, end_pfn); #else contig_initmem_init(0, end_pfn); #endif - /* Reserve direct mapping */ - reserve_bootmem_generic(table_start << PAGE_SHIFT, - (table_end - table_start) << PAGE_SHIFT); - - /* reserve kernel */ - reserve_bootmem_generic(__pa_symbol(&_text), - __pa_symbol(&_end) - __pa_symbol(&_text)); + early_res_to_bootmem(); +#ifdef CONFIG_ACPI_SLEEP /* - * reserve physical page 0 - it's a special BIOS page on many boxes, - * enabling clean reboots, SMP operation, laptop functions. + * Reserve low memory region for sleep support. */ - reserve_bootmem_generic(0, PAGE_SIZE); - - /* reserve ebda region */ - if (ebda_addr) - reserve_bootmem_generic(ebda_addr, ebda_size); -#ifdef CONFIG_NUMA - /* reserve nodemap region */ - if (nodemap_addr) - reserve_bootmem_generic(nodemap_addr, nodemap_size); + acpi_reserve_bootmem(); #endif -#ifdef CONFIG_SMP - /* Reserve SMP trampoline */ - reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, 2*PAGE_SIZE); -#endif + if (efi_enabled) + efi_reserve_bootmem(); -#ifdef CONFIG_ACPI_SLEEP /* - * Reserve low memory region for sleep support. 
- */ - acpi_reserve_bootmem(); -#endif - /* - * Find and reserve possible boot-time SMP configuration: - */ + * Find and reserve possible boot-time SMP configuration: + */ find_smp_config(); #ifdef CONFIG_BLK_DEV_INITRD if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) { @@ -395,6 +408,8 @@ void __init setup_arch(char **cmdline_p) initrd_start = ramdisk_image + PAGE_OFFSET; initrd_end = initrd_start+ramdisk_size; } else { + /* Assumes everything on node 0 */ + free_bootmem(ramdisk_image, ramdisk_size); printk(KERN_ERR "initrd extends beyond end of memory " "(0x%08lx > 0x%08lx)\ndisabling initrd\n", ramdisk_end, end_of_mem); @@ -404,17 +419,10 @@ void __init setup_arch(char **cmdline_p) #endif reserve_crashkernel(); paging_init(); + map_vsyscall(); -#ifdef CONFIG_PCI early_quirks(); -#endif - /* - * set this early, so we dont allocate cpu0 - * if MADT list doesnt list BSP first - * mpparse.c/MP_processor_info() allocates logical cpu numbers. - */ - cpu_set(0, cpu_present_map); #ifdef CONFIG_ACPI /* * Read APIC and some other early information from ACPI tables. @@ -430,25 +438,24 @@ void __init setup_arch(char **cmdline_p) if (smp_found_config) get_smp_config(); init_apic_mappings(); + ioapic_init_mappings(); /* * We trust e820 completely. No explicit ROM probing in memory. - */ - e820_reserve_resources(); + */ + e820_reserve_resources(&code_resource, &data_resource, &bss_resource); e820_mark_nosave_regions(); - { - unsigned i; /* request I/O space for devices used on all i[345]86 PCs */ for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++) request_resource(&ioport_resource, &standard_io_resources[i]); - } e820_setup_gap(); #ifdef CONFIG_VT #if defined(CONFIG_VGA_CONSOLE) - conswitchp = &vga_con; + if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY)) + conswitchp = &vga_con; #elif defined(CONFIG_DUMMY_CONSOLE) conswitchp = &dummy_con; #endif @@ -479,9 +486,10 @@ static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) if (n >= 0x80000005) { cpuid(0x80000005, &dummy, &ebx, &ecx, &edx); - printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n", - edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); - c->x86_cache_size=(ecx>>24)+(edx>>24); + printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), " + "D cache %dK (%d bytes/line)\n", + edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); + c->x86_cache_size = (ecx>>24) + (edx>>24); /* On K8 L1 TLB is inclusive, so don't count it */ c->x86_tlbsize = 0; } @@ -495,11 +503,8 @@ static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n", c->x86_cache_size, ecx & 0xFF); } - - if (n >= 0x80000007) - cpuid(0x80000007, &dummy, &dummy, &dummy, &c->x86_power); if (n >= 0x80000008) { - cpuid(0x80000008, &eax, &dummy, &dummy, &dummy); + cpuid(0x80000008, &eax, &dummy, &dummy, &dummy); c->x86_virt_bits = (eax >> 8) & 0xff; c->x86_phys_bits = eax & 0xff; } @@ -508,14 +513,15 @@ static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) #ifdef CONFIG_NUMA static int nearby_node(int apicid) { - int i; + int i, node; + for (i = apicid - 1; i >= 0; i--) { - int node = apicid_to_node[i]; + node = apicid_to_node[i]; if (node != NUMA_NO_NODE && node_online(node)) return node; } for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) { - int node = apicid_to_node[i]; + node = apicid_to_node[i]; if (node != NUMA_NO_NODE && node_online(node)) return node; } @@ -527,7 +533,7 @@ static int nearby_node(int apicid) * On a AMD dual core setup the lower bits of the APIC id 
distingush the cores. * Assumes number of cores is a power of two. */ -static void __init amd_detect_cmp(struct cpuinfo_x86 *c) +static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP unsigned bits; @@ -536,7 +542,54 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c) int node = 0; unsigned apicid = hard_smp_processor_id(); #endif - unsigned ecx = cpuid_ecx(0x80000008); + bits = c->x86_coreid_bits; + + /* Low order bits define the core id (index of core in socket) */ + c->cpu_core_id = c->phys_proc_id & ((1 << bits)-1); + /* Convert the APIC ID into the socket ID */ + c->phys_proc_id = phys_pkg_id(bits); + +#ifdef CONFIG_NUMA + node = c->phys_proc_id; + if (apicid_to_node[apicid] != NUMA_NO_NODE) + node = apicid_to_node[apicid]; + if (!node_online(node)) { + /* Two possibilities here: + - The CPU is missing memory and no node was created. + In that case try picking one from a nearby CPU + - The APIC IDs differ from the HyperTransport node IDs + which the K8 northbridge parsing fills in. + Assume they are all increased by a constant offset, + but in the same order as the HT nodeids. + If that doesn't result in a usable node fall back to the + path for the previous case. */ + + int ht_nodeid = apicid - (cpu_data(0).phys_proc_id << bits); + + if (ht_nodeid >= 0 && + apicid_to_node[ht_nodeid] != NUMA_NO_NODE) + node = apicid_to_node[ht_nodeid]; + /* Pick a nearby node */ + if (!node_online(node)) + node = nearby_node(apicid); + } + numa_set_node(cpu, node); + + printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); +#endif +#endif +} + +static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_SMP + unsigned bits, ecx; + + /* Multi core CPU? */ + if (c->extended_cpuid_level < 0x80000008) + return; + + ecx = cpuid_ecx(0x80000008); c->x86_max_cores = (ecx & 0xff) + 1; @@ -549,37 +602,8 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c) bits++; } - /* Low order bits define the core id (index of core in socket) */ - c->cpu_core_id = c->phys_proc_id & ((1 << bits)-1); - /* Convert the APIC ID into the socket ID */ - c->phys_proc_id = phys_pkg_id(bits); - -#ifdef CONFIG_NUMA - node = c->phys_proc_id; - if (apicid_to_node[apicid] != NUMA_NO_NODE) - node = apicid_to_node[apicid]; - if (!node_online(node)) { - /* Two possibilities here: - - The CPU is missing memory and no node was created. - In that case try picking one from a nearby CPU - - The APIC IDs differ from the HyperTransport node IDs - which the K8 northbridge parsing fills in. - Assume they are all increased by a constant offset, - but in the same order as the HT nodeids. - If that doesn't result in a usable node fall back to the - path for the previous case. */ - int ht_nodeid = apicid - (cpu_data(0).phys_proc_id << bits); - if (ht_nodeid >= 0 && - apicid_to_node[ht_nodeid] != NUMA_NO_NODE) - node = apicid_to_node[ht_nodeid]; - /* Pick a nearby node */ - if (!node_online(node)) - node = nearby_node(apicid); - } - numa_set_node(cpu, node); + c->x86_coreid_bits = bits; - printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); -#endif #endif } @@ -595,8 +619,8 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c) /* AMD systems with C1E don't have a working lAPIC timer. Check for that. 
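Aside: amd_detect_cmp() above slices the APIC ID so that the low x86_coreid_bits bits (derived from CPUID 0x80000008 in early_init_amd_mc()) select the core and the remaining bits select the socket. A standalone sketch with an invented dual-core layout; the NUMA fallback in the real code exists precisely because APIC IDs are not always laid out this neatly:

#include <stdio.h>

int main(void)
{
	unsigned bits = 1;	/* dual core: one bit of core ID */
	unsigned apicid;

	for (apicid = 0; apicid < 4; apicid++)
		printf("apicid %u -> socket %u, core %u\n",
		       apicid, apicid >> bits, apicid & ((1 << bits) - 1));
	return 0;
}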
*/ static __cpuinit int amd_apic_timer_broken(void) { - u32 lo, hi; - u32 eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE); + u32 lo, hi, eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE); + switch (eax & CPUID_XFAM) { case CPUID_XFAM_K8: if ((eax & CPUID_XMOD) < CPUID_XMOD_REV_F) @@ -614,6 +638,15 @@ static __cpuinit int amd_apic_timer_broken(void) return 0; } +static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) +{ + early_init_amd_mc(c); + + /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */ + if (c->x86_power & (1<<8)) + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); +} + static void __cpuinit init_amd(struct cpuinfo_x86 *c) { unsigned level; @@ -624,7 +657,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) /* * Disable TLB flush filter by setting HWCR.FFDIS on K8 * bit 6 of msr C001_0015 - * + * * Errata 63 for SH-B3 steppings * Errata 122 for all steppings (F+ have it disabled by default) */ @@ -637,35 +670,32 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) /* Bit 31 in normal CPUID used for nonstandard 3DNow ID; 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */ - clear_bit(0*32+31, &c->x86_capability); - + clear_bit(0*32+31, (unsigned long *)&c->x86_capability); + /* On C+ stepping K8 rep microcode works well for copy/memset */ level = cpuid_eax(1); - if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)) - set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability); + if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || + level >= 0x0f58)) + set_cpu_cap(c, X86_FEATURE_REP_GOOD); if (c->x86 == 0x10 || c->x86 == 0x11) - set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability); + set_cpu_cap(c, X86_FEATURE_REP_GOOD); /* Enable workaround for FXSAVE leak */ if (c->x86 >= 6) - set_bit(X86_FEATURE_FXSAVE_LEAK, &c->x86_capability); + set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK); level = get_model_name(c); if (!level) { - switch (c->x86) { + switch (c->x86) { case 15: /* Should distinguish Models here, but this is only a fallback anyways. */ strcpy(c->x86_model_id, "Hammer"); - break; - } - } + break; + } + } display_cacheinfo(c); - /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */ - if (c->x86_power & (1<<8)) - set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability); - /* Multi core CPU? 
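Aside: early_init_amd() above keys constant-TSC support off bit 8 of CPUID leaf 0x80000007 EDX, which the function keeps in c->x86_power. The same probe in stripped-down user-space form, as a GCC inline-asm sketch rather than the kernel's cpuid() helpers:

#include <stdio.h>

static unsigned int cpuid_edx(unsigned int op)
{
	unsigned int eax, ebx, ecx, edx;

	__asm__ volatile("cpuid"
			 : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
			 : "a" (op));
	return edx;
}

int main(void)
{
	/* CPUID 0x80000007 EDX bit 8: invariant/constant-rate TSC */
	printf("constant TSC: %s\n",
	       (cpuid_edx(0x80000007) & (1 << 8)) ? "yes" : "no");
	return 0;
}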
*/ if (c->extended_cpuid_level >= 0x80000008) amd_detect_cmp(c); @@ -677,41 +707,38 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) num_cache_leaves = 3; if (c->x86 == 0xf || c->x86 == 0x10 || c->x86 == 0x11) - set_bit(X86_FEATURE_K8, &c->x86_capability); - - /* RDTSC can be speculated around */ - clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); + set_cpu_cap(c, X86_FEATURE_K8); - /* Family 10 doesn't support C states in MWAIT so don't use it */ - if (c->x86 == 0x10 && !force_mwait) - clear_bit(X86_FEATURE_MWAIT, &c->x86_capability); + /* MFENCE stops RDTSC speculation */ + set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); if (amd_apic_timer_broken()) disable_apic_timer = 1; } -static void __cpuinit detect_ht(struct cpuinfo_x86 *c) +void __cpuinit detect_ht(struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP - u32 eax, ebx, ecx, edx; - int index_msb, core_bits; + u32 eax, ebx, ecx, edx; + int index_msb, core_bits; cpuid(1, &eax, &ebx, &ecx, &edx); if (!cpu_has(c, X86_FEATURE_HT)) return; - if (cpu_has(c, X86_FEATURE_CMP_LEGACY)) + if (cpu_has(c, X86_FEATURE_CMP_LEGACY)) goto out; smp_num_siblings = (ebx & 0xff0000) >> 16; if (smp_num_siblings == 1) { printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); - } else if (smp_num_siblings > 1 ) { + } else if (smp_num_siblings > 1) { if (smp_num_siblings > NR_CPUS) { - printk(KERN_WARNING "CPU: Unsupported number of the siblings %d", smp_num_siblings); + printk(KERN_WARNING "CPU: Unsupported number of " + "siblings %d", smp_num_siblings); smp_num_siblings = 1; return; } @@ -721,7 +748,7 @@ static void __cpuinit detect_ht(struct cpuinfo_x86 *c) smp_num_siblings = smp_num_siblings / c->x86_max_cores; - index_msb = get_count_order(smp_num_siblings) ; + index_msb = get_count_order(smp_num_siblings); core_bits = get_count_order(c->x86_max_cores); @@ -730,8 +757,10 @@ static void __cpuinit detect_ht(struct cpuinfo_x86 *c) } out: if ((c->x86_max_cores * smp_num_siblings) > 1) { - printk(KERN_INFO "CPU: Physical Processor ID: %d\n", c->phys_proc_id); - printk(KERN_INFO "CPU: Processor Core ID: %d\n", c->cpu_core_id); + printk(KERN_INFO "CPU: Physical Processor ID: %d\n", + c->phys_proc_id); + printk(KERN_INFO "CPU: Processor Core ID: %d\n", + c->cpu_core_id); } #endif @@ -773,28 +802,39 @@ static void srat_detect_node(void) #endif } +static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) +{ + if ((c->x86 == 0xf && c->x86_model >= 0x03) || + (c->x86 == 0x6 && c->x86_model >= 0x0e)) + set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability); +} + static void __cpuinit init_intel(struct cpuinfo_x86 *c) { /* Cache sizes */ unsigned n; init_intel_cacheinfo(c); - if (c->cpuid_level > 9 ) { + if (c->cpuid_level > 9) { unsigned eax = cpuid_eax(10); /* Check for version and the number of counters */ if ((eax & 0xff) && (((eax>>8) & 0xff) > 1)) - set_bit(X86_FEATURE_ARCH_PERFMON, &c->x86_capability); + set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); } if (cpu_has_ds) { unsigned int l1, l2; rdmsr(MSR_IA32_MISC_ENABLE, l1, l2); if (!(l1 & (1<<11))) - set_bit(X86_FEATURE_BTS, c->x86_capability); + set_cpu_cap(c, X86_FEATURE_BTS); if (!(l1 & (1<<12))) - set_bit(X86_FEATURE_PEBS, c->x86_capability); + set_cpu_cap(c, X86_FEATURE_PEBS); } + + if (cpu_has_bts) + ds_init_intel(c); + n = c->extended_cpuid_level; if (n >= 0x80000008) { unsigned eax = cpuid_eax(0x80000008); @@ -811,14 +851,11 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) c->x86_cache_alignment = c->x86_clflush_size * 2; if ((c->x86 == 0xf && c->x86_model >= 0x03) || (c->x86 == 0x6 && c->x86_model 
>= 0x0e)) - set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability); + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); if (c->x86 == 6) - set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability); - if (c->x86 == 15) - set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); - else - clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); - c->x86_max_cores = intel_num_cpu_cores(c); + set_cpu_cap(c, X86_FEATURE_REP_GOOD); + set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); + c->x86_max_cores = intel_num_cpu_cores(c); srat_detect_node(); } @@ -835,18 +872,12 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) c->x86_vendor = X86_VENDOR_UNKNOWN; } -struct cpu_model_info { - int vendor; - int family; - char *model_names[16]; -}; - /* Do some early cpuid on the boot CPU to get some parameter that are needed before check_bugs. Everything advanced is in identify_cpu below. */ -void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) +static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) { - u32 tfms; + u32 tfms, xlvl; c->loops_per_jiffy = loops_per_jiffy; c->x86_cache_size = -1; @@ -857,6 +888,7 @@ void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) c->x86_clflush_size = 64; c->x86_cache_alignment = c->x86_clflush_size; c->x86_max_cores = 1; + c->x86_coreid_bits = 0; c->extended_cpuid_level = 0; memset(&c->x86_capability, 0, sizeof c->x86_capability); @@ -865,7 +897,7 @@ void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) (unsigned int *)&c->x86_vendor_id[0], (unsigned int *)&c->x86_vendor_id[8], (unsigned int *)&c->x86_vendor_id[4]); - + get_cpu_vendor(c); /* Initialize the standard set of capabilities */ @@ -883,7 +915,7 @@ void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) c->x86 += (tfms >> 20) & 0xff; if (c->x86 >= 0x6) c->x86_model += ((tfms >> 16) & 0xF) << 4; - if (c->x86_capability[0] & (1<<19)) + if (c->x86_capability[0] & (1<<19)) c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; } else { /* Have CPUID level 0 only - unheard of */ @@ -893,18 +925,6 @@ void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) #ifdef CONFIG_SMP c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff; #endif -} - -/* - * This does the hard work of actually picking apart the CPU stuff... - */ -void __cpuinit identify_cpu(struct cpuinfo_x86 *c) -{ - int i; - u32 xlvl; - - early_identify_cpu(c); - /* AMD-defined flags: level 0x80000001 */ xlvl = cpuid_eax(0x80000000); c->extended_cpuid_level = xlvl; @@ -925,6 +945,30 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c) c->x86_capability[2] = cpuid_edx(0x80860001); } + c->extended_cpuid_level = cpuid_eax(0x80000000); + if (c->extended_cpuid_level >= 0x80000007) + c->x86_power = cpuid_edx(0x80000007); + + switch (c->x86_vendor) { + case X86_VENDOR_AMD: + early_init_amd(c); + break; + case X86_VENDOR_INTEL: + early_init_intel(c); + break; + } + +} + +/* + * This does the hard work of actually picking apart the CPU stuff... 
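Aside: the family/model decode in early_identify_cpu() above unpacks extended fields from the CPUID(1) EAX signature. A worked example on an invented Family 10h-style signature; the base-field extraction and the family==0xf guard come from the unchanged context of that function, so treat them as assumptions here:

#include <stdio.h>

int main(void)
{
	unsigned int tfms  = 0x00100f22;		/* made-up K10 signature */
	unsigned int fam   = (tfms >> 8) & 0xf;		/* base family  */
	unsigned int model = (tfms >> 4) & 0xf;		/* base model   */

	if (fam == 0xf)
		fam += (tfms >> 20) & 0xff;		/* extended family */
	if (fam >= 0x6)
		model += ((tfms >> 16) & 0xf) << 4;	/* extended model  */

	printf("family 0x%x, model 0x%x, stepping 0x%x\n",
	       fam, model, tfms & 0xf);			/* 0x10, 0x2, 0x2 */
	return 0;
}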
+ */ +void __cpuinit identify_cpu(struct cpuinfo_x86 *c) +{ + int i; + + early_identify_cpu(c); + init_scattered_cpuid_features(c); c->apicid = phys_pkg_id(0); @@ -954,8 +998,7 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c) break; } - select_idle_routine(c); - detect_ht(c); + detect_ht(c); /* * On SMP, boot_cpu_data holds the common feature set between @@ -965,32 +1008,56 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c) */ if (c != &boot_cpu_data) { /* AND the already accumulated flags with these */ - for (i = 0 ; i < NCAPINTS ; i++) + for (i = 0; i < NCAPINTS; i++) boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; } + /* Clear all flags overriden by options */ + for (i = 0; i < NCAPINTS; i++) + c->x86_capability[i] ^= cleared_cpu_caps[i]; + #ifdef CONFIG_X86_MCE mcheck_init(c); #endif + select_idle_routine(c); + if (c != &boot_cpu_data) mtrr_ap_init(); #ifdef CONFIG_NUMA numa_add_cpu(smp_processor_id()); #endif + +} + +static __init int setup_noclflush(char *arg) +{ + setup_clear_cpu_cap(X86_FEATURE_CLFLSH); + return 1; } - +__setup("noclflush", setup_noclflush); void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) { if (c->x86_model_id[0]) - printk("%s", c->x86_model_id); + printk(KERN_INFO "%s", c->x86_model_id); - if (c->x86_mask || c->cpuid_level >= 0) - printk(" stepping %02x\n", c->x86_mask); + if (c->x86_mask || c->cpuid_level >= 0) + printk(KERN_CONT " stepping %02x\n", c->x86_mask); else - printk("\n"); + printk(KERN_CONT "\n"); } +static __init int setup_disablecpuid(char *arg) +{ + int bit; + if (get_option(&arg, &bit) && bit < NCAPINTS*32) + setup_clear_cpu_cap(bit); + else + return 0; + return 1; +} +__setup("clearcpuid=", setup_disablecpuid); + /* * Get CPU information for use by the procfs. */ @@ -998,9 +1065,9 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) static int show_cpuinfo(struct seq_file *m, void *v) { struct cpuinfo_x86 *c = v; - int cpu = 0; + int cpu = 0, i; - /* + /* * These flag bits must match the definitions in <asm/cpufeature.h>. * NULL means this bit is undefined or reserved; either way it doesn't * have meaning as far as Linux is concerned. Note that it's important @@ -1010,10 +1077,10 @@ static int show_cpuinfo(struct seq_file *m, void *v) */ static const char *const x86_cap_flags[] = { /* Intel-defined */ - "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce", - "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov", - "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx", - "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe", + "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce", + "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov", + "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx", + "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe", /* AMD-defined */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, @@ -1080,34 +1147,35 @@ static int show_cpuinfo(struct seq_file *m, void *v) cpu = c->cpu_index; #endif - seq_printf(m,"processor\t: %u\n" - "vendor_id\t: %s\n" - "cpu family\t: %d\n" - "model\t\t: %d\n" - "model name\t: %s\n", - (unsigned)cpu, - c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown", - c->x86, - (int)c->x86_model, - c->x86_model_id[0] ? c->x86_model_id : "unknown"); - + seq_printf(m, "processor\t: %u\n" + "vendor_id\t: %s\n" + "cpu family\t: %d\n" + "model\t\t: %d\n" + "model name\t: %s\n", + (unsigned)cpu, + c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown", + c->x86, + (int)c->x86_model, + c->x86_model_id[0] ? 
c->x86_model_id : "unknown"); + if (c->x86_mask || c->cpuid_level >= 0) seq_printf(m, "stepping\t: %d\n", c->x86_mask); else seq_printf(m, "stepping\t: unknown\n"); - - if (cpu_has(c,X86_FEATURE_TSC)) { + + if (cpu_has(c, X86_FEATURE_TSC)) { unsigned int freq = cpufreq_quick_get((unsigned)cpu); + if (!freq) freq = cpu_khz; seq_printf(m, "cpu MHz\t\t: %u.%03u\n", - freq / 1000, (freq % 1000)); + freq / 1000, (freq % 1000)); } /* Cache size */ - if (c->x86_cache_size >= 0) + if (c->x86_cache_size >= 0) seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size); - + #ifdef CONFIG_SMP if (smp_num_siblings * c->x86_max_cores > 1) { seq_printf(m, "physical id\t: %d\n", c->phys_proc_id); @@ -1116,48 +1184,43 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id); seq_printf(m, "cpu cores\t: %d\n", c->booted_cores); } -#endif +#endif seq_printf(m, - "fpu\t\t: yes\n" - "fpu_exception\t: yes\n" - "cpuid level\t: %d\n" - "wp\t\t: yes\n" - "flags\t\t:", + "fpu\t\t: yes\n" + "fpu_exception\t: yes\n" + "cpuid level\t: %d\n" + "wp\t\t: yes\n" + "flags\t\t:", c->cpuid_level); - { - int i; - for ( i = 0 ; i < 32*NCAPINTS ; i++ ) - if (cpu_has(c, i) && x86_cap_flags[i] != NULL) - seq_printf(m, " %s", x86_cap_flags[i]); - } - + for (i = 0; i < 32*NCAPINTS; i++) + if (cpu_has(c, i) && x86_cap_flags[i] != NULL) + seq_printf(m, " %s", x86_cap_flags[i]); + seq_printf(m, "\nbogomips\t: %lu.%02lu\n", c->loops_per_jiffy/(500000/HZ), (c->loops_per_jiffy/(5000/HZ)) % 100); - if (c->x86_tlbsize > 0) + if (c->x86_tlbsize > 0) seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize); seq_printf(m, "clflush size\t: %d\n", c->x86_clflush_size); seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment); - seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n", + seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n", c->x86_phys_bits, c->x86_virt_bits); seq_printf(m, "power management:"); - { - unsigned i; - for (i = 0; i < 32; i++) - if (c->x86_power & (1 << i)) { - if (i < ARRAY_SIZE(x86_power_flags) && - x86_power_flags[i]) - seq_printf(m, "%s%s", - x86_power_flags[i][0]?" ":"", - x86_power_flags[i]); - else - seq_printf(m, " [%d]", i); - } + for (i = 0; i < 32; i++) { + if (c->x86_power & (1 << i)) { + if (i < ARRAY_SIZE(x86_power_flags) && + x86_power_flags[i]) + seq_printf(m, "%s%s", + x86_power_flags[i][0]?" 
":"", + x86_power_flags[i]); + else + seq_printf(m, " [%d]", i); + } } seq_printf(m, "\n\n"); @@ -1184,8 +1247,8 @@ static void c_stop(struct seq_file *m, void *v) { } -struct seq_operations cpuinfo_op = { - .start =c_start, +const struct seq_operations cpuinfo_op = { + .start = c_start, .next = c_next, .stop = c_stop, .show = show_cpuinfo, diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index 9bdd83022f5f..caee1f002fed 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -23,6 +23,7 @@ #include <asm/ucontext.h> #include <asm/uaccess.h> #include <asm/i387.h> +#include <asm/vdso.h> #include "sigframe_32.h" #define DEBUG_SIG 0 @@ -81,14 +82,14 @@ sys_sigaction(int sig, const struct old_sigaction __user *act, } asmlinkage int -sys_sigaltstack(unsigned long ebx) +sys_sigaltstack(unsigned long bx) { /* This is needed to make gcc realize it doesn't own the "struct pt_regs" */ - struct pt_regs *regs = (struct pt_regs *)&ebx; - const stack_t __user *uss = (const stack_t __user *)ebx; - stack_t __user *uoss = (stack_t __user *)regs->ecx; + struct pt_regs *regs = (struct pt_regs *)&bx; + const stack_t __user *uss = (const stack_t __user *)bx; + stack_t __user *uoss = (stack_t __user *)regs->cx; - return do_sigaltstack(uss, uoss, regs->esp); + return do_sigaltstack(uss, uoss, regs->sp); } @@ -109,12 +110,12 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax #define COPY_SEG(seg) \ { unsigned short tmp; \ err |= __get_user(tmp, &sc->seg); \ - regs->x##seg = tmp; } + regs->seg = tmp; } #define COPY_SEG_STRICT(seg) \ { unsigned short tmp; \ err |= __get_user(tmp, &sc->seg); \ - regs->x##seg = tmp|3; } + regs->seg = tmp|3; } #define GET_SEG(seg) \ { unsigned short tmp; \ @@ -130,22 +131,22 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax COPY_SEG(fs); COPY_SEG(es); COPY_SEG(ds); - COPY(edi); - COPY(esi); - COPY(ebp); - COPY(esp); - COPY(ebx); - COPY(edx); - COPY(ecx); - COPY(eip); + COPY(di); + COPY(si); + COPY(bp); + COPY(sp); + COPY(bx); + COPY(dx); + COPY(cx); + COPY(ip); COPY_SEG_STRICT(cs); COPY_SEG_STRICT(ss); { unsigned int tmpflags; - err |= __get_user(tmpflags, &sc->eflags); - regs->eflags = (regs->eflags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); - regs->orig_eax = -1; /* disable syscall checks */ + err |= __get_user(tmpflags, &sc->flags); + regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); + regs->orig_ax = -1; /* disable syscall checks */ } { @@ -164,7 +165,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax } } - err |= __get_user(*peax, &sc->eax); + err |= __get_user(*peax, &sc->ax); return err; badframe: @@ -174,9 +175,9 @@ badframe: asmlinkage int sys_sigreturn(unsigned long __unused) { struct pt_regs *regs = (struct pt_regs *) &__unused; - struct sigframe __user *frame = (struct sigframe __user *)(regs->esp - 8); + struct sigframe __user *frame = (struct sigframe __user *)(regs->sp - 8); sigset_t set; - int eax; + int ax; if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) goto badframe; @@ -192,17 +193,20 @@ asmlinkage int sys_sigreturn(unsigned long __unused) recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); - if (restore_sigcontext(regs, &frame->sc, &eax)) + if (restore_sigcontext(regs, &frame->sc, &ax)) goto badframe; - return eax; + return ax; badframe: - if (show_unhandled_signals && printk_ratelimit()) - printk("%s%s[%d] bad frame in sigreturn frame:%p eip:%lx" - " esp:%lx oeax:%lx\n", + if 
(show_unhandled_signals && printk_ratelimit()) { + printk("%s%s[%d] bad frame in sigreturn frame:%p ip:%lx" + " sp:%lx oeax:%lx", task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG, - current->comm, task_pid_nr(current), frame, regs->eip, - regs->esp, regs->orig_eax); + current->comm, task_pid_nr(current), frame, regs->ip, + regs->sp, regs->orig_ax); + print_vma_addr(" in ", regs->ip); + printk("\n"); + } force_sig(SIGSEGV, current); return 0; @@ -211,9 +215,9 @@ badframe: asmlinkage int sys_rt_sigreturn(unsigned long __unused) { struct pt_regs *regs = (struct pt_regs *) &__unused; - struct rt_sigframe __user *frame = (struct rt_sigframe __user *)(regs->esp - 4); + struct rt_sigframe __user *frame = (struct rt_sigframe __user *)(regs->sp - 4); sigset_t set; - int eax; + int ax; if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) goto badframe; @@ -226,13 +230,13 @@ asmlinkage int sys_rt_sigreturn(unsigned long __unused) recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); - if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &eax)) + if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) goto badframe; - if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->esp) == -EFAULT) + if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->sp) == -EFAULT) goto badframe; - return eax; + return ax; badframe: force_sig(SIGSEGV, current); @@ -249,27 +253,27 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate, { int tmp, err = 0; - err |= __put_user(regs->xfs, (unsigned int __user *)&sc->fs); + err |= __put_user(regs->fs, (unsigned int __user *)&sc->fs); savesegment(gs, tmp); err |= __put_user(tmp, (unsigned int __user *)&sc->gs); - err |= __put_user(regs->xes, (unsigned int __user *)&sc->es); - err |= __put_user(regs->xds, (unsigned int __user *)&sc->ds); - err |= __put_user(regs->edi, &sc->edi); - err |= __put_user(regs->esi, &sc->esi); - err |= __put_user(regs->ebp, &sc->ebp); - err |= __put_user(regs->esp, &sc->esp); - err |= __put_user(regs->ebx, &sc->ebx); - err |= __put_user(regs->edx, &sc->edx); - err |= __put_user(regs->ecx, &sc->ecx); - err |= __put_user(regs->eax, &sc->eax); + err |= __put_user(regs->es, (unsigned int __user *)&sc->es); + err |= __put_user(regs->ds, (unsigned int __user *)&sc->ds); + err |= __put_user(regs->di, &sc->di); + err |= __put_user(regs->si, &sc->si); + err |= __put_user(regs->bp, &sc->bp); + err |= __put_user(regs->sp, &sc->sp); + err |= __put_user(regs->bx, &sc->bx); + err |= __put_user(regs->dx, &sc->dx); + err |= __put_user(regs->cx, &sc->cx); + err |= __put_user(regs->ax, &sc->ax); err |= __put_user(current->thread.trap_no, &sc->trapno); err |= __put_user(current->thread.error_code, &sc->err); - err |= __put_user(regs->eip, &sc->eip); - err |= __put_user(regs->xcs, (unsigned int __user *)&sc->cs); - err |= __put_user(regs->eflags, &sc->eflags); - err |= __put_user(regs->esp, &sc->esp_at_signal); - err |= __put_user(regs->xss, (unsigned int __user *)&sc->ss); + err |= __put_user(regs->ip, &sc->ip); + err |= __put_user(regs->cs, (unsigned int __user *)&sc->cs); + err |= __put_user(regs->flags, &sc->flags); + err |= __put_user(regs->sp, &sc->sp_at_signal); + err |= __put_user(regs->ss, (unsigned int __user *)&sc->ss); tmp = save_i387(fpstate); if (tmp < 0) @@ -290,29 +294,36 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate, static inline void __user * get_sigframe(struct k_sigaction *ka, struct pt_regs * regs, size_t frame_size) { - unsigned long esp; + unsigned long sp; /* Default to using normal 
stack */ - esp = regs->esp; + sp = regs->sp; + + /* + * If we are on the alternate signal stack and would overflow it, don't. + * Return an always-bogus address instead so we will die with SIGSEGV. + */ + if (on_sig_stack(sp) && !likely(on_sig_stack(sp - frame_size))) + return (void __user *) -1L; /* This is the X/Open sanctioned signal stack switching. */ if (ka->sa.sa_flags & SA_ONSTACK) { - if (sas_ss_flags(esp) == 0) - esp = current->sas_ss_sp + current->sas_ss_size; + if (sas_ss_flags(sp) == 0) + sp = current->sas_ss_sp + current->sas_ss_size; } /* This is the legacy signal stack switching. */ - else if ((regs->xss & 0xffff) != __USER_DS && + else if ((regs->ss & 0xffff) != __USER_DS && !(ka->sa.sa_flags & SA_RESTORER) && ka->sa.sa_restorer) { - esp = (unsigned long) ka->sa.sa_restorer; + sp = (unsigned long) ka->sa.sa_restorer; } - esp -= frame_size; + sp -= frame_size; /* Align the stack pointer according to the i386 ABI, * i.e. so that on function entry ((sp + 4) & 15) == 0. */ - esp = ((esp + 4) & -16ul) - 4; - return (void __user *) esp; + sp = ((sp + 4) & -16ul) - 4; + return (void __user *) sp; } /* These symbols are defined with the addresses in the vsyscall page. @@ -355,9 +366,9 @@ static int setup_frame(int sig, struct k_sigaction *ka, } if (current->binfmt->hasvdso) - restorer = (void *)VDSO_SYM(&__kernel_sigreturn); + restorer = VDSO32_SYMBOL(current->mm->context.vdso, sigreturn); else - restorer = (void *)&frame->retcode; + restorer = &frame->retcode; if (ka->sa.sa_flags & SA_RESTORER) restorer = ka->sa.sa_restorer; @@ -379,16 +390,16 @@ static int setup_frame(int sig, struct k_sigaction *ka, goto give_sigsegv; /* Set up registers for signal handler */ - regs->esp = (unsigned long) frame; - regs->eip = (unsigned long) ka->sa.sa_handler; - regs->eax = (unsigned long) sig; - regs->edx = (unsigned long) 0; - regs->ecx = (unsigned long) 0; + regs->sp = (unsigned long) frame; + regs->ip = (unsigned long) ka->sa.sa_handler; + regs->ax = (unsigned long) sig; + regs->dx = (unsigned long) 0; + regs->cx = (unsigned long) 0; - regs->xds = __USER_DS; - regs->xes = __USER_DS; - regs->xss = __USER_DS; - regs->xcs = __USER_CS; + regs->ds = __USER_DS; + regs->es = __USER_DS; + regs->ss = __USER_DS; + regs->cs = __USER_CS; /* * Clear TF when entering the signal handler, but @@ -396,13 +407,13 @@ static int setup_frame(int sig, struct k_sigaction *ka, * The tracer may want to single-step inside the * handler too. */ - regs->eflags &= ~TF_MASK; + regs->flags &= ~TF_MASK; if (test_thread_flag(TIF_SINGLESTEP)) ptrace_notify(SIGTRAP); #if DEBUG_SIG printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n", - current->comm, current->pid, frame, regs->eip, frame->pretcode); + current->comm, current->pid, frame, regs->ip, frame->pretcode); #endif return 0; @@ -442,7 +453,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, err |= __put_user(0, &frame->uc.uc_flags); err |= __put_user(0, &frame->uc.uc_link); err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); - err |= __put_user(sas_ss_flags(regs->esp), + err |= __put_user(sas_ss_flags(regs->sp), &frame->uc.uc_stack.ss_flags); err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size); err |= setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate, @@ -452,13 +463,13 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, goto give_sigsegv; /* Set up to return from userspace. 
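Aside: get_sigframe() above places the 32-bit signal frame so that (sp + 4) is 16-byte aligned when the handler begins, mimicking the stack state an ordinary CALL would have produced. A quick standalone check of the expression ((sp + 4) & -16ul) - 4 over a few sample stack pointers:

#include <stdio.h>

int main(void)
{
	unsigned long sp;

	for (sp = 0xbffff000UL; sp < 0xbffff010UL; sp += 4) {
		unsigned long aligned = ((sp + 4) & -16UL) - 4;

		printf("sp %#lx -> frame %#lx, (frame+4) %% 16 = %lu\n",
		       sp, aligned, (aligned + 4) % 16);	/* always 0 */
	}
	return 0;
}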
*/ - restorer = (void *)VDSO_SYM(&__kernel_rt_sigreturn); + restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn); if (ka->sa.sa_flags & SA_RESTORER) restorer = ka->sa.sa_restorer; err |= __put_user(restorer, &frame->pretcode); /* - * This is movl $,%eax ; int $0x80 + * This is movl $,%ax ; int $0x80 * * WE DO NOT USE IT ANY MORE! It's only left here for historical * reasons and because gdb uses it as a signature to notice @@ -472,16 +483,16 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, goto give_sigsegv; /* Set up registers for signal handler */ - regs->esp = (unsigned long) frame; - regs->eip = (unsigned long) ka->sa.sa_handler; - regs->eax = (unsigned long) usig; - regs->edx = (unsigned long) &frame->info; - regs->ecx = (unsigned long) &frame->uc; + regs->sp = (unsigned long) frame; + regs->ip = (unsigned long) ka->sa.sa_handler; + regs->ax = (unsigned long) usig; + regs->dx = (unsigned long) &frame->info; + regs->cx = (unsigned long) &frame->uc; - regs->xds = __USER_DS; - regs->xes = __USER_DS; - regs->xss = __USER_DS; - regs->xcs = __USER_CS; + regs->ds = __USER_DS; + regs->es = __USER_DS; + regs->ss = __USER_DS; + regs->cs = __USER_CS; /* * Clear TF when entering the signal handler, but @@ -489,13 +500,13 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, * The tracer may want to single-step inside the * handler too. */ - regs->eflags &= ~TF_MASK; + regs->flags &= ~TF_MASK; if (test_thread_flag(TIF_SINGLESTEP)) ptrace_notify(SIGTRAP); #if DEBUG_SIG printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n", - current->comm, current->pid, frame, regs->eip, frame->pretcode); + current->comm, current->pid, frame, regs->ip, frame->pretcode); #endif return 0; @@ -516,35 +527,33 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, int ret; /* Are we from a system call? */ - if (regs->orig_eax >= 0) { + if (regs->orig_ax >= 0) { /* If so, check system call restarting.. */ - switch (regs->eax) { + switch (regs->ax) { case -ERESTART_RESTARTBLOCK: case -ERESTARTNOHAND: - regs->eax = -EINTR; + regs->ax = -EINTR; break; case -ERESTARTSYS: if (!(ka->sa.sa_flags & SA_RESTART)) { - regs->eax = -EINTR; + regs->ax = -EINTR; break; } /* fallthrough */ case -ERESTARTNOINTR: - regs->eax = regs->orig_eax; - regs->eip -= 2; + regs->ax = regs->orig_ax; + regs->ip -= 2; } } /* - * If TF is set due to a debugger (PT_DTRACE), clear the TF flag so - * that register information in the sigcontext is correct. + * If TF is set due to a debugger (TIF_FORCED_TF), clear the TF + * flag so that register information in the sigcontext is correct. */ - if (unlikely(regs->eflags & TF_MASK) - && likely(current->ptrace & PT_DTRACE)) { - current->ptrace &= ~PT_DTRACE; - regs->eflags &= ~TF_MASK; - } + if (unlikely(regs->flags & X86_EFLAGS_TF) && + likely(test_and_clear_thread_flag(TIF_FORCED_TF))) + regs->flags &= ~X86_EFLAGS_TF; /* Set up the stack frame */ if (ka->sa.sa_flags & SA_SIGINFO) @@ -569,7 +578,7 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, * want to handle. Thus you cannot kill init even with a SIGKILL even by * mistake. */ -static void fastcall do_signal(struct pt_regs *regs) +static void do_signal(struct pt_regs *regs) { siginfo_t info; int signr; @@ -599,8 +608,8 @@ static void fastcall do_signal(struct pt_regs *regs) * have been cleared if the watchpoint triggered * inside the kernel. 
*/ - if (unlikely(current->thread.debugreg[7])) - set_debugreg(current->thread.debugreg[7], 7); + if (unlikely(current->thread.debugreg7)) + set_debugreg(current->thread.debugreg7, 7); /* Whee! Actually deliver the signal. */ if (handle_signal(signr, &info, &ka, oldset, regs) == 0) { @@ -616,19 +625,19 @@ static void fastcall do_signal(struct pt_regs *regs) } /* Did we come from a system call? */ - if (regs->orig_eax >= 0) { + if (regs->orig_ax >= 0) { /* Restart the system call - no handlers present */ - switch (regs->eax) { + switch (regs->ax) { case -ERESTARTNOHAND: case -ERESTARTSYS: case -ERESTARTNOINTR: - regs->eax = regs->orig_eax; - regs->eip -= 2; + regs->ax = regs->orig_ax; + regs->ip -= 2; break; case -ERESTART_RESTARTBLOCK: - regs->eax = __NR_restart_syscall; - regs->eip -= 2; + regs->ax = __NR_restart_syscall; + regs->ip -= 2; break; } } @@ -651,13 +660,16 @@ void do_notify_resume(struct pt_regs *regs, void *_unused, { /* Pending single-step? */ if (thread_info_flags & _TIF_SINGLESTEP) { - regs->eflags |= TF_MASK; + regs->flags |= TF_MASK; clear_thread_flag(TIF_SINGLESTEP); } /* deal with pending signal delivery */ if (thread_info_flags & (_TIF_SIGPENDING | _TIF_RESTORE_SIGMASK)) do_signal(regs); + + if (thread_info_flags & _TIF_HRTICK_RESCHED) + hrtick_resched(); clear_thread_flag(TIF_IRET); } diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index ab086b0357fc..7347bb14e306 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -39,7 +39,7 @@ asmlinkage long sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, struct pt_regs *regs) { - return do_sigaltstack(uss, uoss, regs->rsp); + return do_sigaltstack(uss, uoss, regs->sp); } @@ -64,8 +64,8 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned #define COPY(x) err |= __get_user(regs->x, &sc->x) - COPY(rdi); COPY(rsi); COPY(rbp); COPY(rsp); COPY(rbx); - COPY(rdx); COPY(rcx); COPY(rip); + COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); + COPY(dx); COPY(cx); COPY(ip); COPY(r8); COPY(r9); COPY(r10); @@ -86,9 +86,9 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned { unsigned int tmpflags; - err |= __get_user(tmpflags, &sc->eflags); - regs->eflags = (regs->eflags & ~0x40DD5) | (tmpflags & 0x40DD5); - regs->orig_rax = -1; /* disable syscall checks */ + err |= __get_user(tmpflags, &sc->flags); + regs->flags = (regs->flags & ~0x40DD5) | (tmpflags & 0x40DD5); + regs->orig_ax = -1; /* disable syscall checks */ } { @@ -108,7 +108,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned } } - err |= __get_user(*prax, &sc->rax); + err |= __get_user(*prax, &sc->ax); return err; badframe: @@ -119,9 +119,9 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) { struct rt_sigframe __user *frame; sigset_t set; - unsigned long eax; + unsigned long ax; - frame = (struct rt_sigframe __user *)(regs->rsp - 8); + frame = (struct rt_sigframe __user *)(regs->sp - 8); if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) { goto badframe; } @@ -135,17 +135,17 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); - if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &eax)) + if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) goto badframe; #ifdef DEBUG_SIG - printk("%d sigreturn rip:%lx rsp:%lx frame:%p rax:%lx\n",current->pid,regs->rip,regs->rsp,frame,eax); + printk("%d sigreturn ip:%lx sp:%lx frame:%p 
ax:%lx\n",current->pid,regs->ip,regs->sp,frame,ax); #endif - if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->rsp) == -EFAULT) + if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->sp) == -EFAULT) goto badframe; - return eax; + return ax; badframe: signal_fault(regs,frame,"sigreturn"); @@ -165,14 +165,14 @@ setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned lo err |= __put_user(0, &sc->gs); err |= __put_user(0, &sc->fs); - err |= __put_user(regs->rdi, &sc->rdi); - err |= __put_user(regs->rsi, &sc->rsi); - err |= __put_user(regs->rbp, &sc->rbp); - err |= __put_user(regs->rsp, &sc->rsp); - err |= __put_user(regs->rbx, &sc->rbx); - err |= __put_user(regs->rdx, &sc->rdx); - err |= __put_user(regs->rcx, &sc->rcx); - err |= __put_user(regs->rax, &sc->rax); + err |= __put_user(regs->di, &sc->di); + err |= __put_user(regs->si, &sc->si); + err |= __put_user(regs->bp, &sc->bp); + err |= __put_user(regs->sp, &sc->sp); + err |= __put_user(regs->bx, &sc->bx); + err |= __put_user(regs->dx, &sc->dx); + err |= __put_user(regs->cx, &sc->cx); + err |= __put_user(regs->ax, &sc->ax); err |= __put_user(regs->r8, &sc->r8); err |= __put_user(regs->r9, &sc->r9); err |= __put_user(regs->r10, &sc->r10); @@ -183,8 +183,8 @@ setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned lo err |= __put_user(regs->r15, &sc->r15); err |= __put_user(me->thread.trap_no, &sc->trapno); err |= __put_user(me->thread.error_code, &sc->err); - err |= __put_user(regs->rip, &sc->rip); - err |= __put_user(regs->eflags, &sc->eflags); + err |= __put_user(regs->ip, &sc->ip); + err |= __put_user(regs->flags, &sc->flags); err |= __put_user(mask, &sc->oldmask); err |= __put_user(me->thread.cr2, &sc->cr2); @@ -198,18 +198,18 @@ setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned lo static void __user * get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size) { - unsigned long rsp; + unsigned long sp; /* Default to using normal stack - redzone*/ - rsp = regs->rsp - 128; + sp = regs->sp - 128; /* This is the X/Open sanctioned signal stack switching. 
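/*
 * Illustrative sketch, not from this patch: the frame placement arithmetic
 * in get_stack() here -- drop below the 128-byte x86-64 red zone, then
 * round the frame start down to a 16-byte boundary -- as a self-contained
 * user-space program. place_frame() is a made-up helper and the sp/size
 * values are invented for the example.
 */
#include <stdio.h>

static unsigned long place_frame(unsigned long sp, unsigned long size)
{
	sp -= 128;			/* skip the ABI red zone below sp */
	return (sp - size) & ~15UL;	/* round_down(sp - size, 16) */
}

int main(void)
{
	printf("frame at %#lx\n", place_frame(0x7fffffffe000UL, 440));
	return 0;
}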
*/ if (ka->sa.sa_flags & SA_ONSTACK) { - if (sas_ss_flags(rsp) == 0) - rsp = current->sas_ss_sp + current->sas_ss_size; + if (sas_ss_flags(sp) == 0) + sp = current->sas_ss_sp + current->sas_ss_size; } - return (void __user *)round_down(rsp - size, 16); + return (void __user *)round_down(sp - size, 16); } static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, @@ -246,7 +246,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, err |= __put_user(0, &frame->uc.uc_flags); err |= __put_user(0, &frame->uc.uc_link); err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp); - err |= __put_user(sas_ss_flags(regs->rsp), + err |= __put_user(sas_ss_flags(regs->sp), &frame->uc.uc_stack.ss_flags); err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size); err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me); @@ -271,21 +271,21 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, goto give_sigsegv; #ifdef DEBUG_SIG - printk("%d old rip %lx old rsp %lx old rax %lx\n", current->pid,regs->rip,regs->rsp,regs->rax); + printk("%d old ip %lx old sp %lx old ax %lx\n", current->pid,regs->ip,regs->sp,regs->ax); #endif /* Set up registers for signal handler */ - regs->rdi = sig; + regs->di = sig; /* In case the signal handler was declared without prototypes */ - regs->rax = 0; + regs->ax = 0; /* This also works for non SA_SIGINFO handlers because they expect the next argument after the signal number on the stack. */ - regs->rsi = (unsigned long)&frame->info; - regs->rdx = (unsigned long)&frame->uc; - regs->rip = (unsigned long) ka->sa.sa_handler; + regs->si = (unsigned long)&frame->info; + regs->dx = (unsigned long)&frame->uc; + regs->ip = (unsigned long) ka->sa.sa_handler; - regs->rsp = (unsigned long)frame; + regs->sp = (unsigned long)frame; /* Set up the CS register to run signal handlers in 64-bit mode, even if the handler happens to be interrupting 32-bit code. */ @@ -295,12 +295,12 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, see include/asm-x86_64/uaccess.h for details. */ set_fs(USER_DS); - regs->eflags &= ~TF_MASK; + regs->flags &= ~X86_EFLAGS_TF; if (test_thread_flag(TIF_SINGLESTEP)) ptrace_notify(SIGTRAP); #ifdef DEBUG_SIG printk("SIG deliver (%s:%d): sp=%p pc=%lx ra=%p\n", - current->comm, current->pid, frame, regs->rip, frame->pretcode); + current->comm, current->pid, frame, regs->ip, frame->pretcode); #endif return 0; @@ -321,44 +321,40 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, int ret; #ifdef DEBUG_SIG - printk("handle_signal pid:%d sig:%lu rip:%lx rsp:%lx regs=%p\n", + printk("handle_signal pid:%d sig:%lu ip:%lx sp:%lx regs=%p\n", current->pid, sig, - regs->rip, regs->rsp, regs); + regs->ip, regs->sp, regs); #endif /* Are we from a system call? */ - if ((long)regs->orig_rax >= 0) { + if ((long)regs->orig_ax >= 0) { /* If so, check system call restarting.. */ - switch (regs->rax) { + switch (regs->ax) { case -ERESTART_RESTARTBLOCK: case -ERESTARTNOHAND: - regs->rax = -EINTR; + regs->ax = -EINTR; break; case -ERESTARTSYS: if (!(ka->sa.sa_flags & SA_RESTART)) { - regs->rax = -EINTR; + regs->ax = -EINTR; break; } /* fallthrough */ case -ERESTARTNOINTR: - regs->rax = regs->orig_rax; - regs->rip -= 2; + regs->ax = regs->orig_ax; + regs->ip -= 2; break; } } /* - * If TF is set due to a debugger (PT_DTRACE), clear the TF - * flag so that register information in the sigcontext is - * correct. 
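/*
 * Illustrative sketch, not from this patch: the restart cases above back
 * the user instruction pointer up by two bytes because the system call
 * instructions being re-executed -- "int $0x80" (0xcd 0x80) and "syscall"
 * (0x0f 0x05) -- are both two-byte opcodes, so ip -= 2 lands back on the
 * trapping instruction. A trivial user-space check of that assumption:
 */
#include <stdio.h>

int main(void)
{
	const unsigned char int80[] = { 0xcd, 0x80 };	/* int $0x80 */
	const unsigned char sysc[]  = { 0x0f, 0x05 };	/* syscall   */

	printf("int $0x80: %zu bytes, syscall: %zu bytes\n",
	       sizeof(int80), sizeof(sysc));
	return 0;
}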
+ * If TF is set due to a debugger (TIF_FORCED_TF), clear the TF + * flag so that register information in the sigcontext is correct. */ - if (unlikely(regs->eflags & TF_MASK)) { - if (likely(current->ptrace & PT_DTRACE)) { - current->ptrace &= ~PT_DTRACE; - regs->eflags &= ~TF_MASK; - } - } + if (unlikely(regs->flags & X86_EFLAGS_TF) && + likely(test_and_clear_thread_flag(TIF_FORCED_TF))) + regs->flags &= ~X86_EFLAGS_TF; #ifdef CONFIG_IA32_EMULATION if (test_thread_flag(TIF_IA32)) { @@ -430,21 +426,21 @@ static void do_signal(struct pt_regs *regs) } /* Did we come from a system call? */ - if ((long)regs->orig_rax >= 0) { + if ((long)regs->orig_ax >= 0) { /* Restart the system call - no handlers present */ - long res = regs->rax; + long res = regs->ax; switch (res) { case -ERESTARTNOHAND: case -ERESTARTSYS: case -ERESTARTNOINTR: - regs->rax = regs->orig_rax; - regs->rip -= 2; + regs->ax = regs->orig_ax; + regs->ip -= 2; break; case -ERESTART_RESTARTBLOCK: - regs->rax = test_thread_flag(TIF_IA32) ? + regs->ax = test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall : __NR_restart_syscall; - regs->rip -= 2; + regs->ip -= 2; break; } } @@ -461,13 +457,13 @@ void do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) { #ifdef DEBUG_SIG - printk("do_notify_resume flags:%x rip:%lx rsp:%lx caller:%p pending:%x\n", - thread_info_flags, regs->rip, regs->rsp, __builtin_return_address(0),signal_pending(current)); + printk("do_notify_resume flags:%x ip:%lx sp:%lx caller:%p pending:%x\n", + thread_info_flags, regs->ip, regs->sp, __builtin_return_address(0),signal_pending(current)); #endif /* Pending single-step? */ if (thread_info_flags & _TIF_SINGLESTEP) { - regs->eflags |= TF_MASK; + regs->flags |= X86_EFLAGS_TF; clear_thread_flag(TIF_SINGLESTEP); } @@ -480,14 +476,20 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) /* deal with pending signal delivery */ if (thread_info_flags & (_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK)) do_signal(regs); + + if (thread_info_flags & _TIF_HRTICK_RESCHED) + hrtick_resched(); } void signal_fault(struct pt_regs *regs, void __user *frame, char *where) { struct task_struct *me = current; - if (show_unhandled_signals && printk_ratelimit()) - printk("%s[%d] bad frame in %s frame:%p rip:%lx rsp:%lx orax:%lx\n", - me->comm,me->pid,where,frame,regs->rip,regs->rsp,regs->orig_rax); + if (show_unhandled_signals && printk_ratelimit()) { + printk("%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx", + me->comm,me->pid,where,frame,regs->ip,regs->sp,regs->orig_ax); + print_vma_addr(" in ", regs->ip); + printk("\n"); + } force_sig(SIGSEGV, me); } diff --git a/arch/x86/kernel/smp_32.c b/arch/x86/kernel/smp_32.c index fcaa026eb807..dc0cde9d16fb 100644 --- a/arch/x86/kernel/smp_32.c +++ b/arch/x86/kernel/smp_32.c @@ -159,7 +159,7 @@ void __send_IPI_shortcut(unsigned int shortcut, int vector) apic_write_around(APIC_ICR, cfg); } -void fastcall send_IPI_self(int vector) +void send_IPI_self(int vector) { __send_IPI_shortcut(APIC_DEST_SELF, vector); } @@ -223,7 +223,7 @@ void send_IPI_mask_sequence(cpumask_t mask, int vector) */ local_irq_save(flags); - for (query_cpu = 0; query_cpu < NR_CPUS; ++query_cpu) { + for_each_possible_cpu(query_cpu) { if (cpu_isset(query_cpu, mask)) { __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu), vector); @@ -256,13 +256,14 @@ static DEFINE_SPINLOCK(tlbstate_lock); * We need to reload %cr3 since the page tables may be going * away from under us.. 
*/ -void leave_mm(unsigned long cpu) +void leave_mm(int cpu) { if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) BUG(); cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask); load_cr3(swapper_pg_dir); } +EXPORT_SYMBOL_GPL(leave_mm); /* * @@ -310,7 +311,7 @@ void leave_mm(unsigned long cpu) * 2) Leave the mm if we are in the lazy tlb mode. */ -fastcall void smp_invalidate_interrupt(struct pt_regs *regs) +void smp_invalidate_interrupt(struct pt_regs *regs) { unsigned long cpu; @@ -638,13 +639,13 @@ static void native_smp_send_stop(void) * all the work is done automatically when * we return from the interrupt. */ -fastcall void smp_reschedule_interrupt(struct pt_regs *regs) +void smp_reschedule_interrupt(struct pt_regs *regs) { ack_APIC_irq(); __get_cpu_var(irq_stat).irq_resched_count++; } -fastcall void smp_call_function_interrupt(struct pt_regs *regs) +void smp_call_function_interrupt(struct pt_regs *regs) { void (*func) (void *info) = call_data->func; void *info = call_data->info; @@ -675,7 +676,7 @@ static int convert_apicid_to_cpu(int apic_id) { int i; - for (i = 0; i < NR_CPUS; i++) { + for_each_possible_cpu(i) { if (per_cpu(x86_cpu_to_apicid, i) == apic_id) return i; } diff --git a/arch/x86/kernel/smp_64.c b/arch/x86/kernel/smp_64.c index 03fa6ed559c6..2fd74b06db67 100644 --- a/arch/x86/kernel/smp_64.c +++ b/arch/x86/kernel/smp_64.c @@ -29,7 +29,7 @@ #include <asm/idle.h> /* - * Smarter SMP flushing macros. + * Smarter SMP flushing macros. * c/o Linus Torvalds. * * These mean you can really definitely utterly forget about @@ -37,15 +37,15 @@ * * Optimizations Manfred Spraul <manfred@colorfullife.com> * - * More scalable flush, from Andi Kleen + * More scalable flush, from Andi Kleen * - * To avoid global state use 8 different call vectors. - * Each CPU uses a specific vector to trigger flushes on other - * CPUs. Depending on the received vector the target CPUs look into + * To avoid global state use 8 different call vectors. + * Each CPU uses a specific vector to trigger flushes on other + * CPUs. Depending on the received vector the target CPUs look into * the right per cpu variable for the flush data. * - * With more than 8 CPUs they are hashed to the 8 available - * vectors. The limited global vector space forces us to this right now. + * With more than 8 CPUs they are hashed to the 8 available + * vectors. The limited global vector space forces us to this right now. * In future when interrupts are split into per CPU domains this could be * fixed, at the cost of triggering multiple IPIs in some cases. */ @@ -55,7 +55,6 @@ union smp_flush_state { cpumask_t flush_cpumask; struct mm_struct *flush_mm; unsigned long flush_va; -#define FLUSH_ALL -1ULL spinlock_t tlbstate_lock; }; char pad[SMP_CACHE_BYTES]; @@ -67,16 +66,17 @@ union smp_flush_state { static DEFINE_PER_CPU(union smp_flush_state, flush_state); /* - * We cannot call mmdrop() because we are in interrupt context, + * We cannot call mmdrop() because we are in interrupt context, * instead update mm->cpu_vm_mask. */ -static inline void leave_mm(int cpu) +void leave_mm(int cpu) { if (read_pda(mmu_state) == TLBSTATE_OK) BUG(); cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask); load_cr3(swapper_pg_dir); } +EXPORT_SYMBOL_GPL(leave_mm); /* * @@ -85,25 +85,25 @@ static inline void leave_mm(int cpu) * 1) switch_mm() either 1a) or 1b) * 1a) thread switch to a different mm * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask); - * Stop ipi delivery for the old mm. 
This is not synchronized with - * the other cpus, but smp_invalidate_interrupt ignore flush ipis - * for the wrong mm, and in the worst case we perform a superfluous - * tlb flush. + * Stop ipi delivery for the old mm. This is not synchronized with + * the other cpus, but smp_invalidate_interrupt ignore flush ipis + * for the wrong mm, and in the worst case we perform a superfluous + * tlb flush. * 1a2) set cpu mmu_state to TLBSTATE_OK - * Now the smp_invalidate_interrupt won't call leave_mm if cpu0 + * Now the smp_invalidate_interrupt won't call leave_mm if cpu0 * was in lazy tlb mode. * 1a3) update cpu active_mm - * Now cpu0 accepts tlb flushes for the new mm. + * Now cpu0 accepts tlb flushes for the new mm. * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask); - * Now the other cpus will send tlb flush ipis. + * Now the other cpus will send tlb flush ipis. * 1a4) change cr3. * 1b) thread switch without mm change * cpu active_mm is correct, cpu0 already handles * flush ipis. * 1b1) set cpu mmu_state to TLBSTATE_OK * 1b2) test_and_set the cpu bit in cpu_vm_mask. - * Atomically set the bit [other cpus will start sending flush ipis], - * and test the bit. + * Atomically set the bit [other cpus will start sending flush ipis], + * and test the bit. * 1b3) if the bit was 0: leave_mm was called, flush the tlb. * 2) switch %%esp, ie current * @@ -137,12 +137,12 @@ asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs) * orig_rax contains the negated interrupt vector. * Use that to determine where the sender put the data. */ - sender = ~regs->orig_rax - INVALIDATE_TLB_VECTOR_START; + sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START; f = &per_cpu(flush_state, sender); if (!cpu_isset(cpu, f->flush_cpumask)) goto out; - /* + /* * This was a BUG() but until someone can quote me the * line from the intel manual that guarantees an IPI to * multiple CPUs is retried _only_ on the erroring CPUs @@ -150,10 +150,10 @@ asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs) * * BUG(); */ - + if (f->flush_mm == read_pda(active_mm)) { if (read_pda(mmu_state) == TLBSTATE_OK) { - if (f->flush_va == FLUSH_ALL) + if (f->flush_va == TLB_FLUSH_ALL) local_flush_tlb(); else __flush_tlb_one(f->flush_va); @@ -166,19 +166,22 @@ out: add_pda(irq_tlb_count, 1); } -static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, - unsigned long va) +void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm, + unsigned long va) { int sender; union smp_flush_state *f; + cpumask_t cpumask = *cpumaskp; /* Caller has disabled preemption */ sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS; f = &per_cpu(flush_state, sender); - /* Could avoid this lock when - num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is - probably not worth checking this for a cache-hot lock. */ + /* + * Could avoid this lock when + * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is + * probably not worth checking this for a cache-hot lock. 
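/*
 * Illustrative sketch, not from this patch: how senders map onto the eight
 * invalidate vectors described in the comment above. Each slot carries its
 * own flush_state and spinlock, so the lock taken just below is only
 * contended by CPUs that hash to the same slot; with no more CPUs than
 * vectors every sender effectively owns a private slot. The CPU count here
 * is a made-up example.
 */
#include <stdio.h>

#define NUM_INVALIDATE_TLB_VECTORS 8	/* the "8 call vectors" noted above */

int main(void)
{
	int cpu, ncpus = 12;

	for (cpu = 0; cpu < ncpus; cpu++)
		printf("cpu %2d uses flush slot %d\n",
		       cpu, cpu % NUM_INVALIDATE_TLB_VECTORS);
	return 0;
}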
+ */ spin_lock(&f->tlbstate_lock); f->flush_mm = mm; @@ -202,14 +205,14 @@ static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, int __cpuinit init_smp_flush(void) { int i; + for_each_cpu_mask(i, cpu_possible_map) { spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock); } return 0; } - core_initcall(init_smp_flush); - + void flush_tlb_current_task(void) { struct mm_struct *mm = current->mm; @@ -221,10 +224,9 @@ void flush_tlb_current_task(void) local_flush_tlb(); if (!cpus_empty(cpu_mask)) - flush_tlb_others(cpu_mask, mm, FLUSH_ALL); + flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL); preempt_enable(); } -EXPORT_SYMBOL(flush_tlb_current_task); void flush_tlb_mm (struct mm_struct * mm) { @@ -241,11 +243,10 @@ void flush_tlb_mm (struct mm_struct * mm) leave_mm(smp_processor_id()); } if (!cpus_empty(cpu_mask)) - flush_tlb_others(cpu_mask, mm, FLUSH_ALL); + flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL); preempt_enable(); } -EXPORT_SYMBOL(flush_tlb_mm); void flush_tlb_page(struct vm_area_struct * vma, unsigned long va) { @@ -259,8 +260,8 @@ void flush_tlb_page(struct vm_area_struct * vma, unsigned long va) if (current->active_mm == mm) { if(current->mm) __flush_tlb_one(va); - else - leave_mm(smp_processor_id()); + else + leave_mm(smp_processor_id()); } if (!cpus_empty(cpu_mask)) @@ -268,7 +269,6 @@ void flush_tlb_page(struct vm_area_struct * vma, unsigned long va) preempt_enable(); } -EXPORT_SYMBOL(flush_tlb_page); static void do_flush_tlb_all(void* info) { @@ -325,11 +325,9 @@ void unlock_ipi_call_lock(void) * this function sends a 'generic call function' IPI to all other CPU * of the system defined in the mask. */ - -static int -__smp_call_function_mask(cpumask_t mask, - void (*func)(void *), void *info, - int wait) +static int __smp_call_function_mask(cpumask_t mask, + void (*func)(void *), void *info, + int wait) { struct call_data_struct data; cpumask_t allbutself; @@ -417,11 +415,10 @@ EXPORT_SYMBOL(smp_call_function_mask); */ int smp_call_function_single (int cpu, void (*func) (void *info), void *info, - int nonatomic, int wait) + int nonatomic, int wait) { /* prevent preemption and reschedule on another processor */ - int ret; - int me = get_cpu(); + int ret, me = get_cpu(); /* Can deadlock when called with interrupts disabled */ WARN_ON(irqs_disabled()); @@ -471,9 +468,9 @@ static void stop_this_cpu(void *dummy) */ cpu_clear(smp_processor_id(), cpu_online_map); disable_local_APIC(); - for (;;) + for (;;) halt(); -} +} void smp_send_stop(void) { diff --git a/arch/x86/kernel/smpboot_32.c b/arch/x86/kernel/smpboot_32.c index 4ea80cbe52e5..5787a0c3e296 100644 --- a/arch/x86/kernel/smpboot_32.c +++ b/arch/x86/kernel/smpboot_32.c @@ -83,7 +83,6 @@ EXPORT_SYMBOL(cpu_online_map); cpumask_t cpu_callin_map; cpumask_t cpu_callout_map; -EXPORT_SYMBOL(cpu_callout_map); cpumask_t cpu_possible_map; EXPORT_SYMBOL(cpu_possible_map); static cpumask_t smp_commenced_mask; @@ -92,15 +91,10 @@ static cpumask_t smp_commenced_mask; DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info); EXPORT_PER_CPU_SYMBOL(cpu_info); -/* - * The following static array is used during kernel startup - * and the x86_cpu_to_apicid_ptr contains the address of the - * array during this time. Is it zeroed when the per_cpu - * data area is removed. - */ +/* which logical CPU number maps to which CPU (physical APIC ID) */ u8 x86_cpu_to_apicid_init[NR_CPUS] __initdata = { [0 ... 
NR_CPUS-1] = BAD_APICID }; -void *x86_cpu_to_apicid_ptr; +void *x86_cpu_to_apicid_early_ptr; DEFINE_PER_CPU(u8, x86_cpu_to_apicid) = BAD_APICID; EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid); @@ -113,7 +107,6 @@ u8 apicid_2_node[MAX_APICID]; extern const unsigned char trampoline_data []; extern const unsigned char trampoline_end []; static unsigned char *trampoline_base; -static int trampoline_exec; static void map_cpu_to_logical_apicid(void); @@ -138,17 +131,13 @@ static unsigned long __cpuinit setup_trampoline(void) */ void __init smp_alloc_memory(void) { - trampoline_base = (void *) alloc_bootmem_low_pages(PAGE_SIZE); + trampoline_base = alloc_bootmem_low_pages(PAGE_SIZE); /* * Has to be in very low memory so we can execute * real-mode AP code. */ if (__pa(trampoline_base) >= 0x9F000) BUG(); - /* - * Make the SMP trampoline executable: - */ - trampoline_exec = set_kernel_exec((unsigned long)trampoline_base, 1); } /* @@ -405,7 +394,7 @@ static void __cpuinit start_secondary(void *unused) setup_secondary_clock(); if (nmi_watchdog == NMI_IO_APIC) { disable_8259A_irq(0); - enable_NMI_through_LVT0(NULL); + enable_NMI_through_LVT0(); enable_8259A_irq(0); } /* @@ -448,38 +437,38 @@ void __devinit initialize_secondary(void) { /* * We don't actually need to load the full TSS, - * basically just the stack pointer and the eip. + * basically just the stack pointer and the ip. */ asm volatile( "movl %0,%%esp\n\t" "jmp *%1" : - :"m" (current->thread.esp),"m" (current->thread.eip)); + :"m" (current->thread.sp),"m" (current->thread.ip)); } /* Static state in head.S used to set up a CPU */ extern struct { - void * esp; + void * sp; unsigned short ss; } stack_start; #ifdef CONFIG_NUMA /* which logical CPUs are on which nodes */ -cpumask_t node_2_cpu_mask[MAX_NUMNODES] __read_mostly = +cpumask_t node_to_cpumask_map[MAX_NUMNODES] __read_mostly = { [0 ... MAX_NUMNODES-1] = CPU_MASK_NONE }; -EXPORT_SYMBOL(node_2_cpu_mask); +EXPORT_SYMBOL(node_to_cpumask_map); /* which node each logical CPU is on */ -int cpu_2_node[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0 }; -EXPORT_SYMBOL(cpu_2_node); +int cpu_to_node_map[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0 }; +EXPORT_SYMBOL(cpu_to_node_map); /* set up a mapping between cpu and node. */ static inline void map_cpu_to_node(int cpu, int node) { printk("Mapping cpu %d to node %d\n", cpu, node); - cpu_set(cpu, node_2_cpu_mask[node]); - cpu_2_node[cpu] = node; + cpu_set(cpu, node_to_cpumask_map[node]); + cpu_to_node_map[cpu] = node; } /* undo a mapping between cpu and node. */ @@ -489,8 +478,8 @@ static inline void unmap_cpu_to_node(int cpu) printk("Unmapping cpu %d from all nodes\n", cpu); for (node = 0; node < MAX_NUMNODES; node ++) - cpu_clear(cpu, node_2_cpu_mask[node]); - cpu_2_node[cpu] = 0; + cpu_clear(cpu, node_to_cpumask_map[node]); + cpu_to_node_map[cpu] = 0; } #else /* !CONFIG_NUMA */ @@ -668,7 +657,7 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) * target processor state. */ startup_ipi_hook(phys_apicid, (unsigned long) start_secondary, - (unsigned long) stack_start.esp); + (unsigned long) stack_start.sp); /* * Run STARTUP IPI loop. @@ -754,7 +743,7 @@ static inline struct task_struct * __cpuinit alloc_idle_task(int cpu) /* initialize thread_struct. 
we really want to avoid destroy * idle tread */ - idle->thread.esp = (unsigned long)task_pt_regs(idle); + idle->thread.sp = (unsigned long)task_pt_regs(idle); init_idle(idle, cpu); return idle; } @@ -799,7 +788,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu) per_cpu(current_task, cpu) = idle; early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); - idle->thread.eip = (unsigned long) start_secondary; + idle->thread.ip = (unsigned long) start_secondary; /* start_eip had better be page-aligned! */ start_eip = setup_trampoline(); @@ -807,9 +796,9 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu) alternatives_smp_switch(1); /* So we see what's up */ - printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip); + printk("Booting processor %d/%d ip %lx\n", cpu, apicid, start_eip); /* Stack for startup_32 can be just as for start_secondary onwards */ - stack_start.esp = (void *) idle->thread.esp; + stack_start.sp = (void *) idle->thread.sp; irq_ctx_init(cpu); @@ -1091,7 +1080,7 @@ static void __init smp_boot_cpus(unsigned int max_cpus) * Allow the user to impress friends. */ Dprintk("Before bogomips.\n"); - for (cpu = 0; cpu < NR_CPUS; cpu++) + for_each_possible_cpu(cpu) if (cpu_isset(cpu, cpu_callout_map)) bogosum += cpu_data(cpu).loops_per_jiffy; printk(KERN_INFO @@ -1122,7 +1111,7 @@ static void __init smp_boot_cpus(unsigned int max_cpus) * construct cpu_sibling_map, so that we can tell sibling CPUs * efficiently. */ - for (cpu = 0; cpu < NR_CPUS; cpu++) { + for_each_possible_cpu(cpu) { cpus_clear(per_cpu(cpu_sibling_map, cpu)); cpus_clear(per_cpu(cpu_core_map, cpu)); } @@ -1296,12 +1285,6 @@ void __init native_smp_cpus_done(unsigned int max_cpus) setup_ioapic_dest(); #endif zap_low_mappings(); -#ifndef CONFIG_HOTPLUG_CPU - /* - * Disable executability of the SMP trampoline: - */ - set_kernel_exec((unsigned long)trampoline_base, trampoline_exec); -#endif } void __init smp_intr_init(void) diff --git a/arch/x86/kernel/smpboot_64.c b/arch/x86/kernel/smpboot_64.c index aaf4e1291217..cc64b8085c2a 100644 --- a/arch/x86/kernel/smpboot_64.c +++ b/arch/x86/kernel/smpboot_64.c @@ -65,7 +65,7 @@ int smp_num_siblings = 1; EXPORT_SYMBOL(smp_num_siblings); /* Last level cache ID of each logical CPU */ -DEFINE_PER_CPU(u8, cpu_llc_id) = BAD_APICID; +DEFINE_PER_CPU(u16, cpu_llc_id) = BAD_APICID; /* Bitmask of currently online CPUs */ cpumask_t cpu_online_map __read_mostly; @@ -78,8 +78,6 @@ EXPORT_SYMBOL(cpu_online_map); */ cpumask_t cpu_callin_map; cpumask_t cpu_callout_map; -EXPORT_SYMBOL(cpu_callout_map); - cpumask_t cpu_possible_map; EXPORT_SYMBOL(cpu_possible_map); @@ -113,10 +111,20 @@ DEFINE_PER_CPU(int, cpu_state) = { 0 }; * a new thread. Also avoids complicated thread destroy functionality * for idle threads. */ +#ifdef CONFIG_HOTPLUG_CPU +/* + * Needed only for CONFIG_HOTPLUG_CPU because __cpuinitdata is + * removed after init for !CONFIG_HOTPLUG_CPU. + */ +static DEFINE_PER_CPU(struct task_struct *, idle_thread_array); +#define get_idle_for_cpu(x) (per_cpu(idle_thread_array, x)) +#define set_idle_for_cpu(x,p) (per_cpu(idle_thread_array, x) = (p)) +#else struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ; - #define get_idle_for_cpu(x) (idle_thread_array[(x)]) #define set_idle_for_cpu(x,p) (idle_thread_array[(x)] = (p)) +#endif + /* * Currently trivial. Write the real->protected mode @@ -212,6 +220,7 @@ void __cpuinit smp_callin(void) Dprintk("CALLIN, before setup_local_APIC().\n"); setup_local_APIC(); + end_local_APIC_setup(); /* * Get our bogomips. 
@@ -338,7 +347,7 @@ void __cpuinit start_secondary(void) if (nmi_watchdog == NMI_IO_APIC) { disable_8259A_irq(0); - enable_NMI_through_LVT0(NULL); + enable_NMI_through_LVT0(); enable_8259A_irq(0); } @@ -370,7 +379,7 @@ void __cpuinit start_secondary(void) unlock_ipi_call_lock(); - setup_secondary_APIC_clock(); + setup_secondary_clock(); cpu_idle(); } @@ -384,19 +393,20 @@ static void inquire_remote_apic(int apicid) unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 }; char *names[] = { "ID", "VERSION", "SPIV" }; int timeout; - unsigned int status; + u32 status; printk(KERN_INFO "Inquiring remote APIC #%d...\n", apicid); for (i = 0; i < ARRAY_SIZE(regs); i++) { - printk("... APIC #%d %s: ", apicid, names[i]); + printk(KERN_INFO "... APIC #%d %s: ", apicid, names[i]); /* * Wait for idle. */ status = safe_apic_wait_icr_idle(); if (status) - printk("a previous APIC delivery may have failed\n"); + printk(KERN_CONT + "a previous APIC delivery may have failed\n"); apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(apicid)); apic_write(APIC_ICR, APIC_DM_REMRD | regs[i]); @@ -410,10 +420,10 @@ static void inquire_remote_apic(int apicid) switch (status) { case APIC_ICR_RR_VALID: status = apic_read(APIC_RRR); - printk("%08x\n", status); + printk(KERN_CONT "%08x\n", status); break; default: - printk("failed\n"); + printk(KERN_CONT "failed\n"); } } } @@ -466,7 +476,7 @@ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int sta */ Dprintk("#startup loops: %d.\n", num_starts); - maxlvt = get_maxlvt(); + maxlvt = lapic_get_maxlvt(); for (j = 1; j <= num_starts; j++) { Dprintk("Sending STARTUP #%d.\n",j); @@ -577,7 +587,7 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid) c_idle.idle = get_idle_for_cpu(cpu); if (c_idle.idle) { - c_idle.idle->thread.rsp = (unsigned long) (((struct pt_regs *) + c_idle.idle->thread.sp = (unsigned long) (((struct pt_regs *) (THREAD_SIZE + task_stack_page(c_idle.idle))) - 1); init_idle(c_idle.idle, cpu); goto do_rest; @@ -613,8 +623,8 @@ do_rest: start_rip = setup_trampoline(); - init_rsp = c_idle.idle->thread.rsp; - per_cpu(init_tss,cpu).rsp0 = init_rsp; + init_rsp = c_idle.idle->thread.sp; + load_sp0(&per_cpu(init_tss, cpu), &c_idle.idle->thread); initial_code = start_secondary; clear_tsk_thread_flag(c_idle.idle, TIF_FORK); @@ -691,7 +701,7 @@ do_rest: } if (boot_error) { cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */ - clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */ + clear_bit(cpu, (unsigned long *)&cpu_initialized); /* was set by cpu_init() */ clear_node_cpumask(cpu); /* was set by numa_add_cpu */ cpu_clear(cpu, cpu_present_map); cpu_clear(cpu, cpu_possible_map); @@ -841,24 +851,16 @@ static int __init smp_sanity_check(unsigned max_cpus) return 0; } -/* - * Copy apicid's found by MP_processor_info from initial array to the per cpu - * data area. The x86_cpu_to_apicid_init array is then expendable and the - * x86_cpu_to_apicid_ptr is zeroed indicating that the static array is no - * longer available. 
- */ -void __init smp_set_apicids(void) +static void __init smp_cpu_index_default(void) { - int cpu; + int i; + struct cpuinfo_x86 *c; - for_each_cpu_mask(cpu, cpu_possible_map) { - if (per_cpu_offset(cpu)) - per_cpu(x86_cpu_to_apicid, cpu) = - x86_cpu_to_apicid_init[cpu]; + for_each_cpu_mask(i, cpu_possible_map) { + c = &cpu_data(i); + /* mark all to hotplug */ + c->cpu_index = NR_CPUS; } - - /* indicate the static array will be going away soon */ - x86_cpu_to_apicid_ptr = NULL; } /* @@ -868,9 +870,9 @@ void __init smp_set_apicids(void) void __init smp_prepare_cpus(unsigned int max_cpus) { nmi_watchdog_default(); + smp_cpu_index_default(); current_cpu_data = boot_cpu_data; current_thread_info()->cpu = 0; /* needed? */ - smp_set_apicids(); set_cpu_sibling_map(0); if (smp_sanity_check(max_cpus) < 0) { @@ -885,6 +887,13 @@ void __init smp_prepare_cpus(unsigned int max_cpus) */ setup_local_APIC(); + /* + * Enable IO APIC before setting up error vector + */ + if (!skip_ioapic_setup && nr_ioapics) + enable_IO_APIC(); + end_local_APIC_setup(); + if (GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_id) { panic("Boot APIC ID in local APIC unexpected (%d vs %d)", GET_APIC_ID(apic_read(APIC_ID)), boot_cpu_id); @@ -903,7 +912,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus) * Set up local APIC timer on boot CPU. */ - setup_boot_APIC_clock(); + setup_boot_clock(); } /* @@ -912,7 +921,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus) void __init smp_prepare_boot_cpu(void) { int me = smp_processor_id(); - cpu_set(me, cpu_online_map); + /* already set me in cpu_online_map in boot_cpu_init() */ cpu_set(me, cpu_callout_map); per_cpu(cpu_state, me) = CPU_ONLINE; } @@ -1016,7 +1025,7 @@ void remove_cpu_from_maps(void) cpu_clear(cpu, cpu_callout_map); cpu_clear(cpu, cpu_callin_map); - clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */ + clear_bit(cpu, (unsigned long *)&cpu_initialized); /* was set by cpu_init() */ clear_node_cpumask(cpu); } diff --git a/arch/x86/kernel/smpcommon_32.c b/arch/x86/kernel/smpcommon_32.c index bbfe85a0f699..8bc38af29aef 100644 --- a/arch/x86/kernel/smpcommon_32.c +++ b/arch/x86/kernel/smpcommon_32.c @@ -14,10 +14,11 @@ __cpuinit void init_gdt(int cpu) { struct desc_struct *gdt = get_cpu_gdt_table(cpu); - pack_descriptor((u32 *)&gdt[GDT_ENTRY_PERCPU].a, - (u32 *)&gdt[GDT_ENTRY_PERCPU].b, + pack_descriptor(&gdt[GDT_ENTRY_PERCPU], __per_cpu_offset[cpu], 0xFFFFF, - 0x80 | DESCTYPE_S | 0x2, 0x8); + 0x2 | DESCTYPE_S, 0x8); + + gdt[GDT_ENTRY_PERCPU].s = 1; per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu]; per_cpu(cpu_number, cpu) = cpu; diff --git a/arch/x86/kernel/srat_32.c b/arch/x86/kernel/srat_32.c index 2a8713ec0f9a..2bf6903cb444 100644 --- a/arch/x86/kernel/srat_32.c +++ b/arch/x86/kernel/srat_32.c @@ -57,8 +57,6 @@ static struct node_memory_chunk_s node_memory_chunk[MAXCHUNKS]; static int num_memory_chunks; /* total number of memory chunks */ static u8 __initdata apicid_to_pxm[MAX_APICID]; -extern void * boot_ioremap(unsigned long, unsigned long); - /* Identify CPU proximity domains */ static void __init parse_cpu_affinity_structure(char *p) { @@ -299,7 +297,7 @@ int __init get_memcfg_from_srat(void) } rsdt = (struct acpi_table_rsdt *) - boot_ioremap(rsdp->rsdt_physical_address, sizeof(struct acpi_table_rsdt)); + early_ioremap(rsdp->rsdt_physical_address, sizeof(struct acpi_table_rsdt)); if (!rsdt) { printk(KERN_WARNING @@ -339,11 +337,11 @@ int __init get_memcfg_from_srat(void) for (i = 0; i < tables; i++) { /* Map in header, then map in full table length. 
*/ header = (struct acpi_table_header *) - boot_ioremap(saved_rsdt.table.table_offset_entry[i], sizeof(struct acpi_table_header)); + early_ioremap(saved_rsdt.table.table_offset_entry[i], sizeof(struct acpi_table_header)); if (!header) break; header = (struct acpi_table_header *) - boot_ioremap(saved_rsdt.table.table_offset_entry[i], header->length); + early_ioremap(saved_rsdt.table.table_offset_entry[i], header->length); if (!header) break; diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 6fa6cf036c70..02f0f61f5b11 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -22,9 +22,23 @@ static int save_stack_stack(void *data, char *name) return -1; } -static void save_stack_address(void *data, unsigned long addr) +static void save_stack_address(void *data, unsigned long addr, int reliable) +{ + struct stack_trace *trace = data; + if (trace->skip > 0) { + trace->skip--; + return; + } + if (trace->nr_entries < trace->max_entries) + trace->entries[trace->nr_entries++] = addr; +} + +static void +save_stack_address_nosched(void *data, unsigned long addr, int reliable) { struct stack_trace *trace = (struct stack_trace *)data; + if (in_sched_functions(addr)) + return; if (trace->skip > 0) { trace->skip--; return; @@ -40,13 +54,26 @@ static const struct stacktrace_ops save_stack_ops = { .address = save_stack_address, }; +static const struct stacktrace_ops save_stack_ops_nosched = { + .warning = save_stack_warning, + .warning_symbol = save_stack_warning_symbol, + .stack = save_stack_stack, + .address = save_stack_address_nosched, +}; + /* * Save stack-backtrace addresses into a stack_trace buffer. */ void save_stack_trace(struct stack_trace *trace) { - dump_trace(current, NULL, NULL, &save_stack_ops, trace); + dump_trace(current, NULL, NULL, 0, &save_stack_ops, trace); + if (trace->nr_entries < trace->max_entries) + trace->entries[trace->nr_entries++] = ULONG_MAX; +} + +void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) +{ + dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace); if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = ULONG_MAX; } -EXPORT_SYMBOL(save_stack_trace); diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c new file mode 100644 index 000000000000..2ef1a5f8d675 --- /dev/null +++ b/arch/x86/kernel/step.c @@ -0,0 +1,203 @@ +/* + * x86 single-step support code, common to 32-bit and 64-bit. + */ +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/ptrace.h> + +unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs) +{ + unsigned long addr, seg; + + addr = regs->ip; + seg = regs->cs & 0xffff; + if (v8086_mode(regs)) { + addr = (addr & 0xffff) + (seg << 4); + return addr; + } + + /* + * We'll assume that the code segments in the GDT + * are all zero-based. That is largely true: the + * TLS segments are used for data, and the PNPBIOS + * and APM bios ones we just ignore here. + */ + if ((seg & SEGMENT_TI_MASK) == SEGMENT_LDT) { + u32 *desc; + unsigned long base; + + seg &= ~7UL; + + mutex_lock(&child->mm->context.lock); + if (unlikely((seg >> 3) >= child->mm->context.size)) + addr = -1L; /* bogus selector, access would fault */ + else { + desc = child->mm->context.ldt + seg; + base = ((desc[0] >> 16) | + ((desc[1] & 0xff) << 16) | + (desc[1] & 0xff000000)); + + /* 16-bit code segment? 
*/ + if (!((desc[1] >> 22) & 1)) + addr &= 0xffff; + addr += base; + } + mutex_unlock(&child->mm->context.lock); + } + + return addr; +} + +static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs) +{ + int i, copied; + unsigned char opcode[15]; + unsigned long addr = convert_ip_to_linear(child, regs); + + copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0); + for (i = 0; i < copied; i++) { + switch (opcode[i]) { + /* popf and iret */ + case 0x9d: case 0xcf: + return 1; + + /* CHECKME: 64 65 */ + + /* opcode and address size prefixes */ + case 0x66: case 0x67: + continue; + /* irrelevant prefixes (segment overrides and repeats) */ + case 0x26: case 0x2e: + case 0x36: case 0x3e: + case 0x64: case 0x65: + case 0xf0: case 0xf2: case 0xf3: + continue; + +#ifdef CONFIG_X86_64 + case 0x40 ... 0x4f: + if (regs->cs != __USER_CS) + /* 32-bit mode: register increment */ + return 0; + /* 64-bit mode: REX prefix */ + continue; +#endif + + /* CHECKME: f2, f3 */ + + /* + * pushf: NOTE! We should probably not let + * the user see the TF bit being set. But + * it's more pain than it's worth to avoid + * it, and a debugger could emulate this + * all in user space if it _really_ cares. + */ + case 0x9c: + default: + return 0; + } + } + return 0; +} + +/* + * Enable single-stepping. Return nonzero if user mode is not using TF itself. + */ +static int enable_single_step(struct task_struct *child) +{ + struct pt_regs *regs = task_pt_regs(child); + + /* + * Always set TIF_SINGLESTEP - this guarantees that + * we single-step system calls etc.. This will also + * cause us to set TF when returning to user mode. + */ + set_tsk_thread_flag(child, TIF_SINGLESTEP); + + /* + * If TF was already set, don't do anything else + */ + if (regs->flags & X86_EFLAGS_TF) + return 0; + + /* Set TF on the kernel stack.. */ + regs->flags |= X86_EFLAGS_TF; + + /* + * ..but if TF is changed by the instruction we will trace, + * don't mark it as being "us" that set it, so that we + * won't clear it by hand later. + */ + if (is_setting_trap_flag(child, regs)) + return 0; + + set_tsk_thread_flag(child, TIF_FORCED_TF); + + return 1; +} + +/* + * Install this value in MSR_IA32_DEBUGCTLMSR whenever child is running. + */ +static void write_debugctlmsr(struct task_struct *child, unsigned long val) +{ + child->thread.debugctlmsr = val; + + if (child != current) + return; + + wrmsrl(MSR_IA32_DEBUGCTLMSR, val); +} + +/* + * Enable single or block step. + */ +static void enable_step(struct task_struct *child, bool block) +{ + /* + * Make sure block stepping (BTF) is not enabled unless it should be. + * Note that we don't try to worry about any is_setting_trap_flag() + * instructions after the first when using block stepping. + * So noone should try to use debugger block stepping in a program + * that uses user-mode single stepping itself. + */ + if (enable_single_step(child) && block) { + set_tsk_thread_flag(child, TIF_DEBUGCTLMSR); + write_debugctlmsr(child, + child->thread.debugctlmsr | DEBUGCTLMSR_BTF); + } else { + write_debugctlmsr(child, + child->thread.debugctlmsr & ~TIF_DEBUGCTLMSR); + + if (!child->thread.debugctlmsr) + clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); + } +} + +void user_enable_single_step(struct task_struct *child) +{ + enable_step(child, 0); +} + +void user_enable_block_step(struct task_struct *child) +{ + enable_step(child, 1); +} + +void user_disable_single_step(struct task_struct *child) +{ + /* + * Make sure block stepping (BTF) is disabled. 
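/*
 * Illustrative sketch, not from this patch: the prefix walk performed by
 * is_setting_trap_flag() above, reduced to a user-space scan over a byte
 * buffer. It skips operand/address-size, segment-override and lock/rep
 * prefixes and reports whether the instruction is popf (0x9d) or iret
 * (0xcf), i.e. something that can rewrite TF itself. can_touch_tf() is a
 * made-up name, and the 64-bit REX prefix handling is omitted for brevity.
 */
#include <stdio.h>

static int can_touch_tf(const unsigned char *op, int len)
{
	int i;

	for (i = 0; i < len; i++) {
		switch (op[i]) {
		case 0x9d: case 0xcf:			/* popf, iret */
			return 1;
		case 0x66: case 0x67:			/* operand/address size */
		case 0x26: case 0x2e: case 0x36:	/* segment overrides */
		case 0x3e: case 0x64: case 0x65:
		case 0xf0: case 0xf2: case 0xf3:	/* lock, rep */
			continue;
		default:
			return 0;
		}
	}
	return 0;
}

int main(void)
{
	const unsigned char insn[] = { 0x66, 0x9d };	/* 16-bit popf */

	printf("%s\n", can_touch_tf(insn, sizeof(insn)) ?
	       "may set/clear TF" : "leaves TF alone");
	return 0;
}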
+ */ + write_debugctlmsr(child, + child->thread.debugctlmsr & ~TIF_DEBUGCTLMSR); + + if (!child->thread.debugctlmsr) + clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); + + /* Always clear TIF_SINGLESTEP... */ + clear_tsk_thread_flag(child, TIF_SINGLESTEP); + + /* But touch TF only if it was set by us.. */ + if (test_and_clear_tsk_thread_flag(child, TIF_FORCED_TF)) + task_pt_regs(child)->flags &= ~X86_EFLAGS_TF; +} diff --git a/arch/x86/kernel/suspend_64.c b/arch/x86/kernel/suspend_64.c index 2e5efaaf8800..09199511c256 100644 --- a/arch/x86/kernel/suspend_64.c +++ b/arch/x86/kernel/suspend_64.c @@ -17,9 +17,26 @@ /* References to section boundaries */ extern const void __nosave_begin, __nosave_end; +static void fix_processor_context(void); + struct saved_context saved_context; -void __save_processor_state(struct saved_context *ctxt) +/** + * __save_processor_state - save CPU registers before creating a + * hibernation image and before restoring the memory state from it + * @ctxt - structure to store the registers contents in + * + * NOTE: If there is a CPU register the modification of which by the + * boot kernel (ie. the kernel used for loading the hibernation image) + * might affect the operations of the restored target kernel (ie. the one + * saved in the hibernation image), then its contents must be saved by this + * function. In other words, if kernel A is hibernated and different + * kernel B is used for loading the hibernation image into memory, the + * kernel A's __save_processor_state() function must save all registers + * needed by kernel A, so that it can operate correctly after the resume + * regardless of what kernel B does in the meantime. + */ +static void __save_processor_state(struct saved_context *ctxt) { kernel_fpu_begin(); @@ -69,7 +86,12 @@ static void do_fpu_end(void) kernel_fpu_end(); } -void __restore_processor_state(struct saved_context *ctxt) +/** + * __restore_processor_state - restore the contents of CPU registers saved + * by __save_processor_state() + * @ctxt - structure to load the registers contents from + */ +static void __restore_processor_state(struct saved_context *ctxt) { /* * control registers @@ -113,14 +135,14 @@ void restore_processor_state(void) __restore_processor_state(&saved_context); } -void fix_processor_context(void) +static void fix_processor_context(void) { int cpu = smp_processor_id(); struct tss_struct *t = &per_cpu(init_tss, cpu); set_tss_desc(cpu,t); /* This just modifies memory; should not be necessary. But... This is necessary, because 386 hardware has concept of busy TSS or some similar stupidity. 
*/ - cpu_gdt(cpu)[GDT_ENTRY_TSS].type = 9; + get_cpu_gdt_table(cpu)[GDT_ENTRY_TSS].type = 9; syscall_init(); /* This sets MSR_*STAR and related */ load_TR_desc(); /* This does ltr */ diff --git a/arch/x86/kernel/suspend_asm_64.S b/arch/x86/kernel/suspend_asm_64.S index 72f952103e50..aeb9a4d7681e 100644 --- a/arch/x86/kernel/suspend_asm_64.S +++ b/arch/x86/kernel/suspend_asm_64.S @@ -18,13 +18,13 @@ ENTRY(swsusp_arch_suspend) movq $saved_context, %rax - movq %rsp, pt_regs_rsp(%rax) - movq %rbp, pt_regs_rbp(%rax) - movq %rsi, pt_regs_rsi(%rax) - movq %rdi, pt_regs_rdi(%rax) - movq %rbx, pt_regs_rbx(%rax) - movq %rcx, pt_regs_rcx(%rax) - movq %rdx, pt_regs_rdx(%rax) + movq %rsp, pt_regs_sp(%rax) + movq %rbp, pt_regs_bp(%rax) + movq %rsi, pt_regs_si(%rax) + movq %rdi, pt_regs_di(%rax) + movq %rbx, pt_regs_bx(%rax) + movq %rcx, pt_regs_cx(%rax) + movq %rdx, pt_regs_dx(%rax) movq %r8, pt_regs_r8(%rax) movq %r9, pt_regs_r9(%rax) movq %r10, pt_regs_r10(%rax) @@ -34,7 +34,7 @@ ENTRY(swsusp_arch_suspend) movq %r14, pt_regs_r14(%rax) movq %r15, pt_regs_r15(%rax) pushfq - popq pt_regs_eflags(%rax) + popq pt_regs_flags(%rax) /* save the address of restore_registers */ movq $restore_registers, %rax @@ -115,13 +115,13 @@ ENTRY(restore_registers) /* We don't restore %rax, it must be 0 anyway */ movq $saved_context, %rax - movq pt_regs_rsp(%rax), %rsp - movq pt_regs_rbp(%rax), %rbp - movq pt_regs_rsi(%rax), %rsi - movq pt_regs_rdi(%rax), %rdi - movq pt_regs_rbx(%rax), %rbx - movq pt_regs_rcx(%rax), %rcx - movq pt_regs_rdx(%rax), %rdx + movq pt_regs_sp(%rax), %rsp + movq pt_regs_bp(%rax), %rbp + movq pt_regs_si(%rax), %rsi + movq pt_regs_di(%rax), %rdi + movq pt_regs_bx(%rax), %rbx + movq pt_regs_cx(%rax), %rcx + movq pt_regs_dx(%rax), %rdx movq pt_regs_r8(%rax), %r8 movq pt_regs_r9(%rax), %r9 movq pt_regs_r10(%rax), %r10 @@ -130,7 +130,7 @@ ENTRY(restore_registers) movq pt_regs_r13(%rax), %r13 movq pt_regs_r14(%rax), %r14 movq pt_regs_r15(%rax), %r15 - pushq pt_regs_eflags(%rax) + pushq pt_regs_flags(%rax) popfq xorq %rax, %rax diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 907942ee6e76..bd802a5e1aa3 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -12,6 +12,7 @@ #include <linux/file.h> #include <linux/utsname.h> #include <linux/personality.h> +#include <linux/random.h> #include <asm/uaccess.h> #include <asm/ia32.h> @@ -65,6 +66,7 @@ static void find_start_end(unsigned long flags, unsigned long *begin, unsigned long *end) { if (!test_thread_flag(TIF_IA32) && (flags & MAP_32BIT)) { + unsigned long new_begin; /* This is usually used needed to map code in small model, so it needs to be in the first 31bit. Limit it to that. This means we need to move the @@ -74,6 +76,11 @@ static void find_start_end(unsigned long flags, unsigned long *begin, of playground for now. 
-AK */ *begin = 0x40000000; *end = 0x80000000; + if (current->flags & PF_RANDOMIZE) { + new_begin = randomize_range(*begin, *begin + 0x02000000, 0); + if (new_begin) + *begin = new_begin; + } } else { *begin = TASK_UNMAPPED_BASE; *end = TASK_SIZE; @@ -143,6 +150,97 @@ full_search: } } + +unsigned long +arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, + const unsigned long len, const unsigned long pgoff, + const unsigned long flags) +{ + struct vm_area_struct *vma; + struct mm_struct *mm = current->mm; + unsigned long addr = addr0; + + /* requested length too big for entire address space */ + if (len > TASK_SIZE) + return -ENOMEM; + + if (flags & MAP_FIXED) + return addr; + + /* for MAP_32BIT mappings we force the legact mmap base */ + if (!test_thread_flag(TIF_IA32) && (flags & MAP_32BIT)) + goto bottomup; + + /* requesting a specific address */ + if (addr) { + addr = PAGE_ALIGN(addr); + vma = find_vma(mm, addr); + if (TASK_SIZE - len >= addr && + (!vma || addr + len <= vma->vm_start)) + return addr; + } + + /* check if free_area_cache is useful for us */ + if (len <= mm->cached_hole_size) { + mm->cached_hole_size = 0; + mm->free_area_cache = mm->mmap_base; + } + + /* either no address requested or can't fit in requested address hole */ + addr = mm->free_area_cache; + + /* make sure it can fit in the remaining address space */ + if (addr > len) { + vma = find_vma(mm, addr-len); + if (!vma || addr <= vma->vm_start) + /* remember the address as a hint for next time */ + return (mm->free_area_cache = addr-len); + } + + if (mm->mmap_base < len) + goto bottomup; + + addr = mm->mmap_base-len; + + do { + /* + * Lookup failure means no vma is above this address, + * else if new region fits below vma->vm_start, + * return with success: + */ + vma = find_vma(mm, addr); + if (!vma || addr+len <= vma->vm_start) + /* remember the address as a hint for next time */ + return (mm->free_area_cache = addr); + + /* remember the largest hole we saw so far */ + if (addr + mm->cached_hole_size < vma->vm_start) + mm->cached_hole_size = vma->vm_start - addr; + + /* try just below the current vma->vm_start */ + addr = vma->vm_start-len; + } while (len < vma->vm_start); + +bottomup: + /* + * A failed mmap() very likely causes application failure, + * so fall back to the bottom-up function here. This scenario + * can happen with large stack limits and large mmap() + * allocations. + */ + mm->cached_hole_size = ~0UL; + mm->free_area_cache = TASK_UNMAPPED_BASE; + addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags); + /* + * Restore the topdown base: + */ + mm->free_area_cache = mm->mmap_base; + mm->cached_hole_size = ~0UL; + + return addr; +} + + asmlinkage long sys_uname(struct new_utsname __user * name) { int err; diff --git a/arch/x86/kernel/sysenter_32.c b/arch/x86/kernel/sysenter_32.c deleted file mode 100644 index 5a2d951e2608..000000000000 --- a/arch/x86/kernel/sysenter_32.c +++ /dev/null @@ -1,346 +0,0 @@ -/* - * (C) Copyright 2002 Linus Torvalds - * Portions based on the vdso-randomization code from exec-shield: - * Copyright(C) 2005-2006, Red Hat, Inc., Ingo Molnar - * - * This file contains the needed initializations to support sysenter. 
- */ - -#include <linux/init.h> -#include <linux/smp.h> -#include <linux/thread_info.h> -#include <linux/sched.h> -#include <linux/gfp.h> -#include <linux/string.h> -#include <linux/elf.h> -#include <linux/mm.h> -#include <linux/err.h> -#include <linux/module.h> - -#include <asm/cpufeature.h> -#include <asm/msr.h> -#include <asm/pgtable.h> -#include <asm/unistd.h> -#include <asm/elf.h> -#include <asm/tlbflush.h> - -enum { - VDSO_DISABLED = 0, - VDSO_ENABLED = 1, - VDSO_COMPAT = 2, -}; - -#ifdef CONFIG_COMPAT_VDSO -#define VDSO_DEFAULT VDSO_COMPAT -#else -#define VDSO_DEFAULT VDSO_ENABLED -#endif - -/* - * Should the kernel map a VDSO page into processes and pass its - * address down to glibc upon exec()? - */ -unsigned int __read_mostly vdso_enabled = VDSO_DEFAULT; - -EXPORT_SYMBOL_GPL(vdso_enabled); - -static int __init vdso_setup(char *s) -{ - vdso_enabled = simple_strtoul(s, NULL, 0); - - return 1; -} - -__setup("vdso=", vdso_setup); - -extern asmlinkage void sysenter_entry(void); - -static __init void reloc_symtab(Elf32_Ehdr *ehdr, - unsigned offset, unsigned size) -{ - Elf32_Sym *sym = (void *)ehdr + offset; - unsigned nsym = size / sizeof(*sym); - unsigned i; - - for(i = 0; i < nsym; i++, sym++) { - if (sym->st_shndx == SHN_UNDEF || - sym->st_shndx == SHN_ABS) - continue; /* skip */ - - if (sym->st_shndx > SHN_LORESERVE) { - printk(KERN_INFO "VDSO: unexpected st_shndx %x\n", - sym->st_shndx); - continue; - } - - switch(ELF_ST_TYPE(sym->st_info)) { - case STT_OBJECT: - case STT_FUNC: - case STT_SECTION: - case STT_FILE: - sym->st_value += VDSO_HIGH_BASE; - } - } -} - -static __init void reloc_dyn(Elf32_Ehdr *ehdr, unsigned offset) -{ - Elf32_Dyn *dyn = (void *)ehdr + offset; - - for(; dyn->d_tag != DT_NULL; dyn++) - switch(dyn->d_tag) { - case DT_PLTGOT: - case DT_HASH: - case DT_STRTAB: - case DT_SYMTAB: - case DT_RELA: - case DT_INIT: - case DT_FINI: - case DT_REL: - case DT_DEBUG: - case DT_JMPREL: - case DT_VERSYM: - case DT_VERDEF: - case DT_VERNEED: - case DT_ADDRRNGLO ... DT_ADDRRNGHI: - /* definitely pointers needing relocation */ - dyn->d_un.d_ptr += VDSO_HIGH_BASE; - break; - - case DT_ENCODING ... OLD_DT_LOOS-1: - case DT_LOOS ... DT_HIOS-1: - /* Tags above DT_ENCODING are pointers if - they're even */ - if (dyn->d_tag >= DT_ENCODING && - (dyn->d_tag & 1) == 0) - dyn->d_un.d_ptr += VDSO_HIGH_BASE; - break; - - case DT_VERDEFNUM: - case DT_VERNEEDNUM: - case DT_FLAGS_1: - case DT_RELACOUNT: - case DT_RELCOUNT: - case DT_VALRNGLO ... DT_VALRNGHI: - /* definitely not pointers */ - break; - - case OLD_DT_LOOS ... DT_LOOS-1: - case DT_HIOS ... 
DT_VALRNGLO-1: - default: - if (dyn->d_tag > DT_ENCODING) - printk(KERN_INFO "VDSO: unexpected DT_tag %x\n", - dyn->d_tag); - break; - } -} - -static __init void relocate_vdso(Elf32_Ehdr *ehdr) -{ - Elf32_Phdr *phdr; - Elf32_Shdr *shdr; - int i; - - BUG_ON(memcmp(ehdr->e_ident, ELFMAG, 4) != 0 || - !elf_check_arch(ehdr) || - ehdr->e_type != ET_DYN); - - ehdr->e_entry += VDSO_HIGH_BASE; - - /* rebase phdrs */ - phdr = (void *)ehdr + ehdr->e_phoff; - for (i = 0; i < ehdr->e_phnum; i++) { - phdr[i].p_vaddr += VDSO_HIGH_BASE; - - /* relocate dynamic stuff */ - if (phdr[i].p_type == PT_DYNAMIC) - reloc_dyn(ehdr, phdr[i].p_offset); - } - - /* rebase sections */ - shdr = (void *)ehdr + ehdr->e_shoff; - for(i = 0; i < ehdr->e_shnum; i++) { - if (!(shdr[i].sh_flags & SHF_ALLOC)) - continue; - - shdr[i].sh_addr += VDSO_HIGH_BASE; - - if (shdr[i].sh_type == SHT_SYMTAB || - shdr[i].sh_type == SHT_DYNSYM) - reloc_symtab(ehdr, shdr[i].sh_offset, - shdr[i].sh_size); - } -} - -void enable_sep_cpu(void) -{ - int cpu = get_cpu(); - struct tss_struct *tss = &per_cpu(init_tss, cpu); - - if (!boot_cpu_has(X86_FEATURE_SEP)) { - put_cpu(); - return; - } - - tss->x86_tss.ss1 = __KERNEL_CS; - tss->x86_tss.esp1 = sizeof(struct tss_struct) + (unsigned long) tss; - wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); - wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.esp1, 0); - wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) sysenter_entry, 0); - put_cpu(); -} - -static struct vm_area_struct gate_vma; - -static int __init gate_vma_init(void) -{ - gate_vma.vm_mm = NULL; - gate_vma.vm_start = FIXADDR_USER_START; - gate_vma.vm_end = FIXADDR_USER_END; - gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC; - gate_vma.vm_page_prot = __P101; - /* - * Make sure the vDSO gets into every core dump. - * Dumping its contents makes post-mortem fully interpretable later - * without matching up the same kernel and hardware config to see - * what PC values meant. - */ - gate_vma.vm_flags |= VM_ALWAYSDUMP; - return 0; -} - -/* - * These symbols are defined by vsyscall.o to mark the bounds - * of the ELF DSO images included therein. - */ -extern const char vsyscall_int80_start, vsyscall_int80_end; -extern const char vsyscall_sysenter_start, vsyscall_sysenter_end; -static struct page *syscall_pages[1]; - -static void map_compat_vdso(int map) -{ - static int vdso_mapped; - - if (map == vdso_mapped) - return; - - vdso_mapped = map; - - __set_fixmap(FIX_VDSO, page_to_pfn(syscall_pages[0]) << PAGE_SHIFT, - map ? 
PAGE_READONLY_EXEC : PAGE_NONE); - - /* flush stray tlbs */ - flush_tlb_all(); -} - -int __init sysenter_setup(void) -{ - void *syscall_page = (void *)get_zeroed_page(GFP_ATOMIC); - const void *vsyscall; - size_t vsyscall_len; - - syscall_pages[0] = virt_to_page(syscall_page); - - gate_vma_init(); - - printk("Compat vDSO mapped to %08lx.\n", __fix_to_virt(FIX_VDSO)); - - if (!boot_cpu_has(X86_FEATURE_SEP)) { - vsyscall = &vsyscall_int80_start; - vsyscall_len = &vsyscall_int80_end - &vsyscall_int80_start; - } else { - vsyscall = &vsyscall_sysenter_start; - vsyscall_len = &vsyscall_sysenter_end - &vsyscall_sysenter_start; - } - - memcpy(syscall_page, vsyscall, vsyscall_len); - relocate_vdso(syscall_page); - - return 0; -} - -/* Defined in vsyscall-sysenter.S */ -extern void SYSENTER_RETURN; - -/* Setup a VMA at program startup for the vsyscall page */ -int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) -{ - struct mm_struct *mm = current->mm; - unsigned long addr; - int ret = 0; - bool compat; - - down_write(&mm->mmap_sem); - - /* Test compat mode once here, in case someone - changes it via sysctl */ - compat = (vdso_enabled == VDSO_COMPAT); - - map_compat_vdso(compat); - - if (compat) - addr = VDSO_HIGH_BASE; - else { - addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0); - if (IS_ERR_VALUE(addr)) { - ret = addr; - goto up_fail; - } - - /* - * MAYWRITE to allow gdb to COW and set breakpoints - * - * Make sure the vDSO gets into every core dump. - * Dumping its contents makes post-mortem fully - * interpretable later without matching up the same - * kernel and hardware config to see what PC values - * meant. - */ - ret = install_special_mapping(mm, addr, PAGE_SIZE, - VM_READ|VM_EXEC| - VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| - VM_ALWAYSDUMP, - syscall_pages); - - if (ret) - goto up_fail; - } - - current->mm->context.vdso = (void *)addr; - current_thread_info()->sysenter_return = - (void *)VDSO_SYM(&SYSENTER_RETURN); - - up_fail: - up_write(&mm->mmap_sem); - - return ret; -} - -const char *arch_vma_name(struct vm_area_struct *vma) -{ - if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso) - return "[vdso]"; - return NULL; -} - -struct vm_area_struct *get_gate_vma(struct task_struct *tsk) -{ - struct mm_struct *mm = tsk->mm; - - /* Check to see if this task was created in compat vdso mode */ - if (mm && mm->context.vdso == (void *)VDSO_HIGH_BASE) - return &gate_vma; - return NULL; -} - -int in_gate_area(struct task_struct *task, unsigned long addr) -{ - const struct vm_area_struct *vma = get_gate_vma(task); - - return vma && addr >= vma->vm_start && addr < vma->vm_end; -} - -int in_gate_area_no_task(unsigned long addr) -{ - return 0; -} diff --git a/arch/x86/kernel/test_nx.c b/arch/x86/kernel/test_nx.c new file mode 100644 index 000000000000..6d7ef11e7975 --- /dev/null +++ b/arch/x86/kernel/test_nx.c @@ -0,0 +1,176 @@ +/* + * test_nx.c: functional test for NX functionality + * + * (C) Copyright 2008 Intel Corporation + * Author: Arjan van de Ven <arjan@linux.intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. 
+ */ +#include <linux/module.h> +#include <linux/sort.h> +#include <asm/uaccess.h> + +extern int rodata_test_data; + +/* + * This file checks 4 things: + * 1) Check if the stack is not executable + * 2) Check if kmalloc memory is not executable + * 3) Check if the .rodata section is not executable + * 4) Check if the .data section of a module is not executable + * + * To do this, the test code tries to execute memory in stack/kmalloc/etc, + * and then checks if the expected trap happens. + * + * Sadly, this implies having a dynamic exception handling table entry. + * ... which can be done (and will make Rusty cry)... but it can only + * be done in a stand-alone module with only 1 entry total. + * (otherwise we'd have to sort and that's just too messy) + */ + + + +/* + * We want to set up an exception handling point on our stack, + * which means a variable value. This function is rather dirty + * and walks the exception table of the module, looking for a magic + * marker and replaces it with a specific function. + */ +static void fudze_exception_table(void *marker, void *new) +{ + struct module *mod = THIS_MODULE; + struct exception_table_entry *extable; + + /* + * Note: This module has only 1 exception table entry, + * so searching and sorting is not needed. If that changes, + * this would be the place to search and re-sort the exception + * table. + */ + if (mod->num_exentries > 1) { + printk(KERN_ERR "test_nx: too many exception table entries!\n"); + printk(KERN_ERR "test_nx: test results are not reliable.\n"); + return; + } + extable = (struct exception_table_entry *)mod->extable; + extable[0].insn = (unsigned long)new; +} + + +/* + * exception tables get their symbols translated so we need + * to use a fake function to put in there, which we can then + * replace at runtime. + */ +void foo_label(void); + +/* + * returns 0 for not-executable, negative for executable + * + * Note: we cannot allow this function to be inlined, because + * that would give us more than 1 exception table entry. + * This in turn would break the assumptions above. 
+ */ +static noinline int test_address(void *address) +{ + unsigned long result; + + /* Set up an exception table entry for our address */ + fudze_exception_table(&foo_label, address); + result = 1; + asm volatile( + "foo_label:\n" + "0: call *%[fake_code]\n" + "1:\n" + ".section .fixup,\"ax\"\n" + "2: mov %[zero], %[rslt]\n" + " ret\n" + ".previous\n" + ".section __ex_table,\"a\"\n" + " .align 8\n" + " .quad 0b\n" + " .quad 2b\n" + ".previous\n" + : [rslt] "=r" (result) + : [fake_code] "r" (address), [zero] "r" (0UL), "0" (result) + ); + /* change the exception table back for the next round */ + fudze_exception_table(address, &foo_label); + + if (result) + return -ENODEV; + return 0; +} + +static unsigned char test_data = 0xC3; /* 0xC3 is the opcode for "ret" */ + +static int test_NX(void) +{ + int ret = 0; + /* 0xC3 is the opcode for "ret" */ + char stackcode[] = {0xC3, 0x90, 0 }; + char *heap; + + test_data = 0xC3; + + printk(KERN_INFO "Testing NX protection\n"); + + /* Test 1: check if the stack is not executable */ + if (test_address(&stackcode)) { + printk(KERN_ERR "test_nx: stack was executable\n"); + ret = -ENODEV; + } + + + /* Test 2: Check if the heap is executable */ + heap = kmalloc(64, GFP_KERNEL); + if (!heap) + return -ENOMEM; + heap[0] = 0xC3; /* opcode for "ret" */ + + if (test_address(heap)) { + printk(KERN_ERR "test_nx: heap was executable\n"); + ret = -ENODEV; + } + kfree(heap); + + /* + * The following 2 tests currently fail, this needs to get fixed + * Until then, don't run them to avoid too many people getting scared + * by the error message + */ +#if 0 + +#ifdef CONFIG_DEBUG_RODATA + /* Test 3: Check if the .rodata section is executable */ + if (rodata_test_data != 0xC3) { + printk(KERN_ERR "test_nx: .rodata marker has invalid value\n"); + ret = -ENODEV; + } else if (test_address(&rodata_test_data)) { + printk(KERN_ERR "test_nx: .rodata section is executable\n"); + ret = -ENODEV; + } +#endif + + /* Test 4: Check if the .data section of a module is executable */ + if (test_address(&test_data)) { + printk(KERN_ERR "test_nx: .data section is executable\n"); + ret = -ENODEV; + } + +#endif + return 0; +} + +static void test_exit(void) +{ +} + +module_init(test_NX); +module_exit(test_exit); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Testcase for the NX infrastructure"); +MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>"); diff --git a/arch/x86/kernel/test_rodata.c b/arch/x86/kernel/test_rodata.c new file mode 100644 index 000000000000..4c163772000e --- /dev/null +++ b/arch/x86/kernel/test_rodata.c @@ -0,0 +1,86 @@ +/* + * test_rodata.c: functional test for mark_rodata_ro function + * + * (C) Copyright 2008 Intel Corporation + * Author: Arjan van de Ven <arjan@linux.intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. 
+ */ +#include <linux/module.h> +#include <asm/sections.h> +extern int rodata_test_data; + +int rodata_test(void) +{ + unsigned long result; + unsigned long start, end; + + /* test 1: read the value */ + /* If this test fails, some previous testrun has clobbered the state */ + if (!rodata_test_data) { + printk(KERN_ERR "rodata_test: test 1 fails (start data)\n"); + return -ENODEV; + } + + /* test 2: write to the variable; this should fault */ + /* + * If this test fails, we managed to overwrite the data + * + * This is written in assembly to be able to catch the + * exception that is supposed to happen in the correct + * case + */ + + result = 1; + asm volatile( + "0: mov %[zero],(%[rodata_test])\n" + " mov %[zero], %[rslt]\n" + "1:\n" + ".section .fixup,\"ax\"\n" + "2: jmp 1b\n" + ".previous\n" + ".section __ex_table,\"a\"\n" + " .align 16\n" +#ifdef CONFIG_X86_32 + " .long 0b,2b\n" +#else + " .quad 0b,2b\n" +#endif + ".previous" + : [rslt] "=r" (result) + : [rodata_test] "r" (&rodata_test_data), [zero] "r" (0UL) + ); + + + if (!result) { + printk(KERN_ERR "rodata_test: test data was not read only\n"); + return -ENODEV; + } + + /* test 3: check the value hasn't changed */ + /* If this test fails, we managed to overwrite the data */ + if (!rodata_test_data) { + printk(KERN_ERR "rodata_test: test 3 fails (end data)\n"); + return -ENODEV; + } + /* test 4: check if the rodata section is 4Kb aligned */ + start = (unsigned long)__start_rodata; + end = (unsigned long)__end_rodata; + if (start & (PAGE_SIZE - 1)) { + printk(KERN_ERR "rodata_test: .rodata is not 4k aligned\n"); + return -ENODEV; + } + if (end & (PAGE_SIZE - 1)) { + printk(KERN_ERR "rodata_test: .rodata end is not 4k aligned\n"); + return -ENODEV; + } + + return 0; +} + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Testcase for the DEBUG_RODATA infrastructure"); +MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>"); diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c index 8a322c96bc23..1a89e93f3f1c 100644 --- a/arch/x86/kernel/time_32.c +++ b/arch/x86/kernel/time_32.c @@ -28,98 +28,20 @@ * serialize accesses to xtime/lost_ticks). */ -#include <linux/errno.h> -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/param.h> -#include <linux/string.h> -#include <linux/mm.h> +#include <linux/init.h> #include <linux/interrupt.h> #include <linux/time.h> -#include <linux/delay.h> -#include <linux/init.h> -#include <linux/smp.h> -#include <linux/module.h> -#include <linux/sysdev.h> -#include <linux/bcd.h> -#include <linux/efi.h> #include <linux/mca.h> -#include <asm/io.h> -#include <asm/smp.h> -#include <asm/irq.h> -#include <asm/msr.h> -#include <asm/delay.h> -#include <asm/mpspec.h> -#include <asm/uaccess.h> -#include <asm/processor.h> -#include <asm/timer.h> -#include <asm/time.h> - -#include "mach_time.h" - -#include <linux/timex.h> - -#include <asm/hpet.h> - #include <asm/arch_hooks.h> - -#include "io_ports.h" - -#include <asm/i8259.h> +#include <asm/hpet.h> +#include <asm/time.h> #include "do_timer.h" unsigned int cpu_khz; /* Detected as we calibrate the TSC */ EXPORT_SYMBOL(cpu_khz); -DEFINE_SPINLOCK(rtc_lock); -EXPORT_SYMBOL(rtc_lock); - -/* - * This is a special lock that is owned by the CPU and holds the index - * register we are working with. It is required for NMI access to the - * CMOS/RTC registers. See include/asm-i386/mc146818rtc.h for details. - */ -volatile unsigned long cmos_lock = 0; -EXPORT_SYMBOL(cmos_lock); - -/* Routines for accessing the CMOS RAM/RTC. 
*/ -unsigned char rtc_cmos_read(unsigned char addr) -{ - unsigned char val; - lock_cmos_prefix(addr); - outb_p(addr, RTC_PORT(0)); - val = inb_p(RTC_PORT(1)); - lock_cmos_suffix(addr); - return val; -} -EXPORT_SYMBOL(rtc_cmos_read); - -void rtc_cmos_write(unsigned char val, unsigned char addr) -{ - lock_cmos_prefix(addr); - outb_p(addr, RTC_PORT(0)); - outb_p(val, RTC_PORT(1)); - lock_cmos_suffix(addr); -} -EXPORT_SYMBOL(rtc_cmos_write); - -static int set_rtc_mmss(unsigned long nowtime) -{ - int retval; - unsigned long flags; - - /* gets recalled with irq locally disabled */ - /* XXX - does irqsave resolve this? -johnstul */ - spin_lock_irqsave(&rtc_lock, flags); - retval = set_wallclock(nowtime); - spin_unlock_irqrestore(&rtc_lock, flags); - - return retval; -} - - int timer_ack; unsigned long profile_pc(struct pt_regs *regs) @@ -127,17 +49,17 @@ unsigned long profile_pc(struct pt_regs *regs) unsigned long pc = instruction_pointer(regs); #ifdef CONFIG_SMP - if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->xcs) && + if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->cs) && in_lock_functions(pc)) { #ifdef CONFIG_FRAME_POINTER - return *(unsigned long *)(regs->ebp + 4); + return *(unsigned long *)(regs->bp + 4); #else - unsigned long *sp = (unsigned long *)®s->esp; + unsigned long *sp = (unsigned long *)®s->sp; /* Return address is either directly at stack pointer - or above a saved eflags. Eflags has bits 22-31 zero, + or above a saved flags. Eflags has bits 22-31 zero, kernel addresses don't. */ - if (sp[0] >> 22) + if (sp[0] >> 22) return sp[0]; if (sp[1] >> 22) return sp[1]; @@ -193,26 +115,6 @@ irqreturn_t timer_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -/* not static: needed by APM */ -unsigned long read_persistent_clock(void) -{ - unsigned long retval; - unsigned long flags; - - spin_lock_irqsave(&rtc_lock, flags); - - retval = get_wallclock(); - - spin_unlock_irqrestore(&rtc_lock, flags); - - return retval; -} - -int update_persistent_clock(struct timespec now) -{ - return set_rtc_mmss(now.tv_sec); -} - extern void (*late_time_init)(void); /* Duplicate of time_init() below, with hpet_enable part added */ void __init hpet_time_init(void) diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c index 368b1942b39a..0380795121a6 100644 --- a/arch/x86/kernel/time_64.c +++ b/arch/x86/kernel/time_64.c @@ -11,43 +11,18 @@ * RTC support code taken from arch/i386/kernel/timers/time_hpet.c */ -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/interrupt.h> +#include <linux/clockchips.h> #include <linux/init.h> -#include <linux/mc146818rtc.h> -#include <linux/time.h> -#include <linux/ioport.h> +#include <linux/interrupt.h> #include <linux/module.h> -#include <linux/device.h> -#include <linux/sysdev.h> -#include <linux/bcd.h> -#include <linux/notifier.h> -#include <linux/cpu.h> -#include <linux/kallsyms.h> -#include <linux/acpi.h> -#include <linux/clockchips.h> +#include <linux/time.h> -#ifdef CONFIG_ACPI -#include <acpi/achware.h> /* for PM timer frequency */ -#include <acpi/acpi_bus.h> -#endif #include <asm/i8253.h> -#include <asm/pgtable.h> -#include <asm/vsyscall.h> -#include <asm/timex.h> -#include <asm/proto.h> -#include <asm/hpet.h> -#include <asm/sections.h> -#include <linux/hpet.h> -#include <asm/apic.h> #include <asm/hpet.h> -#include <asm/mpspec.h> #include <asm/nmi.h> #include <asm/vgtod.h> - -DEFINE_SPINLOCK(rtc_lock); -EXPORT_SYMBOL(rtc_lock); +#include <asm/time.h> +#include <asm/timer.h> volatile unsigned long __jiffies 
__section_jiffies = INITIAL_JIFFIES; @@ -56,10 +31,10 @@ unsigned long profile_pc(struct pt_regs *regs) unsigned long pc = instruction_pointer(regs); /* Assume the lock function has either no stack frame or a copy - of eflags from PUSHF + of flags from PUSHF Eflags always has bits 22 and up cleared unlike kernel addresses. */ if (!user_mode(regs) && in_lock_functions(pc)) { - unsigned long *sp = (unsigned long *)regs->rsp; + unsigned long *sp = (unsigned long *)regs->sp; if (sp[0] >> 22) return sp[0]; if (sp[1] >> 22) @@ -69,82 +44,6 @@ unsigned long profile_pc(struct pt_regs *regs) } EXPORT_SYMBOL(profile_pc); -/* - * In order to set the CMOS clock precisely, set_rtc_mmss has to be called 500 - * ms after the second nowtime has started, because when nowtime is written - * into the registers of the CMOS clock, it will jump to the next second - * precisely 500 ms later. Check the Motorola MC146818A or Dallas DS12887 data - * sheet for details. - */ - -static int set_rtc_mmss(unsigned long nowtime) -{ - int retval = 0; - int real_seconds, real_minutes, cmos_minutes; - unsigned char control, freq_select; - unsigned long flags; - -/* - * set_rtc_mmss is called when irqs are enabled, so disable irqs here - */ - spin_lock_irqsave(&rtc_lock, flags); -/* - * Tell the clock it's being set and stop it. - */ - control = CMOS_READ(RTC_CONTROL); - CMOS_WRITE(control | RTC_SET, RTC_CONTROL); - - freq_select = CMOS_READ(RTC_FREQ_SELECT); - CMOS_WRITE(freq_select | RTC_DIV_RESET2, RTC_FREQ_SELECT); - - cmos_minutes = CMOS_READ(RTC_MINUTES); - BCD_TO_BIN(cmos_minutes); - -/* - * since we're only adjusting minutes and seconds, don't interfere with hour - * overflow. This avoids messing with unknown time zones but requires your RTC - * not to be off by more than 15 minutes. Since we're calling it only when - * our clock is externally synchronized using NTP, this shouldn't be a problem. - */ - - real_seconds = nowtime % 60; - real_minutes = nowtime / 60; - if (((abs(real_minutes - cmos_minutes) + 15) / 30) & 1) - real_minutes += 30; /* correct for half hour time zone */ - real_minutes %= 60; - - if (abs(real_minutes - cmos_minutes) >= 30) { - printk(KERN_WARNING "time.c: can't update CMOS clock " - "from %d to %d\n", cmos_minutes, real_minutes); - retval = -1; - } else { - BIN_TO_BCD(real_seconds); - BIN_TO_BCD(real_minutes); - CMOS_WRITE(real_seconds, RTC_SECONDS); - CMOS_WRITE(real_minutes, RTC_MINUTES); - } - -/* - * The following flags have to be released exactly in this order, otherwise the - * DS12887 (popular MC146818A clone with integrated battery and quartz) will - * not reset the oscillator and will not update precisely 500 ms later. You - * won't find this mentioned in the Dallas Semiconductor data sheets, but who - * believes data sheets anyway ... -- Markus Kuhn - */ - - CMOS_WRITE(control, RTC_CONTROL); - CMOS_WRITE(freq_select, RTC_FREQ_SELECT); - - spin_unlock_irqrestore(&rtc_lock, flags); - - return retval; -} - -int update_persistent_clock(struct timespec now) -{ - return set_rtc_mmss(now.tv_sec); -} - static irqreturn_t timer_event_interrupt(int irq, void *dev_id) { add_pda(irq0_irqs, 1); @@ -154,67 +53,10 @@ static irqreturn_t timer_event_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -unsigned long read_persistent_clock(void) -{ - unsigned int year, mon, day, hour, min, sec; - unsigned long flags; - unsigned century = 0; - - spin_lock_irqsave(&rtc_lock, flags); - /* - * if UIP is clear, then we have >= 244 microseconds before RTC - * registers will be updated. 
Spec sheet says that this is the - * reliable way to read RTC - registers invalid (off bus) during update - */ - while ((CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP)) - cpu_relax(); - - - /* now read all RTC registers while stable with interrupts disabled */ - sec = CMOS_READ(RTC_SECONDS); - min = CMOS_READ(RTC_MINUTES); - hour = CMOS_READ(RTC_HOURS); - day = CMOS_READ(RTC_DAY_OF_MONTH); - mon = CMOS_READ(RTC_MONTH); - year = CMOS_READ(RTC_YEAR); -#ifdef CONFIG_ACPI - if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID && - acpi_gbl_FADT.century) - century = CMOS_READ(acpi_gbl_FADT.century); -#endif - spin_unlock_irqrestore(&rtc_lock, flags); - - /* - * We know that x86-64 always uses BCD format, no need to check the - * config register. - */ - - BCD_TO_BIN(sec); - BCD_TO_BIN(min); - BCD_TO_BIN(hour); - BCD_TO_BIN(day); - BCD_TO_BIN(mon); - BCD_TO_BIN(year); - - if (century) { - BCD_TO_BIN(century); - year += century * 100; - printk(KERN_INFO "Extended CMOS year: %d\n", century * 100); - } else { - /* - * x86-64 systems only exists since 2002. - * This will work up to Dec 31, 2100 - */ - year += 2000; - } - - return mktime(year, mon, day, hour, min, sec); -} - /* calibrate_cpu is used on systems with fixed rate TSCs to determine * processor frequency */ #define TICK_COUNT 100000000 -static unsigned int __init tsc_calibrate_cpu_khz(void) +unsigned long __init native_calculate_cpu_khz(void) { int tsc_start, tsc_now; int i, no_ctr_free; @@ -241,7 +83,7 @@ static unsigned int __init tsc_calibrate_cpu_khz(void) rdtscl(tsc_start); do { rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now); - tsc_now = get_cycles_sync(); + tsc_now = get_cycles(); } while ((tsc_now - tsc_start) < TICK_COUNT); local_irq_restore(flags); @@ -264,20 +106,22 @@ static struct irqaction irq0 = { .name = "timer" }; -void __init time_init(void) +void __init hpet_time_init(void) { if (!hpet_enable()) setup_pit_timer(); setup_irq(0, &irq0); +} +void __init time_init(void) +{ tsc_calibrate(); cpu_khz = tsc_khz; if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) && - boot_cpu_data.x86_vendor == X86_VENDOR_AMD && - boot_cpu_data.x86 == 16) - cpu_khz = tsc_calibrate_cpu_khz(); + (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)) + cpu_khz = calculate_cpu_khz(); if (unsynchronized_tsc()) mark_tsc_unstable("TSCs unsynchronized"); @@ -290,4 +134,5 @@ void __init time_init(void) printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n", cpu_khz / 1000, cpu_khz % 1000); init_tsc_clocksource(); + late_time_init = choose_time_init(); } diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c new file mode 100644 index 000000000000..6dfd4e76661a --- /dev/null +++ b/arch/x86/kernel/tls.c @@ -0,0 +1,213 @@ +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/sched.h> +#include <linux/user.h> +#include <linux/regset.h> + +#include <asm/uaccess.h> +#include <asm/desc.h> +#include <asm/system.h> +#include <asm/ldt.h> +#include <asm/processor.h> +#include <asm/proto.h> + +#include "tls.h" + +/* + * sys_alloc_thread_area: get a yet unused TLS descriptor index. 
+ */ +static int get_free_idx(void) +{ + struct thread_struct *t = ¤t->thread; + int idx; + + for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++) + if (desc_empty(&t->tls_array[idx])) + return idx + GDT_ENTRY_TLS_MIN; + return -ESRCH; +} + +static void set_tls_desc(struct task_struct *p, int idx, + const struct user_desc *info, int n) +{ + struct thread_struct *t = &p->thread; + struct desc_struct *desc = &t->tls_array[idx - GDT_ENTRY_TLS_MIN]; + int cpu; + + /* + * We must not get preempted while modifying the TLS. + */ + cpu = get_cpu(); + + while (n-- > 0) { + if (LDT_empty(info)) + desc->a = desc->b = 0; + else + fill_ldt(desc, info); + ++info; + ++desc; + } + + if (t == ¤t->thread) + load_TLS(t, cpu); + + put_cpu(); +} + +/* + * Set a given TLS descriptor: + */ +int do_set_thread_area(struct task_struct *p, int idx, + struct user_desc __user *u_info, + int can_allocate) +{ + struct user_desc info; + + if (copy_from_user(&info, u_info, sizeof(info))) + return -EFAULT; + + if (idx == -1) + idx = info.entry_number; + + /* + * index -1 means the kernel should try to find and + * allocate an empty descriptor: + */ + if (idx == -1 && can_allocate) { + idx = get_free_idx(); + if (idx < 0) + return idx; + if (put_user(idx, &u_info->entry_number)) + return -EFAULT; + } + + if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) + return -EINVAL; + + set_tls_desc(p, idx, &info, 1); + + return 0; +} + +asmlinkage int sys_set_thread_area(struct user_desc __user *u_info) +{ + return do_set_thread_area(current, -1, u_info, 1); +} + + +/* + * Get the current Thread-Local Storage area: + */ + +static void fill_user_desc(struct user_desc *info, int idx, + const struct desc_struct *desc) + +{ + memset(info, 0, sizeof(*info)); + info->entry_number = idx; + info->base_addr = get_desc_base(desc); + info->limit = get_desc_limit(desc); + info->seg_32bit = desc->d; + info->contents = desc->type >> 2; + info->read_exec_only = !(desc->type & 2); + info->limit_in_pages = desc->g; + info->seg_not_present = !desc->p; + info->useable = desc->avl; +#ifdef CONFIG_X86_64 + info->lm = desc->l; +#endif +} + +int do_get_thread_area(struct task_struct *p, int idx, + struct user_desc __user *u_info) +{ + struct user_desc info; + + if (idx == -1 && get_user(idx, &u_info->entry_number)) + return -EFAULT; + + if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) + return -EINVAL; + + fill_user_desc(&info, idx, + &p->thread.tls_array[idx - GDT_ENTRY_TLS_MIN]); + + if (copy_to_user(u_info, &info, sizeof(info))) + return -EFAULT; + return 0; +} + +asmlinkage int sys_get_thread_area(struct user_desc __user *u_info) +{ + return do_get_thread_area(current, -1, u_info); +} + +int regset_tls_active(struct task_struct *target, + const struct user_regset *regset) +{ + struct thread_struct *t = &target->thread; + int n = GDT_ENTRY_TLS_ENTRIES; + while (n > 0 && desc_empty(&t->tls_array[n - 1])) + --n; + return n; +} + +int regset_tls_get(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + const struct desc_struct *tls; + + if (pos > GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) || + (pos % sizeof(struct user_desc)) != 0 || + (count % sizeof(struct user_desc)) != 0) + return -EINVAL; + + pos /= sizeof(struct user_desc); + count /= sizeof(struct user_desc); + + tls = &target->thread.tls_array[pos]; + + if (kbuf) { + struct user_desc *info = kbuf; + while (count-- > 0) + fill_user_desc(info++, GDT_ENTRY_TLS_MIN + pos++, + tls++); + } else { + struct 
user_desc __user *u_info = ubuf; + while (count-- > 0) { + struct user_desc info; + fill_user_desc(&info, GDT_ENTRY_TLS_MIN + pos++, tls++); + if (__copy_to_user(u_info++, &info, sizeof(info))) + return -EFAULT; + } + } + + return 0; +} + +int regset_tls_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + struct user_desc infobuf[GDT_ENTRY_TLS_ENTRIES]; + const struct user_desc *info; + + if (pos > GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) || + (pos % sizeof(struct user_desc)) != 0 || + (count % sizeof(struct user_desc)) != 0) + return -EINVAL; + + if (kbuf) + info = kbuf; + else if (__copy_from_user(infobuf, ubuf, count)) + return -EFAULT; + else + info = infobuf; + + set_tls_desc(target, + GDT_ENTRY_TLS_MIN + (pos / sizeof(struct user_desc)), + info, count / sizeof(struct user_desc)); + + return 0; +} diff --git a/arch/x86/kernel/tls.h b/arch/x86/kernel/tls.h new file mode 100644 index 000000000000..2f083a2fe216 --- /dev/null +++ b/arch/x86/kernel/tls.h @@ -0,0 +1,21 @@ +/* + * Internal declarations for x86 TLS implementation functions. + * + * Copyright (C) 2007 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + * + * Red Hat Author: Roland McGrath. + */ + +#ifndef _ARCH_X86_KERNEL_TLS_H + +#include <linux/regset.h> + +extern user_regset_active_fn regset_tls_active; +extern user_regset_get_fn regset_tls_get; +extern user_regset_set_fn regset_tls_set; + +#endif /* _ARCH_X86_KERNEL_TLS_H */ diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c index 7e16d675eb85..78cbb655aa79 100644 --- a/arch/x86/kernel/topology.c +++ b/arch/x86/kernel/topology.c @@ -31,9 +31,10 @@ #include <linux/mmzone.h> #include <asm/cpu.h> -static struct i386_cpu cpu_devices[NR_CPUS]; +static DEFINE_PER_CPU(struct x86_cpu, cpu_devices); -int __cpuinit arch_register_cpu(int num) +#ifdef CONFIG_HOTPLUG_CPU +int arch_register_cpu(int num) { /* * CPU0 cannot be offlined due to several @@ -44,21 +45,23 @@ int __cpuinit arch_register_cpu(int num) * Also certain PCI quirks require not to enable hotplug control * for all CPU's. */ -#ifdef CONFIG_HOTPLUG_CPU if (num) - cpu_devices[num].cpu.hotpluggable = 1; -#endif - - return register_cpu(&cpu_devices[num].cpu, num); + per_cpu(cpu_devices, num).cpu.hotpluggable = 1; + return register_cpu(&per_cpu(cpu_devices, num).cpu, num); } +EXPORT_SYMBOL(arch_register_cpu); -#ifdef CONFIG_HOTPLUG_CPU void arch_unregister_cpu(int num) { - return unregister_cpu(&cpu_devices[num].cpu); + return unregister_cpu(&per_cpu(cpu_devices, num).cpu); } -EXPORT_SYMBOL(arch_register_cpu); EXPORT_SYMBOL(arch_unregister_cpu); +#else +int arch_register_cpu(int num) +{ + return register_cpu(&per_cpu(cpu_devices, num).cpu, num); +} +EXPORT_SYMBOL(arch_register_cpu); #endif /*CONFIG_HOTPLUG_CPU*/ static int __init topology_init(void) diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 02d1e1e58e81..3cf72977d012 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -76,7 +76,8 @@ char ignore_fpu_irq = 0; * F0 0F bug workaround.. We have a special link segment * for this. 
*/ -struct desc_struct idt_table[256] __attribute__((__section__(".data.idt"))) = { {0, 0}, }; +gate_desc idt_table[256] + __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, }; asmlinkage void divide_error(void); asmlinkage void debug(void); @@ -101,6 +102,34 @@ asmlinkage void machine_check(void); int kstack_depth_to_print = 24; static unsigned int code_bytes = 64; +void printk_address(unsigned long address, int reliable) +{ +#ifdef CONFIG_KALLSYMS + unsigned long offset = 0, symsize; + const char *symname; + char *modname; + char *delim = ":"; + char namebuf[128]; + char reliab[4] = ""; + + symname = kallsyms_lookup(address, &symsize, &offset, + &modname, namebuf); + if (!symname) { + printk(" [<%08lx>]\n", address); + return; + } + if (!reliable) + strcpy(reliab, "? "); + + if (!modname) + modname = delim = ""; + printk(" [<%08lx>] %s%s%s%s%s+0x%lx/0x%lx\n", + address, reliab, delim, modname, delim, symname, offset, symsize); +#else + printk(" [<%08lx>]\n", address); +#endif +} + static inline int valid_stack_ptr(struct thread_info *tinfo, void *p, unsigned size) { return p > (void *)tinfo && @@ -114,48 +143,35 @@ struct stack_frame { }; static inline unsigned long print_context_stack(struct thread_info *tinfo, - unsigned long *stack, unsigned long ebp, + unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data) { -#ifdef CONFIG_FRAME_POINTER - struct stack_frame *frame = (struct stack_frame *)ebp; - while (valid_stack_ptr(tinfo, frame, sizeof(*frame))) { - struct stack_frame *next; - unsigned long addr; + struct stack_frame *frame = (struct stack_frame *)bp; - addr = frame->return_address; - ops->address(data, addr); - /* - * break out of recursive entries (such as - * end_of_stack_stop_unwind_function). Also, - * we can never allow a frame pointer to - * move downwards! 
- */ - next = frame->next_frame; - if (next <= frame) - break; - frame = next; - } -#else while (valid_stack_ptr(tinfo, stack, sizeof(*stack))) { unsigned long addr; - addr = *stack++; - if (__kernel_text_address(addr)) - ops->address(data, addr); + addr = *stack; + if (__kernel_text_address(addr)) { + if ((unsigned long) stack == bp + 4) { + ops->address(data, addr, 1); + frame = frame->next_frame; + bp = (unsigned long) frame; + } else { + ops->address(data, addr, bp == 0); + } + } + stack++; } -#endif - return ebp; + return bp; } #define MSG(msg) ops->warning(data, msg) void dump_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, + unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data) { - unsigned long ebp = 0; - if (!task) task = current; @@ -163,17 +179,17 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned long dummy; stack = &dummy; if (task != current) - stack = (unsigned long *)task->thread.esp; + stack = (unsigned long *)task->thread.sp; } #ifdef CONFIG_FRAME_POINTER - if (!ebp) { + if (!bp) { if (task == current) { - /* Grab ebp right from our regs */ - asm ("movl %%ebp, %0" : "=r" (ebp) : ); + /* Grab bp right from our regs */ + asm ("movl %%ebp, %0" : "=r" (bp) : ); } else { - /* ebp is the last reg pushed by switch_to */ - ebp = *(unsigned long *) task->thread.esp; + /* bp is the last reg pushed by switch_to */ + bp = *(unsigned long *) task->thread.sp; } } #endif @@ -182,7 +198,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, struct thread_info *context; context = (struct thread_info *) ((unsigned long)stack & (~(THREAD_SIZE - 1))); - ebp = print_context_stack(context, stack, ebp, ops, data); + bp = print_context_stack(context, stack, bp, ops, data); /* Should be after the line below, but somewhere in early boot context comes out corrupted and we can't reference it -AK */ @@ -217,9 +233,11 @@ static int print_trace_stack(void *data, char *name) /* * Print one address/symbol entries per line. */ -static void print_trace_address(void *data, unsigned long addr) +static void print_trace_address(void *data, unsigned long addr, int reliable) { printk("%s [<%08lx>] ", (char *)data, addr); + if (!reliable) + printk("? 
"); print_symbol("%s\n", addr); touch_nmi_watchdog(); } @@ -233,32 +251,32 @@ static const struct stacktrace_ops print_trace_ops = { static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long * stack, char *log_lvl) + unsigned long *stack, unsigned long bp, char *log_lvl) { - dump_trace(task, regs, stack, &print_trace_ops, log_lvl); + dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); printk("%s =======================\n", log_lvl); } void show_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long * stack) + unsigned long *stack, unsigned long bp) { - show_trace_log_lvl(task, regs, stack, ""); + show_trace_log_lvl(task, regs, stack, bp, ""); } static void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *esp, char *log_lvl) + unsigned long *sp, unsigned long bp, char *log_lvl) { unsigned long *stack; int i; - if (esp == NULL) { + if (sp == NULL) { if (task) - esp = (unsigned long*)task->thread.esp; + sp = (unsigned long*)task->thread.sp; else - esp = (unsigned long *)&esp; + sp = (unsigned long *)&sp; } - stack = esp; + stack = sp; for(i = 0; i < kstack_depth_to_print; i++) { if (kstack_end(stack)) break; @@ -267,13 +285,13 @@ static void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, printk("%08lx ", *stack++); } printk("\n%sCall Trace:\n", log_lvl); - show_trace_log_lvl(task, regs, esp, log_lvl); + show_trace_log_lvl(task, regs, sp, bp, log_lvl); } -void show_stack(struct task_struct *task, unsigned long *esp) +void show_stack(struct task_struct *task, unsigned long *sp) { printk(" "); - show_stack_log_lvl(task, NULL, esp, ""); + show_stack_log_lvl(task, NULL, sp, 0, ""); } /* @@ -282,13 +300,19 @@ void show_stack(struct task_struct *task, unsigned long *esp) void dump_stack(void) { unsigned long stack; + unsigned long bp = 0; + +#ifdef CONFIG_FRAME_POINTER + if (!bp) + asm("movl %%ebp, %0" : "=r" (bp):); +#endif printk("Pid: %d, comm: %.20s %s %s %.*s\n", current->pid, current->comm, print_tainted(), init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version); - show_trace(current, NULL, &stack); + show_trace(current, NULL, &stack, bp); } EXPORT_SYMBOL(dump_stack); @@ -307,30 +331,30 @@ void show_registers(struct pt_regs *regs) * time of the fault.. 
*/ if (!user_mode_vm(regs)) { - u8 *eip; + u8 *ip; unsigned int code_prologue = code_bytes * 43 / 64; unsigned int code_len = code_bytes; unsigned char c; printk("\n" KERN_EMERG "Stack: "); - show_stack_log_lvl(NULL, regs, ®s->esp, KERN_EMERG); + show_stack_log_lvl(NULL, regs, ®s->sp, 0, KERN_EMERG); printk(KERN_EMERG "Code: "); - eip = (u8 *)regs->eip - code_prologue; - if (eip < (u8 *)PAGE_OFFSET || - probe_kernel_address(eip, c)) { + ip = (u8 *)regs->ip - code_prologue; + if (ip < (u8 *)PAGE_OFFSET || + probe_kernel_address(ip, c)) { /* try starting at EIP */ - eip = (u8 *)regs->eip; + ip = (u8 *)regs->ip; code_len = code_len - code_prologue + 1; } - for (i = 0; i < code_len; i++, eip++) { - if (eip < (u8 *)PAGE_OFFSET || - probe_kernel_address(eip, c)) { + for (i = 0; i < code_len; i++, ip++) { + if (ip < (u8 *)PAGE_OFFSET || + probe_kernel_address(ip, c)) { printk(" Bad EIP value."); break; } - if (eip == (u8 *)regs->eip) + if (ip == (u8 *)regs->ip) printk("<%02x> ", c); else printk("%02x ", c); @@ -339,18 +363,57 @@ void show_registers(struct pt_regs *regs) printk("\n"); } -int is_valid_bugaddr(unsigned long eip) +int is_valid_bugaddr(unsigned long ip) { unsigned short ud2; - if (eip < PAGE_OFFSET) + if (ip < PAGE_OFFSET) return 0; - if (probe_kernel_address((unsigned short *)eip, ud2)) + if (probe_kernel_address((unsigned short *)ip, ud2)) return 0; return ud2 == 0x0b0f; } +static int die_counter; + +int __kprobes __die(const char * str, struct pt_regs * regs, long err) +{ + unsigned long sp; + unsigned short ss; + + printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); +#ifdef CONFIG_PREEMPT + printk("PREEMPT "); +#endif +#ifdef CONFIG_SMP + printk("SMP "); +#endif +#ifdef CONFIG_DEBUG_PAGEALLOC + printk("DEBUG_PAGEALLOC"); +#endif + printk("\n"); + + if (notify_die(DIE_OOPS, str, regs, err, + current->thread.trap_no, SIGSEGV) != + NOTIFY_STOP) { + show_registers(regs); + /* Executive summary in case the oops scrolled away */ + sp = (unsigned long) (®s->sp); + savesegment(ss, ss); + if (user_mode(regs)) { + sp = regs->sp; + ss = regs->ss & 0xffff; + } + printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); + print_symbol("%s", regs->ip); + printk(" SS:ESP %04x:%08lx\n", ss, sp); + return 0; + } else { + return 1; + } +} + /* * This is gone through when something in the kernel has done something bad and * is about to be terminated. 
@@ -366,7 +429,6 @@ void die(const char * str, struct pt_regs * regs, long err) .lock_owner = -1, .lock_owner_depth = 0 }; - static int die_counter; unsigned long flags; oops_enter(); @@ -382,43 +444,13 @@ void die(const char * str, struct pt_regs * regs, long err) raw_local_irq_save(flags); if (++die.lock_owner_depth < 3) { - unsigned long esp; - unsigned short ss; + report_bug(regs->ip, regs); - report_bug(regs->eip, regs); - - printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, - ++die_counter); -#ifdef CONFIG_PREEMPT - printk("PREEMPT "); -#endif -#ifdef CONFIG_SMP - printk("SMP "); -#endif -#ifdef CONFIG_DEBUG_PAGEALLOC - printk("DEBUG_PAGEALLOC"); -#endif - printk("\n"); - - if (notify_die(DIE_OOPS, str, regs, err, - current->thread.trap_no, SIGSEGV) != - NOTIFY_STOP) { - show_registers(regs); - /* Executive summary in case the oops scrolled away */ - esp = (unsigned long) (®s->esp); - savesegment(ss, ss); - if (user_mode(regs)) { - esp = regs->esp; - ss = regs->xss & 0xffff; - } - printk(KERN_EMERG "EIP: [<%08lx>] ", regs->eip); - print_symbol("%s", regs->eip); - printk(" SS:ESP %04x:%08lx\n", ss, esp); - } - else + if (__die(str, regs, err)) regs = NULL; - } else + } else { printk(KERN_EMERG "Recursive die() failure, output suppressed\n"); + } bust_spinlocks(0); die.lock_owner = -1; @@ -454,7 +486,7 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86, { struct task_struct *tsk = current; - if (regs->eflags & VM_MASK) { + if (regs->flags & VM_MASK) { if (vm86) goto vm86_trap; goto trap_signal; @@ -500,7 +532,7 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86, } #define DO_ERROR(trapnr, signr, str, name) \ -fastcall void do_##name(struct pt_regs * regs, long error_code) \ +void do_##name(struct pt_regs * regs, long error_code) \ { \ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ == NOTIFY_STOP) \ @@ -509,7 +541,7 @@ fastcall void do_##name(struct pt_regs * regs, long error_code) \ } #define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \ -fastcall void do_##name(struct pt_regs * regs, long error_code) \ +void do_##name(struct pt_regs * regs, long error_code) \ { \ siginfo_t info; \ if (irq) \ @@ -525,7 +557,7 @@ fastcall void do_##name(struct pt_regs * regs, long error_code) \ } #define DO_VM86_ERROR(trapnr, signr, str, name) \ -fastcall void do_##name(struct pt_regs * regs, long error_code) \ +void do_##name(struct pt_regs * regs, long error_code) \ { \ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ == NOTIFY_STOP) \ @@ -534,7 +566,7 @@ fastcall void do_##name(struct pt_regs * regs, long error_code) \ } #define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ -fastcall void do_##name(struct pt_regs * regs, long error_code) \ +void do_##name(struct pt_regs * regs, long error_code) \ { \ siginfo_t info; \ info.si_signo = signr; \ @@ -548,13 +580,13 @@ fastcall void do_##name(struct pt_regs * regs, long error_code) \ do_trap(trapnr, signr, str, 1, regs, error_code, &info); \ } -DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->eip) +DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) #ifndef CONFIG_KPROBES DO_VM86_ERROR( 3, SIGTRAP, "int3", int3) #endif DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow) DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds) -DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->eip, 0) +DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0) 
DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) @@ -562,7 +594,7 @@ DO_ERROR(12, SIGBUS, "stack segment", stack_segment) DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0) DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0, 1) -fastcall void __kprobes do_general_protection(struct pt_regs * regs, +void __kprobes do_general_protection(struct pt_regs * regs, long error_code) { int cpu = get_cpu(); @@ -596,7 +628,7 @@ fastcall void __kprobes do_general_protection(struct pt_regs * regs, } put_cpu(); - if (regs->eflags & VM_MASK) + if (regs->flags & VM_MASK) goto gp_in_vm86; if (!user_mode(regs)) @@ -605,11 +637,14 @@ fastcall void __kprobes do_general_protection(struct pt_regs * regs, current->thread.error_code = error_code; current->thread.trap_no = 13; if (show_unhandled_signals && unhandled_signal(current, SIGSEGV) && - printk_ratelimit()) + printk_ratelimit()) { printk(KERN_INFO - "%s[%d] general protection eip:%lx esp:%lx error:%lx\n", + "%s[%d] general protection ip:%lx sp:%lx error:%lx", current->comm, task_pid_nr(current), - regs->eip, regs->esp, error_code); + regs->ip, regs->sp, error_code); + print_vma_addr(" in ", regs->ip); + printk("\n"); + } force_sig(SIGSEGV, current); return; @@ -705,8 +740,8 @@ void __kprobes die_nmi(struct pt_regs *regs, const char *msg) */ bust_spinlocks(1); printk(KERN_EMERG "%s", msg); - printk(" on CPU%d, eip %08lx, registers:\n", - smp_processor_id(), regs->eip); + printk(" on CPU%d, ip %08lx, registers:\n", + smp_processor_id(), regs->ip); show_registers(regs); console_silent(); spin_unlock(&nmi_print_lock); @@ -763,7 +798,7 @@ static __kprobes void default_do_nmi(struct pt_regs * regs) static int ignore_nmis; -fastcall __kprobes void do_nmi(struct pt_regs * regs, long error_code) +__kprobes void do_nmi(struct pt_regs * regs, long error_code) { int cpu; @@ -792,7 +827,7 @@ void restart_nmi(void) } #ifdef CONFIG_KPROBES -fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code) +void __kprobes do_int3(struct pt_regs *regs, long error_code) { trace_hardirqs_fixup(); @@ -828,7 +863,7 @@ fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code) * find every occurrence of the TF bit that could be saved away even * by user code) */ -fastcall void __kprobes do_debug(struct pt_regs * regs, long error_code) +void __kprobes do_debug(struct pt_regs * regs, long error_code) { unsigned int condition; struct task_struct *tsk = current; @@ -837,24 +872,30 @@ fastcall void __kprobes do_debug(struct pt_regs * regs, long error_code) get_debugreg(condition, 6); + /* + * The processor cleared BTF, so don't mark that we need it set. 
+ */ + clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR); + tsk->thread.debugctlmsr = 0; + if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, SIGTRAP) == NOTIFY_STOP) return; /* It's safe to allow irq's after DR6 has been saved */ - if (regs->eflags & X86_EFLAGS_IF) + if (regs->flags & X86_EFLAGS_IF) local_irq_enable(); /* Mask out spurious debug traps due to lazy DR7 setting */ if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { - if (!tsk->thread.debugreg[7]) + if (!tsk->thread.debugreg7) goto clear_dr7; } - if (regs->eflags & VM_MASK) + if (regs->flags & VM_MASK) goto debug_vm86; /* Save debug status register where ptrace can see it */ - tsk->thread.debugreg[6] = condition; + tsk->thread.debugreg6 = condition; /* * Single-stepping through TF: make sure we ignore any events in @@ -886,7 +927,7 @@ debug_vm86: clear_TF_reenable: set_tsk_thread_flag(tsk, TIF_SINGLESTEP); - regs->eflags &= ~TF_MASK; + regs->flags &= ~TF_MASK; return; } @@ -895,7 +936,7 @@ clear_TF_reenable: * the correct behaviour even in the presence of the asynchronous * IRQ13 behaviour */ -void math_error(void __user *eip) +void math_error(void __user *ip) { struct task_struct * task; siginfo_t info; @@ -911,7 +952,7 @@ void math_error(void __user *eip) info.si_signo = SIGFPE; info.si_errno = 0; info.si_code = __SI_FAULT; - info.si_addr = eip; + info.si_addr = ip; /* * (~cwd & swd) will mask out exceptions that are not set to unmasked * status. 0x3f is the exception bits in these regs, 0x200 is the @@ -954,13 +995,13 @@ void math_error(void __user *eip) force_sig_info(SIGFPE, &info, task); } -fastcall void do_coprocessor_error(struct pt_regs * regs, long error_code) +void do_coprocessor_error(struct pt_regs * regs, long error_code) { ignore_fpu_irq = 1; - math_error((void __user *)regs->eip); + math_error((void __user *)regs->ip); } -static void simd_math_error(void __user *eip) +static void simd_math_error(void __user *ip) { struct task_struct * task; siginfo_t info; @@ -976,7 +1017,7 @@ static void simd_math_error(void __user *eip) info.si_signo = SIGFPE; info.si_errno = 0; info.si_code = __SI_FAULT; - info.si_addr = eip; + info.si_addr = ip; /* * The SIMD FPU exceptions are handled a little differently, as there * is only a single status/control register. Thus, to determine which @@ -1008,19 +1049,19 @@ static void simd_math_error(void __user *eip) force_sig_info(SIGFPE, &info, task); } -fastcall void do_simd_coprocessor_error(struct pt_regs * regs, +void do_simd_coprocessor_error(struct pt_regs * regs, long error_code) { if (cpu_has_xmm) { /* Handle SIMD FPU exceptions on PIII+ processors. */ ignore_fpu_irq = 1; - simd_math_error((void __user *)regs->eip); + simd_math_error((void __user *)regs->ip); } else { /* * Handle strange cache flush from user space exception * in all other cases. This is undocumented behaviour. 
*/ - if (regs->eflags & VM_MASK) { + if (regs->flags & VM_MASK) { handle_vm86_fault((struct kernel_vm86_regs *)regs, error_code); return; @@ -1032,7 +1073,7 @@ fastcall void do_simd_coprocessor_error(struct pt_regs * regs, } } -fastcall void do_spurious_interrupt_bug(struct pt_regs * regs, +void do_spurious_interrupt_bug(struct pt_regs * regs, long error_code) { #if 0 @@ -1041,7 +1082,7 @@ fastcall void do_spurious_interrupt_bug(struct pt_regs * regs, #endif } -fastcall unsigned long patch_espfix_desc(unsigned long uesp, +unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp) { struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt; @@ -1095,51 +1136,17 @@ asmlinkage void math_emulate(long arg) #endif /* CONFIG_MATH_EMULATION */ -/* - * This needs to use 'idt_table' rather than 'idt', and - * thus use the _nonmapped_ version of the IDT, as the - * Pentium F0 0F bugfix can have resulted in the mapped - * IDT being write-protected. - */ -void set_intr_gate(unsigned int n, void *addr) -{ - _set_gate(n, DESCTYPE_INT, addr, __KERNEL_CS); -} - -/* - * This routine sets up an interrupt gate at directory privilege level 3. - */ -static inline void set_system_intr_gate(unsigned int n, void *addr) -{ - _set_gate(n, DESCTYPE_INT | DESCTYPE_DPL3, addr, __KERNEL_CS); -} - -static void __init set_trap_gate(unsigned int n, void *addr) -{ - _set_gate(n, DESCTYPE_TRAP, addr, __KERNEL_CS); -} - -static void __init set_system_gate(unsigned int n, void *addr) -{ - _set_gate(n, DESCTYPE_TRAP | DESCTYPE_DPL3, addr, __KERNEL_CS); -} - -static void __init set_task_gate(unsigned int n, unsigned int gdt_entry) -{ - _set_gate(n, DESCTYPE_TASK, (void *)0, (gdt_entry<<3)); -} - void __init trap_init(void) { int i; #ifdef CONFIG_EISA - void __iomem *p = ioremap(0x0FFFD9, 4); + void __iomem *p = early_ioremap(0x0FFFD9, 4); if (readl(p) == 'E'+('I'<<8)+('S'<<16)+('A'<<24)) { EISA_bus = 1; } - iounmap(p); + early_iounmap(p, 4); #endif #ifdef CONFIG_X86_LOCAL_APIC diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index cc68b92316cd..efc66df728b6 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -74,22 +74,24 @@ asmlinkage void alignment_check(void); asmlinkage void machine_check(void); asmlinkage void spurious_interrupt_bug(void); +static unsigned int code_bytes = 64; + static inline void conditional_sti(struct pt_regs *regs) { - if (regs->eflags & X86_EFLAGS_IF) + if (regs->flags & X86_EFLAGS_IF) local_irq_enable(); } static inline void preempt_conditional_sti(struct pt_regs *regs) { preempt_disable(); - if (regs->eflags & X86_EFLAGS_IF) + if (regs->flags & X86_EFLAGS_IF) local_irq_enable(); } static inline void preempt_conditional_cli(struct pt_regs *regs) { - if (regs->eflags & X86_EFLAGS_IF) + if (regs->flags & X86_EFLAGS_IF) local_irq_disable(); /* Make sure to not schedule here because we could be running on an exception stack. 
*/ @@ -98,14 +100,15 @@ static inline void preempt_conditional_cli(struct pt_regs *regs) int kstack_depth_to_print = 12; -#ifdef CONFIG_KALLSYMS -void printk_address(unsigned long address) +void printk_address(unsigned long address, int reliable) { +#ifdef CONFIG_KALLSYMS unsigned long offset = 0, symsize; const char *symname; char *modname; char *delim = ":"; - char namebuf[128]; + char namebuf[KSYM_NAME_LEN]; + char reliab[4] = ""; symname = kallsyms_lookup(address, &symsize, &offset, &modname, namebuf); @@ -113,17 +116,17 @@ void printk_address(unsigned long address) printk(" [<%016lx>]\n", address); return; } + if (!reliable) + strcpy(reliab, "? "); + if (!modname) - modname = delim = ""; - printk(" [<%016lx>] %s%s%s%s+0x%lx/0x%lx\n", - address, delim, modname, delim, symname, offset, symsize); -} + modname = delim = ""; + printk(" [<%016lx>] %s%s%s%s%s+0x%lx/0x%lx\n", + address, reliab, delim, modname, delim, symname, offset, symsize); #else -void printk_address(unsigned long address) -{ printk(" [<%016lx>]\n", address); -} #endif +} static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, unsigned *usedp, char **idp) @@ -208,14 +211,53 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack */ -static inline int valid_stack_ptr(struct thread_info *tinfo, void *p) +static inline int valid_stack_ptr(struct thread_info *tinfo, + void *p, unsigned int size, void *end) +{ + void *t = tinfo; + if (end) { + if (p < end && p >= (end-THREAD_SIZE)) + return 1; + else + return 0; + } + return p > t && p < t + THREAD_SIZE - size; +} + +/* The form of the top of the frame on the stack */ +struct stack_frame { + struct stack_frame *next_frame; + unsigned long return_address; +}; + + +static inline unsigned long print_context_stack(struct thread_info *tinfo, + unsigned long *stack, unsigned long bp, + const struct stacktrace_ops *ops, void *data, + unsigned long *end) { - void *t = (void *)tinfo; - return p > t && p < t + THREAD_SIZE - 3; + struct stack_frame *frame = (struct stack_frame *)bp; + + while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) { + unsigned long addr; + + addr = *stack; + if (__kernel_text_address(addr)) { + if ((unsigned long) stack == bp + 8) { + ops->address(data, addr, 1); + frame = frame->next_frame; + bp = (unsigned long) frame; + } else { + ops->address(data, addr, bp == 0); + } + } + stack++; + } + return bp; } void dump_trace(struct task_struct *tsk, struct pt_regs *regs, - unsigned long *stack, + unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data) { const unsigned cpu = get_cpu(); @@ -225,36 +267,28 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs, if (!tsk) tsk = current; + tinfo = task_thread_info(tsk); if (!stack) { unsigned long dummy; stack = &dummy; if (tsk && tsk != current) - stack = (unsigned long *)tsk->thread.rsp; + stack = (unsigned long *)tsk->thread.sp; } - /* - * Print function call entries within a stack. 'cond' is the - * "end of stackframe" condition, that the 'stack++' - * iteration will eventually trigger. 
- */ -#define HANDLE_STACK(cond) \ - do while (cond) { \ - unsigned long addr = *stack++; \ - /* Use unlocked access here because except for NMIs \ - we should be already protected against module unloads */ \ - if (__kernel_text_address(addr)) { \ - /* \ - * If the address is either in the text segment of the \ - * kernel, or in the region which contains vmalloc'ed \ - * memory, it *may* be the address of a calling \ - * routine; if so, print it so that someone tracing \ - * down the cause of the crash will be able to figure \ - * out the call path that was taken. \ - */ \ - ops->address(data, addr); \ - } \ - } while (0) +#ifdef CONFIG_FRAME_POINTER + if (!bp) { + if (tsk == current) { + /* Grab bp right from our regs */ + asm("movq %%rbp, %0" : "=r" (bp):); + } else { + /* bp is the last reg pushed by switch_to */ + bp = *(unsigned long *) tsk->thread.sp; + } + } +#endif + + /* * Print function call entries in all stacks, starting at the @@ -270,7 +304,9 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs, if (estack_end) { if (ops->stack(data, id) < 0) break; - HANDLE_STACK (stack < estack_end); + + bp = print_context_stack(tinfo, stack, bp, ops, + data, estack_end); ops->stack(data, "<EOE>"); /* * We link to the next stack via the @@ -288,7 +324,8 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs, if (stack >= irqstack && stack < irqstack_end) { if (ops->stack(data, "IRQ") < 0) break; - HANDLE_STACK (stack < irqstack_end); + bp = print_context_stack(tinfo, stack, bp, + ops, data, irqstack_end); /* * We link to the next stack (which would be * the process stack normally) the last @@ -306,9 +343,7 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs, /* * This handles the process stack: */ - tinfo = task_thread_info(tsk); - HANDLE_STACK (valid_stack_ptr(tinfo, stack)); -#undef HANDLE_STACK + bp = print_context_stack(tinfo, stack, bp, ops, data, NULL); put_cpu(); } EXPORT_SYMBOL(dump_trace); @@ -331,10 +366,10 @@ static int print_trace_stack(void *data, char *name) return 0; } -static void print_trace_address(void *data, unsigned long addr) +static void print_trace_address(void *data, unsigned long addr, int reliable) { touch_nmi_watchdog(); - printk_address(addr); + printk_address(addr, reliable); } static const struct stacktrace_ops print_trace_ops = { @@ -345,15 +380,17 @@ static const struct stacktrace_ops print_trace_ops = { }; void -show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack) +show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack, + unsigned long bp) { printk("\nCall Trace:\n"); - dump_trace(tsk, regs, stack, &print_trace_ops, NULL); + dump_trace(tsk, regs, stack, bp, &print_trace_ops, NULL); printk("\n"); } static void -_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *rsp) +_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *sp, + unsigned long bp) { unsigned long *stack; int i; @@ -364,14 +401,14 @@ _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *rsp) // debugging aid: "show_stack(NULL, NULL);" prints the // back trace for this cpu. 
- if (rsp == NULL) { + if (sp == NULL) { if (tsk) - rsp = (unsigned long *)tsk->thread.rsp; + sp = (unsigned long *)tsk->thread.sp; else - rsp = (unsigned long *)&rsp; + sp = (unsigned long *)&sp; } - stack = rsp; + stack = sp; for(i=0; i < kstack_depth_to_print; i++) { if (stack >= irqstack && stack <= irqstack_end) { if (stack == irqstack_end) { @@ -387,12 +424,12 @@ _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *rsp) printk(" %016lx", *stack++); touch_nmi_watchdog(); } - show_trace(tsk, regs, rsp); + show_trace(tsk, regs, sp, bp); } -void show_stack(struct task_struct *tsk, unsigned long * rsp) +void show_stack(struct task_struct *tsk, unsigned long * sp) { - _show_stack(tsk, NULL, rsp); + _show_stack(tsk, NULL, sp, 0); } /* @@ -401,13 +438,19 @@ void show_stack(struct task_struct *tsk, unsigned long * rsp) void dump_stack(void) { unsigned long dummy; + unsigned long bp = 0; + +#ifdef CONFIG_FRAME_POINTER + if (!bp) + asm("movq %%rbp, %0" : "=r" (bp):); +#endif printk("Pid: %d, comm: %.20s %s %s %.*s\n", current->pid, current->comm, print_tainted(), init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version); - show_trace(NULL, NULL, &dummy); + show_trace(NULL, NULL, &dummy, bp); } EXPORT_SYMBOL(dump_stack); @@ -415,12 +458,15 @@ EXPORT_SYMBOL(dump_stack); void show_registers(struct pt_regs *regs) { int i; - int in_kernel = !user_mode(regs); - unsigned long rsp; + unsigned long sp; const int cpu = smp_processor_id(); struct task_struct *cur = cpu_pda(cpu)->pcurrent; + u8 *ip; + unsigned int code_prologue = code_bytes * 43 / 64; + unsigned int code_len = code_bytes; - rsp = regs->rsp; + sp = regs->sp; + ip = (u8 *) regs->ip - code_prologue; printk("CPU %d ", cpu); __show_regs(regs); printk("Process %s (pid: %d, threadinfo %p, task %p)\n", @@ -430,45 +476,43 @@ void show_registers(struct pt_regs *regs) * When in-kernel, we also print out the stack and code at the * time of the fault.. 
*/ - if (in_kernel) { + if (!user_mode(regs)) { + unsigned char c; printk("Stack: "); - _show_stack(NULL, regs, (unsigned long*)rsp); - - printk("\nCode: "); - if (regs->rip < PAGE_OFFSET) - goto bad; - - for (i=0; i<20; i++) { - unsigned char c; - if (__get_user(c, &((unsigned char*)regs->rip)[i])) { -bad: + _show_stack(NULL, regs, (unsigned long *)sp, regs->bp); + printk("\n"); + + printk(KERN_EMERG "Code: "); + if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { + /* try starting at RIP */ + ip = (u8 *) regs->ip; + code_len = code_len - code_prologue + 1; + } + for (i = 0; i < code_len; i++, ip++) { + if (ip < (u8 *)PAGE_OFFSET || + probe_kernel_address(ip, c)) { printk(" Bad RIP value."); break; } - printk("%02x ", c); + if (ip == (u8 *)regs->ip) + printk("<%02x> ", c); + else + printk("%02x ", c); } } printk("\n"); } -int is_valid_bugaddr(unsigned long rip) +int is_valid_bugaddr(unsigned long ip) { unsigned short ud2; - if (__copy_from_user(&ud2, (const void __user *) rip, sizeof(ud2))) + if (__copy_from_user(&ud2, (const void __user *) ip, sizeof(ud2))) return 0; return ud2 == 0x0b0f; } -#ifdef CONFIG_BUG -void out_of_line_bug(void) -{ - BUG(); -} -EXPORT_SYMBOL(out_of_line_bug); -#endif - static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; static int die_owner = -1; static unsigned int die_nest_count; @@ -496,7 +540,7 @@ unsigned __kprobes long oops_begin(void) return flags; } -void __kprobes oops_end(unsigned long flags) +void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) { die_owner = -1; bust_spinlocks(0); @@ -505,12 +549,17 @@ void __kprobes oops_end(unsigned long flags) /* Nest count reaches zero, release the lock. */ __raw_spin_unlock(&die_lock); raw_local_irq_restore(flags); + if (!regs) { + oops_exit(); + return; + } if (panic_on_oops) panic("Fatal exception"); oops_exit(); + do_exit(signr); } -void __kprobes __die(const char * str, struct pt_regs * regs, long err) +int __kprobes __die(const char * str, struct pt_regs * regs, long err) { static int die_counter; printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter); @@ -524,15 +573,17 @@ void __kprobes __die(const char * str, struct pt_regs * regs, long err) printk("DEBUG_PAGEALLOC"); #endif printk("\n"); - notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV); + if (notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) + return 1; show_registers(regs); add_taint(TAINT_DIE); /* Executive summary in case the oops scrolled away */ printk(KERN_ALERT "RIP "); - printk_address(regs->rip); - printk(" RSP <%016lx>\n", regs->rsp); + printk_address(regs->ip, 1); + printk(" RSP <%016lx>\n", regs->sp); if (kexec_should_crash(current)) crash_kexec(regs); + return 0; } void die(const char * str, struct pt_regs * regs, long err) @@ -540,11 +591,11 @@ void die(const char * str, struct pt_regs * regs, long err) unsigned long flags = oops_begin(); if (!user_mode(regs)) - report_bug(regs->rip, regs); + report_bug(regs->ip, regs); - __die(str, regs, err); - oops_end(flags); - do_exit(SIGSEGV); + if (__die(str, regs, err)) + regs = NULL; + oops_end(flags, regs, SIGSEGV); } void __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic) @@ -561,10 +612,10 @@ void __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic) crash_kexec(regs); if (do_panic || panic_on_oops) panic("Non maskable interrupt"); - oops_end(flags); + oops_end(flags, NULL, SIGBUS); nmi_exit(); local_irq_enable(); - do_exit(SIGSEGV); + do_exit(SIGBUS); } 
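The hunks that follow replace the open-coded exception-table walks in do_trap(), do_general_protection() and kernel_math_error() with a single fixup_exception() call. A minimal sketch of what such a helper boils down to on x86 at this point in the series (simplified and illustrative only; it is not part of this diff, and it assumes the usual kernel context, with search_exception_tables() declared via <linux/module.h> in kernels of this period):

#include <linux/module.h>	/* search_exception_tables() */
#include <asm/ptrace.h>		/* struct pt_regs, now with the unified ->ip field */

/*
 * Illustrative sketch only, not part of this patch.  Each exception table
 * entry pairs the address of a faulting instruction with the address of
 * its fixup stub, roughly:
 *
 *	struct exception_table_entry { unsigned long insn, fixup; };
 */
int fixup_exception(struct pt_regs *regs)
{
	const struct exception_table_entry *fixup;

	fixup = search_exception_tables(regs->ip);
	if (fixup) {
		regs->ip = fixup->fixup;	/* resume at the fixup stub */
		return 1;
	}
	return 0;				/* no entry: caller falls through to die() */
}

With that helper in place, the kernel-mode error paths reduce to "if (!fixup_exception(regs)) die(...)", the shape all three functions converge on below.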
static void __kprobes do_trap(int trapnr, int signr, char *str, @@ -588,11 +639,14 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, tsk->thread.trap_no = trapnr; if (show_unhandled_signals && unhandled_signal(tsk, signr) && - printk_ratelimit()) + printk_ratelimit()) { printk(KERN_INFO - "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n", + "%s[%d] trap %s ip:%lx sp:%lx error:%lx", tsk->comm, tsk->pid, str, - regs->rip, regs->rsp, error_code); + regs->ip, regs->sp, error_code); + print_vma_addr(" in ", regs->ip); + printk("\n"); + } if (info) force_sig_info(signr, info, tsk); @@ -602,19 +656,12 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, } - /* kernel trap */ - { - const struct exception_table_entry *fixup; - fixup = search_exception_tables(regs->rip); - if (fixup) - regs->rip = fixup->fixup; - else { - tsk->thread.error_code = error_code; - tsk->thread.trap_no = trapnr; - die(str, regs, error_code); - } - return; + if (!fixup_exception(regs)) { + tsk->thread.error_code = error_code; + tsk->thread.trap_no = trapnr; + die(str, regs, error_code); } + return; } #define DO_ERROR(trapnr, signr, str, name) \ @@ -643,10 +690,10 @@ asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ do_trap(trapnr, signr, str, regs, error_code, &info); \ } -DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->rip) +DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) DO_ERROR( 4, SIGSEGV, "overflow", overflow) DO_ERROR( 5, SIGSEGV, "bounds", bounds) -DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->rip) +DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip) DO_ERROR( 7, SIGSEGV, "device not available", device_not_available) DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) @@ -694,32 +741,28 @@ asmlinkage void __kprobes do_general_protection(struct pt_regs * regs, tsk->thread.trap_no = 13; if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && - printk_ratelimit()) + printk_ratelimit()) { printk(KERN_INFO - "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n", + "%s[%d] general protection ip:%lx sp:%lx error:%lx", tsk->comm, tsk->pid, - regs->rip, regs->rsp, error_code); + regs->ip, regs->sp, error_code); + print_vma_addr(" in ", regs->ip); + printk("\n"); + } force_sig(SIGSEGV, tsk); return; } - /* kernel gp */ - { - const struct exception_table_entry *fixup; - fixup = search_exception_tables(regs->rip); - if (fixup) { - regs->rip = fixup->fixup; - return; - } + if (fixup_exception(regs)) + return; - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 13; - if (notify_die(DIE_GPF, "general protection fault", regs, - error_code, 13, SIGSEGV) == NOTIFY_STOP) - return; - die("general protection fault", regs, error_code); - } + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 13; + if (notify_die(DIE_GPF, "general protection fault", regs, + error_code, 13, SIGSEGV) == NOTIFY_STOP) + return; + die("general protection fault", regs, error_code); } static __kprobes void @@ -832,15 +875,15 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) { struct pt_regs *regs = eregs; /* Did already sync */ - if (eregs == (struct pt_regs *)eregs->rsp) + if (eregs == (struct pt_regs *)eregs->sp) ; /* Exception from user space */ else if (user_mode(eregs)) regs = task_pt_regs(current); /* Exception from kernel and interrupts are enabled. 
Move to kernel process stack. */ - else if (eregs->eflags & X86_EFLAGS_IF) - regs = (struct pt_regs *)(eregs->rsp -= sizeof(struct pt_regs)); + else if (eregs->flags & X86_EFLAGS_IF) + regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs)); if (eregs != regs) *regs = *eregs; return regs; @@ -858,6 +901,12 @@ asmlinkage void __kprobes do_debug(struct pt_regs * regs, get_debugreg(condition, 6); + /* + * The processor cleared BTF, so don't mark that we need it set. + */ + clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR); + tsk->thread.debugctlmsr = 0; + if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, SIGTRAP) == NOTIFY_STOP) return; @@ -873,27 +922,14 @@ asmlinkage void __kprobes do_debug(struct pt_regs * regs, tsk->thread.debugreg6 = condition; - /* Mask out spurious TF errors due to lazy TF clearing */ + + /* + * Single-stepping through TF: make sure we ignore any events in + * kernel space (but re-enable TF when returning to user mode). + */ if (condition & DR_STEP) { - /* - * The TF error should be masked out only if the current - * process is not traced and if the TRAP flag has been set - * previously by a tracing process (condition detected by - * the PT_DTRACE flag); remember that the i386 TRAP flag - * can be modified by the process itself in user mode, - * allowing programs to debug themselves without the ptrace() - * interface. - */ if (!user_mode(regs)) goto clear_TF_reenable; - /* - * Was the TF flag set by a debugger? If so, clear it now, - * so that register information is correct. - */ - if (tsk->ptrace & PT_DTRACE) { - regs->eflags &= ~TF_MASK; - tsk->ptrace &= ~PT_DTRACE; - } } /* Ok, finally something we can handle */ @@ -902,7 +938,7 @@ asmlinkage void __kprobes do_debug(struct pt_regs * regs, info.si_signo = SIGTRAP; info.si_errno = 0; info.si_code = TRAP_BRKPT; - info.si_addr = user_mode(regs) ? (void __user *)regs->rip : NULL; + info.si_addr = user_mode(regs) ? (void __user *)regs->ip : NULL; force_sig_info(SIGTRAP, &info, tsk); clear_dr7: @@ -912,18 +948,15 @@ clear_dr7: clear_TF_reenable: set_tsk_thread_flag(tsk, TIF_SINGLESTEP); - regs->eflags &= ~TF_MASK; + regs->flags &= ~X86_EFLAGS_TF; preempt_conditional_cli(regs); } static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr) { - const struct exception_table_entry *fixup; - fixup = search_exception_tables(regs->rip); - if (fixup) { - regs->rip = fixup->fixup; + if (fixup_exception(regs)) return 1; - } + notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE); /* Illegal floating point operation in the kernel */ current->thread.trap_no = trapnr; @@ -938,7 +971,7 @@ static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr) */ asmlinkage void do_coprocessor_error(struct pt_regs *regs) { - void __user *rip = (void __user *)(regs->rip); + void __user *ip = (void __user *)(regs->ip); struct task_struct * task; siginfo_t info; unsigned short cwd, swd; @@ -958,7 +991,7 @@ asmlinkage void do_coprocessor_error(struct pt_regs *regs) info.si_signo = SIGFPE; info.si_errno = 0; info.si_code = __SI_FAULT; - info.si_addr = rip; + info.si_addr = ip; /* * (~cwd & swd) will mask out exceptions that are not set to unmasked * status. 
0x3f is the exception bits in these regs, 0x200 is the @@ -1007,7 +1040,7 @@ asmlinkage void bad_intr(void) asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs) { - void __user *rip = (void __user *)(regs->rip); + void __user *ip = (void __user *)(regs->ip); struct task_struct * task; siginfo_t info; unsigned short mxcsr; @@ -1027,7 +1060,7 @@ asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs) info.si_signo = SIGFPE; info.si_errno = 0; info.si_code = __SI_FAULT; - info.si_addr = rip; + info.si_addr = ip; /* * The SIMD FPU exceptions are handled a little differently, as there * is only a single status/control register. Thus, to determine which @@ -1089,6 +1122,7 @@ asmlinkage void math_state_restore(void) task_thread_info(me)->status |= TS_USEDFPU; me->fpu_counter++; } +EXPORT_SYMBOL_GPL(math_state_restore); void __init trap_init(void) { @@ -1144,3 +1178,14 @@ static int __init kstack_setup(char *s) return 0; } early_param("kstack", kstack_setup); + + +static int __init code_bytes_setup(char *s) +{ + code_bytes = simple_strtoul(s, NULL, 0); + if (code_bytes > 8192) + code_bytes = 8192; + + return 1; +} +__setup("code_bytes=", code_bytes_setup); diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c index 9ebc0dab66b4..43517e324be8 100644 --- a/arch/x86/kernel/tsc_32.c +++ b/arch/x86/kernel/tsc_32.c @@ -5,6 +5,7 @@ #include <linux/jiffies.h> #include <linux/init.h> #include <linux/dmi.h> +#include <linux/percpu.h> #include <asm/delay.h> #include <asm/tsc.h> @@ -23,8 +24,6 @@ static int tsc_enabled; unsigned int tsc_khz; EXPORT_SYMBOL_GPL(tsc_khz); -int tsc_disable; - #ifdef CONFIG_X86_TSC static int __init tsc_setup(char *str) { @@ -39,8 +38,7 @@ static int __init tsc_setup(char *str) */ static int __init tsc_setup(char *str) { - tsc_disable = 1; - + setup_clear_cpu_cap(X86_FEATURE_TSC); return 1; } #endif @@ -80,13 +78,31 @@ EXPORT_SYMBOL_GPL(check_tsc_unstable); * * -johnstul@us.ibm.com "math is hard, lets go shopping!" */ -unsigned long cyc2ns_scale __read_mostly; -#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ +DEFINE_PER_CPU(unsigned long, cyc2ns); -static inline void set_cyc2ns_scale(unsigned long cpu_khz) +static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) { - cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz; + unsigned long flags, prev_scale, *scale; + unsigned long long tsc_now, ns_now; + + local_irq_save(flags); + sched_clock_idle_sleep_event(); + + scale = &per_cpu(cyc2ns, cpu); + + rdtscll(tsc_now); + ns_now = __cycles_2_ns(tsc_now); + + prev_scale = *scale; + if (cpu_khz) + *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz; + + /* + * Start smoothly with the new frequency: + */ + sched_clock_idle_wakeup_event(0); + local_irq_restore(flags); } /* @@ -239,7 +255,9 @@ time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) ref_freq, freq->new); if (!(freq->flags & CPUFREQ_CONST_LOOPS)) { tsc_khz = cpu_khz; - set_cyc2ns_scale(cpu_khz); + preempt_disable(); + set_cyc2ns_scale(cpu_khz, smp_processor_id()); + preempt_enable(); /* * TSC based sched_clock turns * to junk w/ cpufreq @@ -333,6 +351,11 @@ __cpuinit int unsynchronized_tsc(void) { if (!cpu_has_tsc || tsc_unstable) return 1; + + /* Anything with constant TSC should be synchronized */ + if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) + return 0; + /* * Intel systems are normally all synchronized. 
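Note: the trap, general-protection and kernel-math-error paths in the 64-bit traps code above drop their open-coded walks of the exception table in favour of a shared fixup_exception() helper. The underlying mechanism: every instruction that may legitimately fault (user-copy accessors and the like) has a table entry pairing its address with a recovery address, and the fault handler redirects regs->ip to that fixup instead of dying. A self-contained sketch of the lookup, using a small hand-built sorted array in place of the linker-generated __ex_table section:

/*
 * Sketch of exception-table fixup: map a faulting "IP" to a fixup "IP".
 * The real table lives in __ex_table and is sorted for binary search;
 * here it is a small pre-sorted array with made-up addresses.
 */
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

struct exception_table_entry {
	uintptr_t insn;		/* address of the faulting instruction */
	uintptr_t fixup;	/* where to resume instead of oopsing */
};

static const struct exception_table_entry ex_table[] = {
	{ 0x1000, 0x5000 },
	{ 0x1040, 0x5020 },
	{ 0x2000, 0x5100 },
};

static const struct exception_table_entry *
search_exception_tables(uintptr_t ip)
{
	size_t lo = 0, hi = sizeof(ex_table) / sizeof(ex_table[0]);

	while (lo < hi) {		/* binary search over the sorted table */
		size_t mid = (lo + hi) / 2;

		if (ex_table[mid].insn == ip)
			return &ex_table[mid];
		if (ex_table[mid].insn < ip)
			lo = mid + 1;
		else
			hi = mid;
	}
	return NULL;
}

/* Returns 1 and rewrites *ip if a fixup exists, 0 if the fault is fatal. */
static int fixup_exception(uintptr_t *ip)
{
	const struct exception_table_entry *e = search_exception_tables(*ip);

	if (!e)
		return 0;
	*ip = e->fixup;
	return 1;
}

int main(void)
{
	uintptr_t ip = 0x1040;

	if (fixup_exception(&ip))
		printf("resuming at %#lx\n", (unsigned long)ip);
	else
		printf("no fixup: die()\n");
	return 0;
}

When no entry matches, the rewritten do_trap() and do_general_protection() fall through to die(), exactly as before.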
* Exceptions must mark TSC as unstable: @@ -367,7 +390,9 @@ static inline void check_geode_tsc_reliable(void) { } void __init tsc_init(void) { - if (!cpu_has_tsc || tsc_disable) + int cpu; + + if (!cpu_has_tsc) goto out_no_tsc; cpu_khz = calculate_cpu_khz(); @@ -380,7 +405,15 @@ void __init tsc_init(void) (unsigned long)cpu_khz / 1000, (unsigned long)cpu_khz % 1000); - set_cyc2ns_scale(cpu_khz); + /* + * Secondary CPUs do not run through tsc_init(), so set up + * all the scale factors for all CPUs, assuming the same + * speed as the bootup CPU. (cpufreq notifiers will fix this + * up if their speed diverges) + */ + for_each_possible_cpu(cpu) + set_cyc2ns_scale(cpu_khz, cpu); + use_tsc_delay(); /* Check and install the TSC clocksource */ @@ -403,10 +436,5 @@ void __init tsc_init(void) return; out_no_tsc: - /* - * Set the tsc_disable flag if there's no TSC support, this - * makes it a fast flag for the kernel to see whether it - * should be using the TSC. - */ - tsc_disable = 1; + setup_clear_cpu_cap(X86_FEATURE_TSC); } diff --git a/arch/x86/kernel/tsc_64.c b/arch/x86/kernel/tsc_64.c index 9c70af45b42b..947554ddabb6 100644 --- a/arch/x86/kernel/tsc_64.c +++ b/arch/x86/kernel/tsc_64.c @@ -10,6 +10,7 @@ #include <asm/hpet.h> #include <asm/timex.h> +#include <asm/timer.h> static int notsc __initdata = 0; @@ -18,19 +19,51 @@ EXPORT_SYMBOL(cpu_khz); unsigned int tsc_khz; EXPORT_SYMBOL(tsc_khz); -static unsigned int cyc2ns_scale __read_mostly; +/* Accelerators for sched_clock() + * convert from cycles(64bits) => nanoseconds (64bits) + * basic equation: + * ns = cycles / (freq / ns_per_sec) + * ns = cycles * (ns_per_sec / freq) + * ns = cycles * (10^9 / (cpu_khz * 10^3)) + * ns = cycles * (10^6 / cpu_khz) + * + * Then we use scaling math (suggested by george@mvista.com) to get: + * ns = cycles * (10^6 * SC / cpu_khz) / SC + * ns = cycles * cyc2ns_scale / SC + * + * And since SC is a constant power of two, we can convert the div + * into a shift. + * + * We can use khz divisor instead of mhz to keep a better precision, since + * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. + * (mathieu.desnoyers@polymtl.ca) + * + * -johnstul@us.ibm.com "math is hard, lets go shopping!" 
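Note: both tsc_32.c and tsc_64.c now keep the cycles-to-nanoseconds conversion factor per CPU, and the comments above spell out the fixed-point math: scale = (10^6 << CYC2NS_SCALE_FACTOR) / cpu_khz and ns = (cycles * scale) >> CYC2NS_SCALE_FACTOR, with a shift of 10. A short user-space check of that arithmetic; the 2.4 GHz figure is only a sample value:

/*
 * Sketch of the cyc2ns fixed-point math used by sched_clock():
 *   scale = (NSEC_PER_MSEC << 10) / cpu_khz
 *   ns    = (cycles * scale) >> 10
 */
#include <stdio.h>
#include <stdint.h>

#define CYC2NS_SCALE_FACTOR	10
#define NSEC_PER_MSEC		1000000ULL

static unsigned long cyc2ns_scale(unsigned long cpu_khz)
{
	return (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR) / cpu_khz;
}

static uint64_t cycles_2_ns(uint64_t cyc, unsigned long scale)
{
	return (cyc * scale) >> CYC2NS_SCALE_FACTOR;
}

int main(void)
{
	unsigned long khz = 2400000;		/* sample 2.4 GHz CPU */
	unsigned long scale = cyc2ns_scale(khz);

	/* 2.4e9 cycles should come out close to one second; the integer
	 * truncation of the scale costs a fraction of a percent here */
	printf("scale=%lu, 2.4e9 cycles = %llu ns\n",
	       scale, (unsigned long long)cycles_2_ns(2400000000ULL, scale));
	return 0;
}

Keeping one scale per CPU is what lets the cpufreq notifier retune only the CPU whose frequency changed; set_cyc2ns_scale() therefore takes a cpu argument and tsc_init() seeds every possible CPU with the boot CPU's value.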
+ */ +DEFINE_PER_CPU(unsigned long, cyc2ns); -static inline void set_cyc2ns_scale(unsigned long khz) +static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) { - cyc2ns_scale = (NSEC_PER_MSEC << NS_SCALE) / khz; -} + unsigned long flags, prev_scale, *scale; + unsigned long long tsc_now, ns_now; -static unsigned long long cycles_2_ns(unsigned long long cyc) -{ - return (cyc * cyc2ns_scale) >> NS_SCALE; + local_irq_save(flags); + sched_clock_idle_sleep_event(); + + scale = &per_cpu(cyc2ns, cpu); + + rdtscll(tsc_now); + ns_now = __cycles_2_ns(tsc_now); + + prev_scale = *scale; + if (cpu_khz) + *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz; + + sched_clock_idle_wakeup_event(0); + local_irq_restore(flags); } -unsigned long long sched_clock(void) +unsigned long long native_sched_clock(void) { unsigned long a = 0; @@ -44,12 +77,27 @@ unsigned long long sched_clock(void) return cycles_2_ns(a); } +/* We need to define a real function for sched_clock, to override the + weak default version */ +#ifdef CONFIG_PARAVIRT +unsigned long long sched_clock(void) +{ + return paravirt_sched_clock(); +} +#else +unsigned long long +sched_clock(void) __attribute__((alias("native_sched_clock"))); +#endif + + static int tsc_unstable; -inline int check_tsc_unstable(void) +int check_tsc_unstable(void) { return tsc_unstable; } +EXPORT_SYMBOL_GPL(check_tsc_unstable); + #ifdef CONFIG_CPU_FREQ /* Frequency scaling support. Adjust the TSC based timer when the cpu frequency @@ -100,7 +148,9 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, mark_tsc_unstable("cpufreq changes"); } - set_cyc2ns_scale(tsc_khz_ref); + preempt_disable(); + set_cyc2ns_scale(tsc_khz_ref, smp_processor_id()); + preempt_enable(); return 0; } @@ -133,12 +183,12 @@ static unsigned long __init tsc_read_refs(unsigned long *pm, int i; for (i = 0; i < MAX_RETRIES; i++) { - t1 = get_cycles_sync(); + t1 = get_cycles(); if (hpet) *hpet = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF; else *pm = acpi_pm_read_early(); - t2 = get_cycles_sync(); + t2 = get_cycles(); if ((t2 - t1) < SMI_TRESHOLD) return t2; } @@ -151,7 +201,7 @@ static unsigned long __init tsc_read_refs(unsigned long *pm, void __init tsc_calibrate(void) { unsigned long flags, tsc1, tsc2, tr1, tr2, pm1, pm2, hpet1, hpet2; - int hpet = is_hpet_enabled(); + int hpet = is_hpet_enabled(), cpu; local_irq_save(flags); @@ -162,9 +212,9 @@ void __init tsc_calibrate(void) outb(0xb0, 0x43); outb((CLOCK_TICK_RATE / (1000 / 50)) & 0xff, 0x42); outb((CLOCK_TICK_RATE / (1000 / 50)) >> 8, 0x42); - tr1 = get_cycles_sync(); + tr1 = get_cycles(); while ((inb(0x61) & 0x20) == 0); - tr2 = get_cycles_sync(); + tr2 = get_cycles(); tsc2 = tsc_read_refs(&pm2, hpet ? 
&hpet2 : NULL); @@ -206,7 +256,9 @@ void __init tsc_calibrate(void) } tsc_khz = tsc2 / tsc1; - set_cyc2ns_scale(tsc_khz); + + for_each_possible_cpu(cpu) + set_cyc2ns_scale(tsc_khz, cpu); } /* @@ -222,17 +274,9 @@ __cpuinit int unsynchronized_tsc(void) if (apic_is_clustered_box()) return 1; #endif - /* Most intel systems have synchronized TSCs except for - multi node systems */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { -#ifdef CONFIG_ACPI - /* But TSC doesn't tick in C3 so don't use it there */ - if (acpi_gbl_FADT.header.length > 0 && - acpi_gbl_FADT.C3latency < 1000) - return 1; -#endif + + if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) return 0; - } /* Assume multi socket systems are not synchronized */ return num_present_cpus() > 1; @@ -250,13 +294,13 @@ __setup("notsc", notsc_setup); /* clock source code: */ static cycle_t read_tsc(void) { - cycle_t ret = (cycle_t)get_cycles_sync(); + cycle_t ret = (cycle_t)get_cycles(); return ret; } static cycle_t __vsyscall_fn vread_tsc(void) { - cycle_t ret = (cycle_t)get_cycles_sync(); + cycle_t ret = (cycle_t)vget_cycles(); return ret; } diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index 9125efe66a06..0577825cf89b 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -46,7 +46,7 @@ static __cpuinit void check_tsc_warp(void) cycles_t start, now, prev, end; int i; - start = get_cycles_sync(); + start = get_cycles(); /* * The measurement runs for 20 msecs: */ @@ -61,18 +61,18 @@ static __cpuinit void check_tsc_warp(void) */ __raw_spin_lock(&sync_lock); prev = last_tsc; - now = get_cycles_sync(); + now = get_cycles(); last_tsc = now; __raw_spin_unlock(&sync_lock); /* * Be nice every now and then (and also check whether - * measurement is done [we also insert a 100 million + * measurement is done [we also insert a 10 million * loops safety exit, so we dont lock up in case the * TSC readout is totally broken]): */ if (unlikely(!(i & 7))) { - if (now > end || i > 100000000) + if (now > end || i > 10000000) break; cpu_relax(); touch_nmi_watchdog(); @@ -87,7 +87,11 @@ static __cpuinit void check_tsc_warp(void) nr_warps++; __raw_spin_unlock(&sync_lock); } - + } + if (!(now-start)) { + printk("Warning: zero tsc calibration delta: %Ld [max: %Ld]\n", + now-start, end-start); + WARN_ON(1); } } @@ -129,24 +133,24 @@ void __cpuinit check_tsc_sync_source(int cpu) while (atomic_read(&stop_count) != cpus-1) cpu_relax(); - /* - * Reset it - just in case we boot another CPU later: - */ - atomic_set(&start_count, 0); - if (nr_warps) { printk("\n"); printk(KERN_WARNING "Measured %Ld cycles TSC warp between CPUs," " turning off TSC clock.\n", max_warp); mark_tsc_unstable("check_tsc_sync_source failed"); - nr_warps = 0; - max_warp = 0; - last_tsc = 0; } else { printk(" passed.\n"); } /* + * Reset it - just in case we boot another CPU later: + */ + atomic_set(&start_count, 0); + nr_warps = 0; + max_warp = 0; + last_tsc = 0; + + /* * Let the target continue with the bootup: */ atomic_inc(&stop_count); diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 157e4bedd3c5..738c2104df30 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -70,10 +70,10 @@ /* * 8- and 16-bit register defines.. 
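Note: check_tsc_warp() in tsc_sync.c now reads the TSC with plain get_cycles(), but the test itself is unchanged: each CPU repeatedly takes a shared lock, publishes its own TSC reading as the globally last-seen value, and counts a warp whenever its reading is older than what another CPU already recorded. A rough user-space analogue, with threads standing in for CPUs and a mutex for the raw spinlock (build with cc -O2 -pthread; a machine with synchronized TSCs should report zero warps):

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <x86intrin.h>		/* __rdtsc() */

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static uint64_t last_tsc;
static uint64_t max_warp;
static int nr_warps;

static void *warp_check(void *arg)
{
	(void)arg;
	for (int i = 0; i < 1000000; i++) {
		pthread_mutex_lock(&lock);
		uint64_t prev = last_tsc;	/* last value anyone published */
		uint64_t now = __rdtsc();
		last_tsc = now;
		pthread_mutex_unlock(&lock);

		if (prev > now) {	/* someone else's clock ran ahead of ours */
			pthread_mutex_lock(&lock);
			if (prev - now > max_warp)
				max_warp = prev - now;
			nr_warps++;
			pthread_mutex_unlock(&lock);
		}
	}
	return NULL;
}

int main(void)
{
	pthread_t t[2];

	for (int i = 0; i < 2; i++)
		pthread_create(&t[i], NULL, warp_check, NULL);
	for (int i = 0; i < 2; i++)
		pthread_join(t[i], NULL);

	printf("warps=%d max_warp=%llu cycles\n",
	       nr_warps, (unsigned long long)max_warp);
	return 0;
}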
*/ -#define AL(regs) (((unsigned char *)&((regs)->pt.eax))[0]) -#define AH(regs) (((unsigned char *)&((regs)->pt.eax))[1]) -#define IP(regs) (*(unsigned short *)&((regs)->pt.eip)) -#define SP(regs) (*(unsigned short *)&((regs)->pt.esp)) +#define AL(regs) (((unsigned char *)&((regs)->pt.ax))[0]) +#define AH(regs) (((unsigned char *)&((regs)->pt.ax))[1]) +#define IP(regs) (*(unsigned short *)&((regs)->pt.ip)) +#define SP(regs) (*(unsigned short *)&((regs)->pt.sp)) /* * virtual flags (16 and 32-bit versions) @@ -93,12 +93,12 @@ static int copy_vm86_regs_to_user(struct vm86_regs __user *user, { int ret = 0; - /* kernel_vm86_regs is missing xgs, so copy everything up to + /* kernel_vm86_regs is missing gs, so copy everything up to (but not including) orig_eax, and then rest including orig_eax. */ - ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.orig_eax)); - ret += copy_to_user(&user->orig_eax, ®s->pt.orig_eax, + ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.orig_ax)); + ret += copy_to_user(&user->orig_eax, ®s->pt.orig_ax, sizeof(struct kernel_vm86_regs) - - offsetof(struct kernel_vm86_regs, pt.orig_eax)); + offsetof(struct kernel_vm86_regs, pt.orig_ax)); return ret; } @@ -110,18 +110,17 @@ static int copy_vm86_regs_from_user(struct kernel_vm86_regs *regs, { int ret = 0; - /* copy eax-xfs inclusive */ - ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.orig_eax)); - /* copy orig_eax-__gsh+extra */ - ret += copy_from_user(®s->pt.orig_eax, &user->orig_eax, + /* copy ax-fs inclusive */ + ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.orig_ax)); + /* copy orig_ax-__gsh+extra */ + ret += copy_from_user(®s->pt.orig_ax, &user->orig_eax, sizeof(struct kernel_vm86_regs) - - offsetof(struct kernel_vm86_regs, pt.orig_eax) + + offsetof(struct kernel_vm86_regs, pt.orig_ax) + extra); return ret; } -struct pt_regs * FASTCALL(save_v86_state(struct kernel_vm86_regs * regs)); -struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs) +struct pt_regs * save_v86_state(struct kernel_vm86_regs * regs) { struct tss_struct *tss; struct pt_regs *ret; @@ -138,7 +137,7 @@ struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs) printk("no vm86_info: BAD\n"); do_exit(SIGSEGV); } - set_flags(regs->pt.eflags, VEFLAGS, VIF_MASK | current->thread.v86mask); + set_flags(regs->pt.flags, VEFLAGS, VIF_MASK | current->thread.v86mask); tmp = copy_vm86_regs_to_user(¤t->thread.vm86_info->regs,regs); tmp += put_user(current->thread.screen_bitmap,¤t->thread.vm86_info->screen_bitmap); if (tmp) { @@ -147,15 +146,15 @@ struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs) } tss = &per_cpu(init_tss, get_cpu()); - current->thread.esp0 = current->thread.saved_esp0; + current->thread.sp0 = current->thread.saved_sp0; current->thread.sysenter_cs = __KERNEL_CS; - load_esp0(tss, ¤t->thread); - current->thread.saved_esp0 = 0; + load_sp0(tss, ¤t->thread); + current->thread.saved_sp0 = 0; put_cpu(); ret = KVM86->regs32; - ret->xfs = current->thread.saved_fs; + ret->fs = current->thread.saved_fs; loadsegment(gs, current->thread.saved_gs); return ret; @@ -197,7 +196,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk asmlinkage int sys_vm86old(struct pt_regs regs) { - struct vm86_struct __user *v86 = (struct vm86_struct __user *)regs.ebx; + struct vm86_struct __user *v86 = (struct vm86_struct __user *)regs.bx; struct kernel_vm86_struct info; /* declare this _on top_, * this avoids 
wasting of stack space. * This remains on the stack until we @@ -207,7 +206,7 @@ asmlinkage int sys_vm86old(struct pt_regs regs) int tmp, ret = -EPERM; tsk = current; - if (tsk->thread.saved_esp0) + if (tsk->thread.saved_sp0) goto out; tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs, offsetof(struct kernel_vm86_struct, vm86plus) - @@ -237,12 +236,12 @@ asmlinkage int sys_vm86(struct pt_regs regs) struct vm86plus_struct __user *v86; tsk = current; - switch (regs.ebx) { + switch (regs.bx) { case VM86_REQUEST_IRQ: case VM86_FREE_IRQ: case VM86_GET_IRQ_BITS: case VM86_GET_AND_RESET_IRQ: - ret = do_vm86_irq_handling(regs.ebx, (int)regs.ecx); + ret = do_vm86_irq_handling(regs.bx, (int)regs.cx); goto out; case VM86_PLUS_INSTALL_CHECK: /* NOTE: on old vm86 stuff this will return the error @@ -256,9 +255,9 @@ asmlinkage int sys_vm86(struct pt_regs regs) /* we come here only for functions VM86_ENTER, VM86_ENTER_NO_BYPASS */ ret = -EPERM; - if (tsk->thread.saved_esp0) + if (tsk->thread.saved_sp0) goto out; - v86 = (struct vm86plus_struct __user *)regs.ecx; + v86 = (struct vm86plus_struct __user *)regs.cx; tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs, offsetof(struct kernel_vm86_struct, regs32) - sizeof(info.regs)); @@ -281,23 +280,23 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk /* * make sure the vm86() system call doesn't try to do anything silly */ - info->regs.pt.xds = 0; - info->regs.pt.xes = 0; - info->regs.pt.xfs = 0; + info->regs.pt.ds = 0; + info->regs.pt.es = 0; + info->regs.pt.fs = 0; /* we are clearing gs later just before "jmp resume_userspace", * because it is not saved/restored. */ /* - * The eflags register is also special: we cannot trust that the user + * The flags register is also special: we cannot trust that the user * has set it up safely, so this makes sure interrupt etc flags are * inherited from protected mode. 
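Note: as the comment above says, the guest's flags image cannot be trusted: do_sys_vm86() lets only a safe subset of bits through to the real flags word and keeps the interrupt flag virtualized in VEFLAGS, which the set_vflags_*/get_vflags() helpers later in this file maintain. A simplified sketch of that split; the mask values and helper shapes are illustrative, not the kernel's exact SAFE_MASK/RETURN_MASK definitions:

/*
 * Sketch of vm86 flag virtualization: the guest's IF lives as VIF in a
 * monitor-kept copy, while only "safe" bits reach the real flags image.
 */
#include <stdio.h>

#define IF_MASK		0x00000200
#define VIF_MASK	0x00080000
#define SAFE_MASK	0x00000dd5	/* illustrative subset of harmless bits */

static unsigned long veflags;		/* virtual flags kept by the monitor */

static unsigned long set_flags(unsigned long dest, unsigned long src,
			       unsigned long mask)
{
	return (dest & ~mask) | (src & mask);
}

static void set_vflags_long(unsigned long flags, unsigned long *pt_flags)
{
	/* only "safe" bits go into the real register image */
	*pt_flags = set_flags(*pt_flags, flags, SAFE_MASK);

	/* the guest's IF becomes VIF in the monitor's copy */
	if (flags & IF_MASK)
		veflags |= VIF_MASK;	/* "set_IF" */
	else
		veflags &= ~VIF_MASK;	/* "clear_IF" */
}

static unsigned long get_vflags(const unsigned long *pt_flags)
{
	unsigned long flags = *pt_flags & SAFE_MASK;

	if (veflags & VIF_MASK)
		flags |= IF_MASK;	/* report the *virtual* IF to the guest */
	return flags;
}

int main(void)
{
	unsigned long pt_flags = 0x00000002;

	set_vflags_long(0x00000246, &pt_flags);	/* guest POPF with IF set */
	printf("real flags image %#lx, guest sees %#lx\n",
	       pt_flags, get_vflags(&pt_flags));
	return 0;
}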
*/ - VEFLAGS = info->regs.pt.eflags; - info->regs.pt.eflags &= SAFE_MASK; - info->regs.pt.eflags |= info->regs32->eflags & ~SAFE_MASK; - info->regs.pt.eflags |= VM_MASK; + VEFLAGS = info->regs.pt.flags; + info->regs.pt.flags &= SAFE_MASK; + info->regs.pt.flags |= info->regs32->flags & ~SAFE_MASK; + info->regs.pt.flags |= VM_MASK; switch (info->cpu_type) { case CPU_286: @@ -315,18 +314,18 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk } /* - * Save old state, set default return value (%eax) to 0 + * Save old state, set default return value (%ax) to 0 */ - info->regs32->eax = 0; - tsk->thread.saved_esp0 = tsk->thread.esp0; - tsk->thread.saved_fs = info->regs32->xfs; + info->regs32->ax = 0; + tsk->thread.saved_sp0 = tsk->thread.sp0; + tsk->thread.saved_fs = info->regs32->fs; savesegment(gs, tsk->thread.saved_gs); tss = &per_cpu(init_tss, get_cpu()); - tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0; + tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0; if (cpu_has_sep) tsk->thread.sysenter_cs = 0; - load_esp0(tss, &tsk->thread); + load_sp0(tss, &tsk->thread); put_cpu(); tsk->thread.screen_bitmap = info->screen_bitmap; @@ -352,7 +351,7 @@ static inline void return_to_32bit(struct kernel_vm86_regs * regs16, int retval) struct pt_regs * regs32; regs32 = save_v86_state(regs16); - regs32->eax = retval; + regs32->ax = retval; __asm__ __volatile__("movl %0,%%esp\n\t" "movl %1,%%ebp\n\t" "jmp resume_userspace" @@ -373,30 +372,30 @@ static inline void clear_IF(struct kernel_vm86_regs * regs) static inline void clear_TF(struct kernel_vm86_regs * regs) { - regs->pt.eflags &= ~TF_MASK; + regs->pt.flags &= ~TF_MASK; } static inline void clear_AC(struct kernel_vm86_regs * regs) { - regs->pt.eflags &= ~AC_MASK; + regs->pt.flags &= ~AC_MASK; } /* It is correct to call set_IF(regs) from the set_vflags_* * functions. However someone forgot to call clear_IF(regs) * in the opposite case. * After the command sequence CLI PUSHF STI POPF you should - * end up with interrups disabled, but you ended up with + * end up with interrupts disabled, but you ended up with * interrupts enabled. * ( I was testing my own changes, but the only bug I * could find was in a function I had not changed. 
) * [KD] */ -static inline void set_vflags_long(unsigned long eflags, struct kernel_vm86_regs * regs) +static inline void set_vflags_long(unsigned long flags, struct kernel_vm86_regs * regs) { - set_flags(VEFLAGS, eflags, current->thread.v86mask); - set_flags(regs->pt.eflags, eflags, SAFE_MASK); - if (eflags & IF_MASK) + set_flags(VEFLAGS, flags, current->thread.v86mask); + set_flags(regs->pt.flags, flags, SAFE_MASK); + if (flags & IF_MASK) set_IF(regs); else clear_IF(regs); @@ -405,7 +404,7 @@ static inline void set_vflags_long(unsigned long eflags, struct kernel_vm86_regs static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_regs * regs) { set_flags(VFLAGS, flags, current->thread.v86mask); - set_flags(regs->pt.eflags, flags, SAFE_MASK); + set_flags(regs->pt.flags, flags, SAFE_MASK); if (flags & IF_MASK) set_IF(regs); else @@ -414,7 +413,7 @@ static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_reg static inline unsigned long get_vflags(struct kernel_vm86_regs * regs) { - unsigned long flags = regs->pt.eflags & RETURN_MASK; + unsigned long flags = regs->pt.flags & RETURN_MASK; if (VEFLAGS & VIF_MASK) flags |= IF_MASK; @@ -518,7 +517,7 @@ static void do_int(struct kernel_vm86_regs *regs, int i, unsigned long __user *intr_ptr; unsigned long segoffs; - if (regs->pt.xcs == BIOSSEG) + if (regs->pt.cs == BIOSSEG) goto cannot_handle; if (is_revectored(i, &KVM86->int_revectored)) goto cannot_handle; @@ -530,9 +529,9 @@ static void do_int(struct kernel_vm86_regs *regs, int i, if ((segoffs >> 16) == BIOSSEG) goto cannot_handle; pushw(ssp, sp, get_vflags(regs), cannot_handle); - pushw(ssp, sp, regs->pt.xcs, cannot_handle); + pushw(ssp, sp, regs->pt.cs, cannot_handle); pushw(ssp, sp, IP(regs), cannot_handle); - regs->pt.xcs = segoffs >> 16; + regs->pt.cs = segoffs >> 16; SP(regs) -= 6; IP(regs) = segoffs & 0xffff; clear_TF(regs); @@ -549,7 +548,7 @@ int handle_vm86_trap(struct kernel_vm86_regs * regs, long error_code, int trapno if (VMPI.is_vm86pus) { if ( (trapno==3) || (trapno==1) ) return_to_32bit(regs, VM86_TRAP + (trapno << 8)); - do_int(regs, trapno, (unsigned char __user *) (regs->pt.xss << 4), SP(regs)); + do_int(regs, trapno, (unsigned char __user *) (regs->pt.ss << 4), SP(regs)); return 0; } if (trapno !=1) @@ -585,10 +584,10 @@ void handle_vm86_fault(struct kernel_vm86_regs * regs, long error_code) handle_vm86_trap(regs, 0, 1); \ return; } while (0) - orig_flags = *(unsigned short *)®s->pt.eflags; + orig_flags = *(unsigned short *)®s->pt.flags; - csp = (unsigned char __user *) (regs->pt.xcs << 4); - ssp = (unsigned char __user *) (regs->pt.xss << 4); + csp = (unsigned char __user *) (regs->pt.cs << 4); + ssp = (unsigned char __user *) (regs->pt.ss << 4); sp = SP(regs); ip = IP(regs); @@ -675,7 +674,7 @@ void handle_vm86_fault(struct kernel_vm86_regs * regs, long error_code) SP(regs) += 6; } IP(regs) = newip; - regs->pt.xcs = newcs; + regs->pt.cs = newcs; CHECK_IF_IN_TRAP; if (data32) { set_vflags_long(newflags, regs); diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index f02bad68abaa..4525bc2c2e19 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -62,7 +62,10 @@ static struct { void (*cpuid)(void /* non-c */); void (*_set_ldt)(u32 selector); void (*set_tr)(u32 selector); - void (*set_kernel_stack)(u32 selector, u32 esp0); + void (*write_idt_entry)(struct desc_struct *, int, u32, u32); + void (*write_gdt_entry)(struct desc_struct *, int, u32, u32); + void (*write_ldt_entry)(struct desc_struct *, int, u32, 
u32); + void (*set_kernel_stack)(u32 selector, u32 sp0); void (*allocate_page)(u32, u32, u32, u32, u32); void (*release_page)(u32, u32); void (*set_pte)(pte_t, pte_t *, unsigned); @@ -88,13 +91,13 @@ struct vmi_timer_ops vmi_timer_ops; #define IRQ_PATCH_DISABLE 5 static inline void patch_offset(void *insnbuf, - unsigned long eip, unsigned long dest) + unsigned long ip, unsigned long dest) { - *(unsigned long *)(insnbuf+1) = dest-eip-5; + *(unsigned long *)(insnbuf+1) = dest-ip-5; } static unsigned patch_internal(int call, unsigned len, void *insnbuf, - unsigned long eip) + unsigned long ip) { u64 reloc; struct vmi_relocation_info *const rel = (struct vmi_relocation_info *)&reloc; @@ -103,13 +106,13 @@ static unsigned patch_internal(int call, unsigned len, void *insnbuf, case VMI_RELOCATION_CALL_REL: BUG_ON(len < 5); *(char *)insnbuf = MNEM_CALL; - patch_offset(insnbuf, eip, (unsigned long)rel->eip); + patch_offset(insnbuf, ip, (unsigned long)rel->eip); return 5; case VMI_RELOCATION_JUMP_REL: BUG_ON(len < 5); *(char *)insnbuf = MNEM_JMP; - patch_offset(insnbuf, eip, (unsigned long)rel->eip); + patch_offset(insnbuf, ip, (unsigned long)rel->eip); return 5; case VMI_RELOCATION_NOP: @@ -131,25 +134,25 @@ static unsigned patch_internal(int call, unsigned len, void *insnbuf, * sequence. The callee does nop padding for us. */ static unsigned vmi_patch(u8 type, u16 clobbers, void *insns, - unsigned long eip, unsigned len) + unsigned long ip, unsigned len) { switch (type) { case PARAVIRT_PATCH(pv_irq_ops.irq_disable): return patch_internal(VMI_CALL_DisableInterrupts, len, - insns, eip); + insns, ip); case PARAVIRT_PATCH(pv_irq_ops.irq_enable): return patch_internal(VMI_CALL_EnableInterrupts, len, - insns, eip); + insns, ip); case PARAVIRT_PATCH(pv_irq_ops.restore_fl): return patch_internal(VMI_CALL_SetInterruptMask, len, - insns, eip); + insns, ip); case PARAVIRT_PATCH(pv_irq_ops.save_fl): return patch_internal(VMI_CALL_GetInterruptMask, len, - insns, eip); + insns, ip); case PARAVIRT_PATCH(pv_cpu_ops.iret): - return patch_internal(VMI_CALL_IRET, len, insns, eip); - case PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit): - return patch_internal(VMI_CALL_SYSEXIT, len, insns, eip); + return patch_internal(VMI_CALL_IRET, len, insns, ip); + case PARAVIRT_PATCH(pv_cpu_ops.irq_enable_syscall_ret): + return patch_internal(VMI_CALL_SYSEXIT, len, insns, ip); default: break; } @@ -157,36 +160,36 @@ static unsigned vmi_patch(u8 type, u16 clobbers, void *insns, } /* CPUID has non-C semantics, and paravirt-ops API doesn't match hardware ISA */ -static void vmi_cpuid(unsigned int *eax, unsigned int *ebx, - unsigned int *ecx, unsigned int *edx) +static void vmi_cpuid(unsigned int *ax, unsigned int *bx, + unsigned int *cx, unsigned int *dx) { int override = 0; - if (*eax == 1) + if (*ax == 1) override = 1; asm volatile ("call *%6" - : "=a" (*eax), - "=b" (*ebx), - "=c" (*ecx), - "=d" (*edx) - : "0" (*eax), "2" (*ecx), "r" (vmi_ops.cpuid)); + : "=a" (*ax), + "=b" (*bx), + "=c" (*cx), + "=d" (*dx) + : "0" (*ax), "2" (*cx), "r" (vmi_ops.cpuid)); if (override) { if (disable_pse) - *edx &= ~X86_FEATURE_PSE; + *dx &= ~X86_FEATURE_PSE; if (disable_pge) - *edx &= ~X86_FEATURE_PGE; + *dx &= ~X86_FEATURE_PGE; if (disable_sep) - *edx &= ~X86_FEATURE_SEP; + *dx &= ~X86_FEATURE_SEP; if (disable_tsc) - *edx &= ~X86_FEATURE_TSC; + *dx &= ~X86_FEATURE_TSC; if (disable_mtrr) - *edx &= ~X86_FEATURE_MTRR; + *dx &= ~X86_FEATURE_MTRR; } } static inline void vmi_maybe_load_tls(struct desc_struct *gdt, int nr, struct desc_struct *new) { if 
(gdt[nr].a != new->a || gdt[nr].b != new->b) - write_gdt_entry(gdt, nr, new->a, new->b); + write_gdt_entry(gdt, nr, new, 0); } static void vmi_load_tls(struct thread_struct *t, unsigned int cpu) @@ -200,12 +203,12 @@ static void vmi_load_tls(struct thread_struct *t, unsigned int cpu) static void vmi_set_ldt(const void *addr, unsigned entries) { unsigned cpu = smp_processor_id(); - u32 low, high; + struct desc_struct desc; - pack_descriptor(&low, &high, (unsigned long)addr, + pack_descriptor(&desc, (unsigned long)addr, entries * sizeof(struct desc_struct) - 1, - DESCTYPE_LDT, 0); - write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, low, high); + DESC_LDT, 0); + write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, &desc, DESC_LDT); vmi_ops._set_ldt(entries ? GDT_ENTRY_LDT*sizeof(struct desc_struct) : 0); } @@ -214,17 +217,37 @@ static void vmi_set_tr(void) vmi_ops.set_tr(GDT_ENTRY_TSS*sizeof(struct desc_struct)); } -static void vmi_load_esp0(struct tss_struct *tss, +static void vmi_write_idt_entry(gate_desc *dt, int entry, const gate_desc *g) +{ + u32 *idt_entry = (u32 *)g; + vmi_ops.write_idt_entry(dt, entry, idt_entry[0], idt_entry[2]); +} + +static void vmi_write_gdt_entry(struct desc_struct *dt, int entry, + const void *desc, int type) +{ + u32 *gdt_entry = (u32 *)desc; + vmi_ops.write_gdt_entry(dt, entry, gdt_entry[0], gdt_entry[2]); +} + +static void vmi_write_ldt_entry(struct desc_struct *dt, int entry, + const void *desc) +{ + u32 *ldt_entry = (u32 *)desc; + vmi_ops.write_idt_entry(dt, entry, ldt_entry[0], ldt_entry[2]); +} + +static void vmi_load_sp0(struct tss_struct *tss, struct thread_struct *thread) { - tss->x86_tss.esp0 = thread->esp0; + tss->x86_tss.sp0 = thread->sp0; /* This can only happen when SEP is enabled, no need to test "SEP"arately */ if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) { tss->x86_tss.ss1 = thread->sysenter_cs; wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); } - vmi_ops.set_kernel_stack(__KERNEL_DS, tss->x86_tss.esp0); + vmi_ops.set_kernel_stack(__KERNEL_DS, tss->x86_tss.sp0); } static void vmi_flush_tlb_user(void) @@ -375,7 +398,7 @@ static void vmi_allocate_pt(struct mm_struct *mm, u32 pfn) vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0); } -static void vmi_allocate_pd(u32 pfn) +static void vmi_allocate_pd(struct mm_struct *mm, u32 pfn) { /* * This call comes in very early, before mem_map is setup. 
@@ -452,7 +475,7 @@ static void vmi_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep static void vmi_set_pmd(pmd_t *pmdp, pmd_t pmdval) { #ifdef CONFIG_X86_PAE - const pte_t pte = { pmdval.pmd, pmdval.pmd >> 32 }; + const pte_t pte = { .pte = pmdval.pmd }; vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PMD); #else const pte_t pte = { pmdval.pud.pgd.pgd }; @@ -485,21 +508,21 @@ static void vmi_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t static void vmi_set_pud(pud_t *pudp, pud_t pudval) { /* Um, eww */ - const pte_t pte = { pudval.pgd.pgd, pudval.pgd.pgd >> 32 }; + const pte_t pte = { .pte = pudval.pgd.pgd }; vmi_check_page_type(__pa(pudp) >> PAGE_SHIFT, VMI_PAGE_PGD); vmi_ops.set_pte(pte, (pte_t *)pudp, VMI_PAGE_PDP); } static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - const pte_t pte = { 0 }; + const pte_t pte = { .pte = 0 }; vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE); vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); } static void vmi_pmd_clear(pmd_t *pmd) { - const pte_t pte = { 0 }; + const pte_t pte = { .pte = 0 }; vmi_check_page_type(__pa(pmd) >> PAGE_SHIFT, VMI_PAGE_PMD); vmi_ops.set_pte(pte, (pte_t *)pmd, VMI_PAGE_PD); } @@ -790,10 +813,13 @@ static inline int __init activate_vmi(void) para_fill(pv_cpu_ops.store_idt, GetIDT); para_fill(pv_cpu_ops.store_tr, GetTR); pv_cpu_ops.load_tls = vmi_load_tls; - para_fill(pv_cpu_ops.write_ldt_entry, WriteLDTEntry); - para_fill(pv_cpu_ops.write_gdt_entry, WriteGDTEntry); - para_fill(pv_cpu_ops.write_idt_entry, WriteIDTEntry); - para_wrap(pv_cpu_ops.load_esp0, vmi_load_esp0, set_kernel_stack, UpdateKernelStack); + para_wrap(pv_cpu_ops.write_ldt_entry, vmi_write_ldt_entry, + write_ldt_entry, WriteLDTEntry); + para_wrap(pv_cpu_ops.write_gdt_entry, vmi_write_gdt_entry, + write_gdt_entry, WriteGDTEntry); + para_wrap(pv_cpu_ops.write_idt_entry, vmi_write_idt_entry, + write_idt_entry, WriteIDTEntry); + para_wrap(pv_cpu_ops.load_sp0, vmi_load_sp0, set_kernel_stack, UpdateKernelStack); para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask); para_fill(pv_cpu_ops.io_delay, IODelay); @@ -870,7 +896,7 @@ static inline int __init activate_vmi(void) * the backend. They are performance critical anyway, so requiring * a patch is not a big problem. 
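Note: the comment above explains that the iret and syscall-return entry points must be patched at the call site; patch_internal() earlier in this file does exactly that by dropping a CALL or JMP opcode into the instruction buffer and letting patch_offset() store the rel32 displacement dest - ip - 5, i.e. relative to the end of the 5-byte instruction. A small sketch of that encoding (the buffer is only printed, never executed; the addresses are samples):

/*
 * Sketch of 5-byte rel32 call/jmp patching in the style of patch_offset():
 * displacement = target - address_of_instruction - 5.
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define MNEM_CALL	0xe8
#define MNEM_JMP	0xe9

static void patch_offset(uint8_t *insnbuf, uint32_t ip, uint32_t dest)
{
	uint32_t rel = dest - ip - 5;	/* relative to the next instruction */

	memcpy(insnbuf + 1, &rel, sizeof(rel));
}

int main(void)
{
	uint8_t insnbuf[5];
	uint32_t ip = 0xc0100000;	/* sample address being patched */
	uint32_t dest = 0xc0200010;	/* sample hypervisor entry point */

	insnbuf[0] = MNEM_CALL;
	patch_offset(insnbuf, ip, dest);

	printf("patched bytes:");
	for (int i = 0; i < 5; i++)
		printf(" %02x", insnbuf[i]);
	printf("\n");			/* e8 0b 00 10 00 -> call +0x10000b */
	return 0;
}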
*/ - pv_cpu_ops.irq_enable_sysexit = (void *)0xfeedbab0; + pv_cpu_ops.irq_enable_syscall_ret = (void *)0xfeedbab0; pv_cpu_ops.iret = (void *)0xbadbab0; #ifdef CONFIG_SMP @@ -963,19 +989,19 @@ static int __init parse_vmi(char *arg) return -EINVAL; if (!strcmp(arg, "disable_pge")) { - clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability); + clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE); disable_pge = 1; } else if (!strcmp(arg, "disable_pse")) { - clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability); + clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PSE); disable_pse = 1; } else if (!strcmp(arg, "disable_sep")) { - clear_bit(X86_FEATURE_SEP, boot_cpu_data.x86_capability); + clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SEP); disable_sep = 1; } else if (!strcmp(arg, "disable_tsc")) { - clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability); + clear_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC); disable_tsc = 1; } else if (!strcmp(arg, "disable_mtrr")) { - clear_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability); + clear_cpu_cap(&boot_cpu_data, X86_FEATURE_MTRR); disable_mtrr = 1; } else if (!strcmp(arg, "disable_timer")) { disable_vmi_timer = 1; diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c index b1b5ab08b26e..a2b030780aa9 100644 --- a/arch/x86/kernel/vmiclock_32.c +++ b/arch/x86/kernel/vmiclock_32.c @@ -35,7 +35,6 @@ #include <asm/i8253.h> #include <irq_vectors.h> -#include "io_ports.h" #define VMI_ONESHOT (VMI_ALARM_IS_ONESHOT | VMI_CYCLES_REAL | vmi_get_alarm_wiring()) #define VMI_PERIODIC (VMI_ALARM_IS_PERIODIC | VMI_CYCLES_REAL | vmi_get_alarm_wiring()) @@ -238,7 +237,7 @@ static void __devinit vmi_time_init_clockevent(void) void __init vmi_time_init(void) { /* Disable PIT: BIOSes start PIT CH0 with 18.2hz peridic. */ - outb_p(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */ + outb_pit(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */ vmi_time_init_clockevent(); setup_irq(0, &vmi_clock_action); diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index 7d72cce00529..f1148ac8abe3 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -8,12 +8,6 @@ * put it inside the section definition. */ -/* Don't define absolute symbols until and unless you know that symbol - * value is should remain constant even if kernel image is relocated - * at run time. Absolute symbols are not relocated. If symbol value should - * change if kernel is relocated, make the symbol section relative and - * put it inside the section definition. - */ #define LOAD_OFFSET __PAGE_OFFSET #include <asm-generic/vmlinux.lds.h> @@ -44,6 +38,8 @@ SECTIONS /* read-only */ .text : AT(ADDR(.text) - LOAD_OFFSET) { + . = ALIGN(4096); /* not really needed, already page aligned */ + *(.text.page_aligned) TEXT_TEXT SCHED_TEXT LOCK_TEXT @@ -131,10 +127,12 @@ SECTIONS .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { __init_begin = .; _sinittext = .; - *(.init.text) + INIT_TEXT _einittext = .; } - .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { *(.init.data) } + .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { + INIT_DATA + } . 
= ALIGN(16); .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { __setup_start = .; @@ -169,8 +167,12 @@ SECTIONS } /* .exit.text is discard at runtime, not link time, to deal with references from .altinstructions and .eh_frame */ - .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) } - .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) } + .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { + EXIT_TEXT + } + .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { + EXIT_DATA + } #if defined(CONFIG_BLK_DEV_INITRD) . = ALIGN(4096); .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index ba8ea97abd21..0992b9946c6f 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -37,16 +37,15 @@ SECTIONS KPROBES_TEXT *(.fixup) *(.gnu.warning) - } :text = 0x9090 - /* out-of-line lock text */ - .text.lock : AT(ADDR(.text.lock) - LOAD_OFFSET) { *(.text.lock) } - - _etext = .; /* End of text section */ + _etext = .; /* End of text section */ + } :text = 0x9090 . = ALIGN(16); /* Exception table */ - __start___ex_table = .; - __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) } - __stop___ex_table = .; + __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { + __start___ex_table = .; + *(__ex_table) + __stop___ex_table = .; + } NOTES :text :note @@ -155,12 +154,15 @@ SECTIONS __init_begin = .; .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { _sinittext = .; - *(.init.text) + INIT_TEXT _einittext = .; } - __initdata_begin = .; - .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { *(.init.data) } - __initdata_end = .; + .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { + __initdata_begin = .; + INIT_DATA + __initdata_end = .; + } + . = ALIGN(16); __setup_start = .; .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { *(.init.setup) } @@ -176,6 +178,14 @@ SECTIONS } __con_initcall_end = .; SECURITY_INIT + + . = ALIGN(8); + .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { + __parainstructions = .; + *(.parainstructions) + __parainstructions_end = .; + } + . = ALIGN(8); __alt_instructions = .; .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { @@ -187,8 +197,12 @@ SECTIONS } /* .exit.text is discard at runtime, not link time, to deal with references from .altinstructions and .eh_frame */ - .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) } - .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) } + .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { + EXIT_TEXT + } + .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { + EXIT_DATA + } /* vdso blob that is mapped into user space */ vdso_start = . 
; diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c index 414caf0c5f9a..d971210a6d36 100644 --- a/arch/x86/kernel/vsmp_64.c +++ b/arch/x86/kernel/vsmp_64.c @@ -25,21 +25,24 @@ static int __init vsmp_init(void) return 0; /* Check if we are running on a ScaleMP vSMP box */ - if ((read_pci_config_16(0, 0x1f, 0, PCI_VENDOR_ID) != PCI_VENDOR_ID_SCALEMP) || - (read_pci_config_16(0, 0x1f, 0, PCI_DEVICE_ID) != PCI_DEVICE_ID_SCALEMP_VSMP_CTL)) + if ((read_pci_config_16(0, 0x1f, 0, PCI_VENDOR_ID) != + PCI_VENDOR_ID_SCALEMP) || + (read_pci_config_16(0, 0x1f, 0, PCI_DEVICE_ID) != + PCI_DEVICE_ID_SCALEMP_VSMP_CTL)) return 0; /* set vSMP magic bits to indicate vSMP capable kernel */ address = ioremap(read_pci_config(0, 0x1f, 0, PCI_BASE_ADDRESS_0), 8); cap = readl(address); ctl = readl(address + 4); - printk("vSMP CTL: capabilities:0x%08x control:0x%08x\n", cap, ctl); + printk(KERN_INFO "vSMP CTL: capabilities:0x%08x control:0x%08x\n", + cap, ctl); if (cap & ctl & (1 << 4)) { /* Turn on vSMP IRQ fastpath handling (see system.h) */ ctl &= ~(1 << 4); writel(ctl, address + 4); ctl = readl(address + 4); - printk("vSMP CTL: control set to:0x%08x\n", ctl); + printk(KERN_INFO "vSMP CTL: control set to:0x%08x\n", ctl); } iounmap(address); diff --git a/arch/x86/kernel/vsyscall-int80_32.S b/arch/x86/kernel/vsyscall-int80_32.S deleted file mode 100644 index 103cab6aa7c0..000000000000 --- a/arch/x86/kernel/vsyscall-int80_32.S +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Code for the vsyscall page. This version uses the old int $0x80 method. - * - * NOTE: - * 1) __kernel_vsyscall _must_ be first in this page. - * 2) there are alignment constraints on this stub, see vsyscall-sigreturn.S - * for details. - */ - - .text - .globl __kernel_vsyscall - .type __kernel_vsyscall,@function -__kernel_vsyscall: -.LSTART_vsyscall: - int $0x80 - ret -.LEND_vsyscall: - .size __kernel_vsyscall,.-.LSTART_vsyscall - .previous - - .section .eh_frame,"a",@progbits -.LSTARTFRAMEDLSI: - .long .LENDCIEDLSI-.LSTARTCIEDLSI -.LSTARTCIEDLSI: - .long 0 /* CIE ID */ - .byte 1 /* Version number */ - .string "zR" /* NUL-terminated augmentation string */ - .uleb128 1 /* Code alignment factor */ - .sleb128 -4 /* Data alignment factor */ - .byte 8 /* Return address register column */ - .uleb128 1 /* Augmentation value length */ - .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ - .byte 0x0c /* DW_CFA_def_cfa */ - .uleb128 4 - .uleb128 4 - .byte 0x88 /* DW_CFA_offset, column 0x8 */ - .uleb128 1 - .align 4 -.LENDCIEDLSI: - .long .LENDFDEDLSI-.LSTARTFDEDLSI /* Length FDE */ -.LSTARTFDEDLSI: - .long .LSTARTFDEDLSI-.LSTARTFRAMEDLSI /* CIE pointer */ - .long .LSTART_vsyscall-. /* PC-relative start address */ - .long .LEND_vsyscall-.LSTART_vsyscall - .uleb128 0 - .align 4 -.LENDFDEDLSI: - .previous - -/* - * Get the common code for the sigreturn entry points. - */ -#include "vsyscall-sigreturn_32.S" diff --git a/arch/x86/kernel/vsyscall-note_32.S b/arch/x86/kernel/vsyscall-note_32.S deleted file mode 100644 index fcf376a37f79..000000000000 --- a/arch/x86/kernel/vsyscall-note_32.S +++ /dev/null @@ -1,45 +0,0 @@ -/* - * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text. - * Here we can supply some information useful to userland. - */ - -#include <linux/version.h> -#include <linux/elfnote.h> - -/* Ideally this would use UTS_NAME, but using a quoted string here - doesn't work. Remember to change this when changing the - kernel's name. 
*/ -ELFNOTE_START(Linux, 0, "a") - .long LINUX_VERSION_CODE -ELFNOTE_END - -#ifdef CONFIG_XEN -/* - * Add a special note telling glibc's dynamic linker a fake hardware - * flavor that it will use to choose the search path for libraries in the - * same way it uses real hardware capabilities like "mmx". - * We supply "nosegneg" as the fake capability, to indicate that we - * do not like negative offsets in instructions using segment overrides, - * since we implement those inefficiently. This makes it possible to - * install libraries optimized to avoid those access patterns in someplace - * like /lib/i686/tls/nosegneg. Note that an /etc/ld.so.conf.d/file - * corresponding to the bits here is needed to make ldconfig work right. - * It should contain: - * hwcap 1 nosegneg - * to match the mapping of bit to name that we give here. - * - * At runtime, the fake hardware feature will be considered to be present - * if its bit is set in the mask word. So, we start with the mask 0, and - * at boot time we set VDSO_NOTE_NONEGSEG_BIT if running under Xen. - */ - -#include "../../x86/xen/vdso.h" /* Defines VDSO_NOTE_NONEGSEG_BIT. */ - - .globl VDSO_NOTE_MASK -ELFNOTE_START(GNU, 2, "a") - .long 1 /* ncaps */ -VDSO_NOTE_MASK: - .long 0 /* mask */ - .byte VDSO_NOTE_NONEGSEG_BIT; .asciz "nosegneg" /* bit, name */ -ELFNOTE_END -#endif diff --git a/arch/x86/kernel/vsyscall-sigreturn_32.S b/arch/x86/kernel/vsyscall-sigreturn_32.S deleted file mode 100644 index a92262f41659..000000000000 --- a/arch/x86/kernel/vsyscall-sigreturn_32.S +++ /dev/null @@ -1,143 +0,0 @@ -/* - * Common code for the sigreturn entry points on the vsyscall page. - * So far this code is the same for both int80 and sysenter versions. - * This file is #include'd by vsyscall-*.S to define them after the - * vsyscall entry point. The kernel assumes that the addresses of these - * routines are constant for all vsyscall implementations. - */ - -#include <asm/unistd.h> -#include <asm/asm-offsets.h> - - -/* XXX - Should these be named "_sigtramp" or something? -*/ - - .text - .org __kernel_vsyscall+32,0x90 - .globl __kernel_sigreturn - .type __kernel_sigreturn,@function -__kernel_sigreturn: -.LSTART_sigreturn: - popl %eax /* XXX does this mean it needs unwind info? */ - movl $__NR_sigreturn, %eax - int $0x80 -.LEND_sigreturn: - .size __kernel_sigreturn,.-.LSTART_sigreturn - - .balign 32 - .globl __kernel_rt_sigreturn - .type __kernel_rt_sigreturn,@function -__kernel_rt_sigreturn: -.LSTART_rt_sigreturn: - movl $__NR_rt_sigreturn, %eax - int $0x80 -.LEND_rt_sigreturn: - .size __kernel_rt_sigreturn,.-.LSTART_rt_sigreturn - .balign 32 - .previous - - .section .eh_frame,"a",@progbits -.LSTARTFRAMEDLSI1: - .long .LENDCIEDLSI1-.LSTARTCIEDLSI1 -.LSTARTCIEDLSI1: - .long 0 /* CIE ID */ - .byte 1 /* Version number */ - .string "zRS" /* NUL-terminated augmentation string */ - .uleb128 1 /* Code alignment factor */ - .sleb128 -4 /* Data alignment factor */ - .byte 8 /* Return address register column */ - .uleb128 1 /* Augmentation value length */ - .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ - .byte 0 /* DW_CFA_nop */ - .align 4 -.LENDCIEDLSI1: - .long .LENDFDEDLSI1-.LSTARTFDEDLSI1 /* Length FDE */ -.LSTARTFDEDLSI1: - .long .LSTARTFDEDLSI1-.LSTARTFRAMEDLSI1 /* CIE pointer */ - /* HACK: The dwarf2 unwind routines will subtract 1 from the - return address to get an address in the middle of the - presumed call instruction. Since we didn't get here via - a call, we need to include the nop before the real start - to make up for it. 
*/ - .long .LSTART_sigreturn-1-. /* PC-relative start address */ - .long .LEND_sigreturn-.LSTART_sigreturn+1 - .uleb128 0 /* Augmentation */ - /* What follows are the instructions for the table generation. - We record the locations of each register saved. This is - complicated by the fact that the "CFA" is always assumed to - be the value of the stack pointer in the caller. This means - that we must define the CFA of this body of code to be the - saved value of the stack pointer in the sigcontext. Which - also means that there is no fixed relation to the other - saved registers, which means that we must use DW_CFA_expression - to compute their addresses. It also means that when we - adjust the stack with the popl, we have to do it all over again. */ - -#define do_cfa_expr(offset) \ - .byte 0x0f; /* DW_CFA_def_cfa_expression */ \ - .uleb128 1f-0f; /* length */ \ -0: .byte 0x74; /* DW_OP_breg4 */ \ - .sleb128 offset; /* offset */ \ - .byte 0x06; /* DW_OP_deref */ \ -1: - -#define do_expr(regno, offset) \ - .byte 0x10; /* DW_CFA_expression */ \ - .uleb128 regno; /* regno */ \ - .uleb128 1f-0f; /* length */ \ -0: .byte 0x74; /* DW_OP_breg4 */ \ - .sleb128 offset; /* offset */ \ -1: - - do_cfa_expr(SIGCONTEXT_esp+4) - do_expr(0, SIGCONTEXT_eax+4) - do_expr(1, SIGCONTEXT_ecx+4) - do_expr(2, SIGCONTEXT_edx+4) - do_expr(3, SIGCONTEXT_ebx+4) - do_expr(5, SIGCONTEXT_ebp+4) - do_expr(6, SIGCONTEXT_esi+4) - do_expr(7, SIGCONTEXT_edi+4) - do_expr(8, SIGCONTEXT_eip+4) - - .byte 0x42 /* DW_CFA_advance_loc 2 -- nop; popl eax. */ - - do_cfa_expr(SIGCONTEXT_esp) - do_expr(0, SIGCONTEXT_eax) - do_expr(1, SIGCONTEXT_ecx) - do_expr(2, SIGCONTEXT_edx) - do_expr(3, SIGCONTEXT_ebx) - do_expr(5, SIGCONTEXT_ebp) - do_expr(6, SIGCONTEXT_esi) - do_expr(7, SIGCONTEXT_edi) - do_expr(8, SIGCONTEXT_eip) - - .align 4 -.LENDFDEDLSI1: - - .long .LENDFDEDLSI2-.LSTARTFDEDLSI2 /* Length FDE */ -.LSTARTFDEDLSI2: - .long .LSTARTFDEDLSI2-.LSTARTFRAMEDLSI1 /* CIE pointer */ - /* HACK: See above wrt unwind library assumptions. */ - .long .LSTART_rt_sigreturn-1-. /* PC-relative start address */ - .long .LEND_rt_sigreturn-.LSTART_rt_sigreturn+1 - .uleb128 0 /* Augmentation */ - /* What follows are the instructions for the table generation. - We record the locations of each register saved. This is - slightly less complicated than the above, since we don't - modify the stack pointer in the process. */ - - do_cfa_expr(RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_esp) - do_expr(0, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_eax) - do_expr(1, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_ecx) - do_expr(2, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_edx) - do_expr(3, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_ebx) - do_expr(5, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_ebp) - do_expr(6, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_esi) - do_expr(7, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_edi) - do_expr(8, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_eip) - - .align 4 -.LENDFDEDLSI2: - .previous diff --git a/arch/x86/kernel/vsyscall-sysenter_32.S b/arch/x86/kernel/vsyscall-sysenter_32.S deleted file mode 100644 index ed879bf42995..000000000000 --- a/arch/x86/kernel/vsyscall-sysenter_32.S +++ /dev/null @@ -1,122 +0,0 @@ -/* - * Code for the vsyscall page. This version uses the sysenter instruction. - * - * NOTE: - * 1) __kernel_vsyscall _must_ be first in this page. - * 2) there are alignment constraints on this stub, see vsyscall-sigreturn.S - * for details. - */ - -/* - * The caller puts arg2 in %ecx, which gets pushed. The kernel will use - * %ecx itself for arg2. 
The pushing is because the sysexit instruction - * (found in entry.S) requires that we clobber %ecx with the desired %esp. - * User code might expect that %ecx is unclobbered though, as it would be - * for returning via the iret instruction, so we must push and pop. - * - * The caller puts arg3 in %edx, which the sysexit instruction requires - * for %eip. Thus, exactly as for arg2, we must push and pop. - * - * Arg6 is different. The caller puts arg6 in %ebp. Since the sysenter - * instruction clobbers %esp, the user's %esp won't even survive entry - * into the kernel. We store %esp in %ebp. Code in entry.S must fetch - * arg6 from the stack. - * - * You can not use this vsyscall for the clone() syscall because the - * three dwords on the parent stack do not get copied to the child. - */ - .text - .globl __kernel_vsyscall - .type __kernel_vsyscall,@function -__kernel_vsyscall: -.LSTART_vsyscall: - push %ecx -.Lpush_ecx: - push %edx -.Lpush_edx: - push %ebp -.Lenter_kernel: - movl %esp,%ebp - sysenter - - /* 7: align return point with nop's to make disassembly easier */ - .space 7,0x90 - - /* 14: System call restart point is here! (SYSENTER_RETURN-2) */ - jmp .Lenter_kernel - /* 16: System call normal return point is here! */ - .globl SYSENTER_RETURN /* Symbol used by sysenter.c */ -SYSENTER_RETURN: - pop %ebp -.Lpop_ebp: - pop %edx -.Lpop_edx: - pop %ecx -.Lpop_ecx: - ret -.LEND_vsyscall: - .size __kernel_vsyscall,.-.LSTART_vsyscall - .previous - - .section .eh_frame,"a",@progbits -.LSTARTFRAMEDLSI: - .long .LENDCIEDLSI-.LSTARTCIEDLSI -.LSTARTCIEDLSI: - .long 0 /* CIE ID */ - .byte 1 /* Version number */ - .string "zR" /* NUL-terminated augmentation string */ - .uleb128 1 /* Code alignment factor */ - .sleb128 -4 /* Data alignment factor */ - .byte 8 /* Return address register column */ - .uleb128 1 /* Augmentation value length */ - .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ - .byte 0x0c /* DW_CFA_def_cfa */ - .uleb128 4 - .uleb128 4 - .byte 0x88 /* DW_CFA_offset, column 0x8 */ - .uleb128 1 - .align 4 -.LENDCIEDLSI: - .long .LENDFDEDLSI-.LSTARTFDEDLSI /* Length FDE */ -.LSTARTFDEDLSI: - .long .LSTARTFDEDLSI-.LSTARTFRAMEDLSI /* CIE pointer */ - .long .LSTART_vsyscall-. /* PC-relative start address */ - .long .LEND_vsyscall-.LSTART_vsyscall - .uleb128 0 - /* What follows are the instructions for the table generation. - We have to record all changes of the stack pointer. */ - .byte 0x04 /* DW_CFA_advance_loc4 */ - .long .Lpush_ecx-.LSTART_vsyscall - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .byte 0x08 /* RA at offset 8 now */ - .byte 0x04 /* DW_CFA_advance_loc4 */ - .long .Lpush_edx-.Lpush_ecx - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .byte 0x0c /* RA at offset 12 now */ - .byte 0x04 /* DW_CFA_advance_loc4 */ - .long .Lenter_kernel-.Lpush_edx - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .byte 0x10 /* RA at offset 16 now */ - .byte 0x85, 0x04 /* DW_CFA_offset %ebp -16 */ - /* Finally the epilogue. */ - .byte 0x04 /* DW_CFA_advance_loc4 */ - .long .Lpop_ebp-.Lenter_kernel - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .byte 0x0c /* RA at offset 12 now */ - .byte 0xc5 /* DW_CFA_restore %ebp */ - .byte 0x04 /* DW_CFA_advance_loc4 */ - .long .Lpop_edx-.Lpop_ebp - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .byte 0x08 /* RA at offset 8 now */ - .byte 0x04 /* DW_CFA_advance_loc4 */ - .long .Lpop_ecx-.Lpop_edx - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .byte 0x04 /* RA at offset 4 now */ - .align 4 -.LENDFDEDLSI: - .previous - -/* - * Get the common code for the sigreturn entry points. 
- */ -#include "vsyscall-sigreturn_32.S" diff --git a/arch/x86/kernel/vsyscall_32.S b/arch/x86/kernel/vsyscall_32.S deleted file mode 100644 index a5ab3dc4fd25..000000000000 --- a/arch/x86/kernel/vsyscall_32.S +++ /dev/null @@ -1,15 +0,0 @@ -#include <linux/init.h> - -__INITDATA - - .globl vsyscall_int80_start, vsyscall_int80_end -vsyscall_int80_start: - .incbin "arch/x86/kernel/vsyscall-int80_32.so" -vsyscall_int80_end: - - .globl vsyscall_sysenter_start, vsyscall_sysenter_end -vsyscall_sysenter_start: - .incbin "arch/x86/kernel/vsyscall-sysenter_32.so" -vsyscall_sysenter_end: - -__FINIT diff --git a/arch/x86/kernel/vsyscall_32.lds.S b/arch/x86/kernel/vsyscall_32.lds.S deleted file mode 100644 index 4a8b0ed9b8fb..000000000000 --- a/arch/x86/kernel/vsyscall_32.lds.S +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Linker script for vsyscall DSO. The vsyscall page is an ELF shared - * object prelinked to its virtual address, and with only one read-only - * segment (that fits in one page). This script controls its layout. - */ -#include <asm/asm-offsets.h> - -SECTIONS -{ - . = VDSO_PRELINK_asm + SIZEOF_HEADERS; - - .hash : { *(.hash) } :text - .gnu.hash : { *(.gnu.hash) } - .dynsym : { *(.dynsym) } - .dynstr : { *(.dynstr) } - .gnu.version : { *(.gnu.version) } - .gnu.version_d : { *(.gnu.version_d) } - .gnu.version_r : { *(.gnu.version_r) } - - /* This linker script is used both with -r and with -shared. - For the layouts to match, we need to skip more than enough - space for the dynamic symbol table et al. If this amount - is insufficient, ld -shared will barf. Just increase it here. */ - . = VDSO_PRELINK_asm + 0x400; - - .text : { *(.text) } :text =0x90909090 - .note : { *(.note.*) } :text :note - .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr - .eh_frame : { KEEP (*(.eh_frame)) } :text - .dynamic : { *(.dynamic) } :text :dynamic - .useless : { - *(.got.plt) *(.got) - *(.data .data.* .gnu.linkonce.d.*) - *(.dynbss) - *(.bss .bss.* .gnu.linkonce.b.*) - } :text -} - -/* - * We must supply the ELF program headers explicitly to get just one - * PT_LOAD segment, and set the flags explicitly to make segments read-only. - */ -PHDRS -{ - text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */ - dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ - note PT_NOTE FLAGS(4); /* PF_R */ - eh_frame_hdr 0x6474e550; /* PT_GNU_EH_FRAME, but ld doesn't match the name */ -} - -/* - * This controls what symbols we export from the DSO. - */ -VERSION -{ - LINUX_2.5 { - global: - __kernel_vsyscall; - __kernel_sigreturn; - __kernel_rt_sigreturn; - - local: *; - }; -} - -/* The ELF entry point can be used to set the AT_SYSINFO value. 
*/ -ENTRY(__kernel_vsyscall); diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index ad4005c6d4a1..3f8242774580 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -43,7 +43,7 @@ #include <asm/vgtod.h> #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) -#define __syscall_clobber "r11","rcx","memory" +#define __syscall_clobber "r11","cx","memory" #define __pa_vsymbol(x) \ ({unsigned long v; \ extern char __vsyscall_0; \ @@ -190,7 +190,7 @@ time_t __vsyscall(1) vtime(time_t *t) long __vsyscall(2) vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache) { - unsigned int dummy, p; + unsigned int p; unsigned long j = 0; /* Fast cache - only recompute value once per jiffies and avoid @@ -205,7 +205,7 @@ vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache) p = tcache->blob[1]; } else if (__vgetcpu_mode == VGETCPU_RDTSCP) { /* Load per CPU data from RDTSCP */ - rdtscp(dummy, dummy, p); + native_read_tscp(&p); } else { /* Load per CPU data from GDT */ asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG)); @@ -297,7 +297,7 @@ static void __cpuinit vsyscall_set_cpu(int cpu) /* Store cpu number in limit so that it can be loaded quickly in user space in vgetcpu. 12 bits for the CPU and 8 bits for the node. */ - d = (unsigned long *)(cpu_gdt(cpu) + GDT_ENTRY_PER_CPU); + d = (unsigned long *)(get_cpu_gdt_table(cpu) + GDT_ENTRY_PER_CPU); *d = 0x0f40000000000ULL; *d |= cpu; *d |= (node & 0xf) << 12; @@ -319,7 +319,7 @@ cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg) return NOTIFY_DONE; } -static void __init map_vsyscall(void) +void __init map_vsyscall(void) { extern char __vsyscall_0; unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0); @@ -335,7 +335,6 @@ static int __init vsyscall_init(void) BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime)); BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE))); BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu)); - map_vsyscall(); #ifdef CONFIG_SYSCTL register_sysctl_table(kernel_root_table2); #endif diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index 77c25b307635..a66e9c1a0537 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -8,6 +8,7 @@ #include <asm/processor.h> #include <asm/uaccess.h> #include <asm/pgtable.h> +#include <asm/desc.h> EXPORT_SYMBOL(kernel_thread); @@ -34,13 +35,6 @@ EXPORT_SYMBOL(__copy_from_user_inatomic); EXPORT_SYMBOL(copy_page); EXPORT_SYMBOL(clear_page); -#ifdef CONFIG_SMP -extern void __write_lock_failed(rwlock_t *rw); -extern void __read_lock_failed(rwlock_t *rw); -EXPORT_SYMBOL(__write_lock_failed); -EXPORT_SYMBOL(__read_lock_failed); -#endif - /* Export string functions. We normally rely on gcc builtin for most of these, but gcc sometimes decides not to inline them. */ #undef memcpy @@ -60,3 +54,8 @@ EXPORT_SYMBOL(init_level4_pgt); EXPORT_SYMBOL(load_gs_index); EXPORT_SYMBOL(_proxy_pda); + +#ifdef CONFIG_PARAVIRT +/* Virtualized guests may want to use it */ +EXPORT_SYMBOL_GPL(cpu_gdt_descr); +#endif |
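Note: in vsyscall_64.c above, vgetcpu() reads a value that packs the CPU and node numbers - per the source comment, 12 bits for the CPU and 8 bits for the node - either from a per-CPU GDT descriptor via lsl or, on RDTSCP-capable parts, via native_read_tscp(). A sketch of just the packing and unpacking arithmetic; the actual GDT/TSC_AUX write needs ring 0, and the layout here follows that comment rather than the full descriptor-limit encoding:

/* Sketch of the vgetcpu cpu/node encoding: cpu in the low 12 bits,
 * node in the bits above, as a user-space-visible integer. */
#include <stdio.h>

static unsigned pack_cpu_node(unsigned cpu, unsigned node)
{
	return (cpu & 0xfff) | ((node & 0xff) << 12);
}

static void unpack_cpu_node(unsigned p, unsigned *cpu, unsigned *node)
{
	*cpu = p & 0xfff;
	*node = p >> 12;
}

int main(void)
{
	unsigned cpu, node;
	unsigned p = pack_cpu_node(5, 1);

	unpack_cpu_node(p, &cpu, &node);
	printf("packed %#x -> cpu %u node %u\n", p, cpu, node);
	return 0;
}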