diff options
Diffstat (limited to 'arch/x86')
123 files changed, 1768 insertions, 1141 deletions
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index cb1035f2b7e9..89dbf970e058 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -39,6 +39,7 @@ setup-y += printf.o string.o tty.o video.o version.o voyager.o setup-y += video-vga.o setup-y += video-vesa.o setup-y += video-bios.o + targets += $(setup-y) hostprogs-y := tools/build @@ -50,7 +51,7 @@ HOSTCFLAGS_build.o := $(LINUXINCLUDE) # that way we can complain to the user if the CPU is insufficient. cflags-i386 := cflags-x86_64 := -m32 -CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D__KERNEL__ \ +KBUILD_CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D__KERNEL__ \ $(cflags-$(ARCH)) \ -Wall -Wstrict-prototypes \ -march=i386 -mregparm=3 \ @@ -61,13 +62,13 @@ CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D__KERNEL__ \ $(call cc-option, -fno-unit-at-a-time)) \ $(call cc-option, -fno-stack-protector) \ $(call cc-option, -mpreferred-stack-boundary=2) -AFLAGS := $(CFLAGS) -D__ASSEMBLY__ +KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ $(obj)/zImage: IMAGE_OFFSET := 0x1000 -$(obj)/zImage: EXTRA_AFLAGS := $(SVGA_MODE) $(RAMDISK) +$(obj)/zImage: asflags-y := $(SVGA_MODE) $(RAMDISK) $(obj)/bzImage: IMAGE_OFFSET := 0x100000 -$(obj)/bzImage: EXTRA_CFLAGS := -D__BIG_KERNEL__ -$(obj)/bzImage: EXTRA_AFLAGS := $(SVGA_MODE) $(RAMDISK) -D__BIG_KERNEL__ +$(obj)/bzImage: ccflags-y := -D__BIG_KERNEL__ +$(obj)/bzImage: asflags-y := $(SVGA_MODE) $(RAMDISK) -D__BIG_KERNEL__ $(obj)/bzImage: BUILDFLAGS := -b quiet_cmd_image = BUILD $@ diff --git a/arch/x86/boot/compressed/Makefile_32 b/arch/x86/boot/compressed/Makefile_32 index 22613c652d22..e43ff7c56e6e 100644 --- a/arch/x86/boot/compressed/Makefile_32 +++ b/arch/x86/boot/compressed/Makefile_32 @@ -11,7 +11,7 @@ EXTRA_AFLAGS := -traditional LDFLAGS_vmlinux := -T hostprogs-y := relocs -CFLAGS := -m32 -D__KERNEL__ $(LINUX_INCLUDE) -O2 \ +KBUILD_CFLAGS := -m32 -D__KERNEL__ $(LINUX_INCLUDE) -O2 \ -fno-strict-aliasing -fPIC \ $(call cc-option,-ffreestanding) \ $(call 
cc-option,-fno-stack-protector) diff --git a/arch/x86/boot/compressed/Makefile_64 b/arch/x86/boot/compressed/Makefile_64 index dc6b3380cc45..7801e8dd90b2 100644 --- a/arch/x86/boot/compressed/Makefile_64 +++ b/arch/x86/boot/compressed/Makefile_64 @@ -6,11 +6,11 @@ targets := vmlinux vmlinux.bin vmlinux.bin.gz head_64.o misc_64.o piggy.o -CFLAGS := -m64 -D__KERNEL__ $(LINUXINCLUDE) -O2 \ +KBUILD_CFLAGS := -m64 -D__KERNEL__ $(LINUXINCLUDE) -O2 \ -fno-strict-aliasing -fPIC -mcmodel=small \ $(call cc-option, -ffreestanding) \ $(call cc-option, -fno-stack-protector) -AFLAGS := $(CFLAGS) -D__ASSEMBLY__ +KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ LDFLAGS := -m elf_x86_64 LDFLAGS_vmlinux := -T diff --git a/arch/x86/ia32/Makefile b/arch/x86/ia32/Makefile index cdae36435e21..e2edda255a84 100644 --- a/arch/x86/ia32/Makefile +++ b/arch/x86/ia32/Makefile @@ -18,18 +18,35 @@ $(obj)/syscall32_syscall.o: \ $(foreach F,sysenter syscall,$(obj)/vsyscall-$F.so) # Teach kbuild about targets -targets := $(foreach F,sysenter syscall,vsyscall-$F.o vsyscall-$F.so) +targets := $(foreach F,$(addprefix vsyscall-,sysenter syscall),\ + $F.o $F.so $F.so.dbg) # The DSO images are built using a special linker script quiet_cmd_syscall = SYSCALL $@ - cmd_syscall = $(CC) -m32 -nostdlib -shared -s \ + cmd_syscall = $(CC) -m32 -nostdlib -shared \ $(call ld-option, -Wl$(comma)--hash-style=sysv) \ -Wl,-soname=linux-gate.so.1 -o $@ \ -Wl,-T,$(filter-out FORCE,$^) -$(obj)/vsyscall-sysenter.so $(obj)/vsyscall-syscall.so: \ -$(obj)/vsyscall-%.so: $(src)/vsyscall.lds $(obj)/vsyscall-%.o FORCE +$(obj)/%.so: OBJCOPYFLAGS := -S +$(obj)/%.so: $(obj)/%.so.dbg FORCE + $(call if_changed,objcopy) + +$(obj)/vsyscall-sysenter.so.dbg $(obj)/vsyscall-syscall.so.dbg: \ +$(obj)/vsyscall-%.so.dbg: $(src)/vsyscall.lds $(obj)/vsyscall-%.o FORCE $(call if_changed,syscall) AFLAGS_vsyscall-sysenter.o = -m32 -Wa,-32 AFLAGS_vsyscall-syscall.o = -m32 -Wa,-32 + +vdsos := vdso32-sysenter.so vdso32-syscall.so + 
+quiet_cmd_vdso_install = INSTALL $@ + cmd_vdso_install = cp $(@:vdso32-%.so=$(obj)/vsyscall-%.so.dbg) \ + $(MODLIB)/vdso/$@ + +$(vdsos): + @mkdir -p $(MODLIB)/vdso + $(call cmd,vdso_install) + +vdso_install: $(vdsos) diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index 08781370256d..f82e1a94fcb7 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -40,7 +40,7 @@ static int load_aout_binary(struct linux_binprm *, struct pt_regs * regs); static int load_aout_library(struct file*); #ifdef CORE_DUMP -static int aout_core_dump(long signr, struct pt_regs * regs, struct file *file); +static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit); /* * fill in the user structure for a core dump.. @@ -148,7 +148,7 @@ if (file->f_op->llseek) { \ * dumping of the process results in another error.. */ -static int aout_core_dump(long signr, struct pt_regs * regs, struct file *file) +static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit) { mm_segment_t fs; int has_dumped = 0; @@ -168,13 +168,11 @@ static int aout_core_dump(long signr, struct pt_regs * regs, struct file *file) /* If the size of the dump file exceeds the rlimit, then see what would happen if we wrote the stack, but not the data area. */ - if ((dump.u_dsize+dump.u_ssize+1) * PAGE_SIZE > - current->signal->rlim[RLIMIT_CORE].rlim_cur) + if ((dump.u_dsize + dump.u_ssize + 1) * PAGE_SIZE > limit) dump.u_dsize = 0; /* Make sure we have enough room to write the stack and data areas. 
*/ - if ((dump.u_ssize+1) * PAGE_SIZE > - current->signal->rlim[RLIMIT_CORE].rlim_cur) + if ((dump.u_ssize + 1) * PAGE_SIZE > limit) dump.u_ssize = 0; /* make sure we actually have a data and stack area to dump */ @@ -422,6 +420,8 @@ beyond_if: (regs)->eflags = 0x200; (regs)->cs = __USER32_CS; (regs)->ss = __USER32_DS; + regs->r8 = regs->r9 = regs->r10 = regs->r11 = + regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0; set_fs(USER_DS); if (unlikely(current->ptrace & PT_PTRACED)) { if (current->ptrace & PT_TRACE_EXEC) diff --git a/arch/x86/ia32/ia32_binfmt.c b/arch/x86/ia32/ia32_binfmt.c index dffd2ac72747..5027650eb273 100644 --- a/arch/x86/ia32/ia32_binfmt.c +++ b/arch/x86/ia32/ia32_binfmt.c @@ -112,11 +112,8 @@ struct elf_prpsinfo char pr_psargs[ELF_PRARGSZ]; /* initial part of arg list */ }; -#define __STR(x) #x -#define STR(x) __STR(x) - #define _GET_SEG(x) \ - ({ __u32 seg; asm("movl %%" STR(x) ",%0" : "=r"(seg)); seg; }) + ({ __u32 seg; asm("movl %%" __stringify(x) ",%0" : "=r"(seg)); seg; }) /* Assumes current==process to be dumped */ #define ELF_CORE_COPY_REGS(pr_reg, regs) \ @@ -188,6 +185,7 @@ elf_core_copy_task_fpregs(struct task_struct *tsk, struct pt_regs *regs, elf_fpr } #define ELF_CORE_COPY_XFPREGS 1 +#define ELF_CORE_XFPREG_TYPE NT_PRXFPREG static inline int elf_core_copy_task_xfpregs(struct task_struct *t, elf_fpxregset_t *xfpu) { @@ -291,7 +289,6 @@ static void elf32_init(struct pt_regs *regs) static ctl_table abi_table2[] = { { - .ctl_name = 99, .procname = "vsyscall32", .data = &sysctl_vsyscall32, .maxlen = sizeof(int), diff --git a/arch/x86/ia32/ipc32.c b/arch/x86/ia32/ipc32.c index 2e1869ec4db4..7b3342e5aab5 100644 --- a/arch/x86/ia32/ipc32.c +++ b/arch/x86/ia32/ipc32.c @@ -9,8 +9,6 @@ #include <linux/ipc.h> #include <linux/compat.h> -#include <asm/ipc.h> - asmlinkage long sys32_ipc(u32 call, int first, int second, int third, compat_uptr_t ptr, u32 fifth) diff --git a/arch/x86/ia32/ptrace32.c b/arch/x86/ia32/ptrace32.c index 
4a233ad6269c..f52770ef0ee3 100644 --- a/arch/x86/ia32/ptrace32.c +++ b/arch/x86/ia32/ptrace32.c @@ -228,6 +228,8 @@ static long ptrace32_siginfo(unsigned request, u32 pid, u32 addr, u32 data) return ret; } +#define COMPAT_GDT_ENTRY_TLS_MIN 6 + asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data) { struct task_struct *child; @@ -246,8 +248,6 @@ asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data) case PTRACE_SYSCALL: case PTRACE_OLDSETOPTIONS: case PTRACE_SETOPTIONS: - case PTRACE_SET_THREAD_AREA: - case PTRACE_GET_THREAD_AREA: return sys_ptrace(request, pid, addr, data); default: @@ -271,6 +271,12 @@ asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data) case PTRACE_SETSIGINFO: case PTRACE_GETSIGINFO: return ptrace32_siginfo(request, pid, addr, data); + + case PTRACE_SET_THREAD_AREA: + case PTRACE_GET_THREAD_AREA: + return sys_ptrace(request, pid, + addr + GDT_ENTRY_TLS_MIN - COMPAT_GDT_ENTRY_TLS_MIN, + data); } child = ptrace_get_task_struct(pid); diff --git a/arch/x86/kernel/.gitignore b/arch/x86/kernel/.gitignore index 40836ad9079c..4ea38a39aed4 100644 --- a/arch/x86/kernel/.gitignore +++ b/arch/x86/kernel/.gitignore @@ -1 +1,2 @@ vsyscall.lds +vsyscall_32.lds diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 45855c97923e..38573340b143 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -3,3 +3,7 @@ include ${srctree}/arch/x86/kernel/Makefile_32 else include ${srctree}/arch/x86/kernel/Makefile_64 endif + +# Workaround to delete .lds files with make clean +# The problem is that we do not enter Makefile_32 with make clean. +clean-files := vsyscall*.lds vsyscall*.so diff --git a/arch/x86/kernel/Makefile_32 b/arch/x86/kernel/Makefile_32 index 7ff02063b858..a3fa11f8f460 100644 --- a/arch/x86/kernel/Makefile_32 +++ b/arch/x86/kernel/Makefile_32 @@ -51,7 +51,7 @@ obj-$(CONFIG_SCx200) += scx200_32.o # We must build both images before we can assemble it. 
# Note: kbuild does not track this dependency due to usage of .incbin $(obj)/vsyscall_32.o: $(obj)/vsyscall-int80_32.so $(obj)/vsyscall-sysenter_32.so -targets += $(foreach F,int80 sysenter,vsyscall-$F.o vsyscall-$F.so) +targets += $(foreach F,int80 sysenter,vsyscall-$F_32.o vsyscall-$F_32.so) targets += vsyscall-note_32.o vsyscall_32.lds # The DSO images are built using a special linker script. diff --git a/arch/x86/kernel/acpi/wakeup_32.S b/arch/x86/kernel/acpi/wakeup_32.S index f22ba8534d26..a97313b1270e 100644 --- a/arch/x86/kernel/acpi/wakeup_32.S +++ b/arch/x86/kernel/acpi/wakeup_32.S @@ -11,7 +11,7 @@ # # If physical address of wakeup_code is 0x12345, BIOS should call us with # cs = 0x1234, eip = 0x05 -# +# #define BEEP \ inb $97, %al; \ @@ -52,7 +52,6 @@ wakeup_code: BEEP 1: mov $(wakeup_stack - wakeup_code), %sp # Private stack is needed for ASUS board - movw $0x0e00 + 'S', %fs:(0x12) pushl $0 # Kill any dangerous flags popfl @@ -90,9 +89,6 @@ wakeup_code: # make sure %cr4 is set correctly (features, etc) movl real_save_cr4 - wakeup_code, %eax movl %eax, %cr4 - movw $0xb800, %ax - movw %ax,%fs - movw $0x0e00 + 'i', %fs:(0x12) # need a gdt -- use lgdtl to force 32-bit operands, in case # the GDT is located past 16 megabytes. @@ -102,8 +98,6 @@ wakeup_code: movl %eax, %cr0 jmp 1f 1: - movw $0x0e00 + 'n', %fs:(0x14) - movl real_magic - wakeup_code, %eax cmpl $0x12345678, %eax jne bogus_real_magic @@ -122,13 +116,11 @@ real_save_cr4: .long 0 real_magic: .long 0 video_mode: .long 0 realmode_flags: .long 0 -beep_flags: .long 0 real_efer_save_restore: .long 0 real_save_efer_edx: .long 0 real_save_efer_eax: .long 0 bogus_real_magic: - movw $0x0e00 + 'B', %fs:(0x12) jmp bogus_real_magic /* This code uses an extended set of video mode numbers. 
These include: @@ -194,7 +186,6 @@ wakeup_pmode_return: movw %ax, %es movw %ax, %fs movw %ax, %gs - movw $0x0e00 + 'u', 0xb8016 # reload the gdt, as we need the full 32 bit address lgdt saved_gdt @@ -218,7 +209,6 @@ wakeup_pmode_return: jmp *%eax bogus_magic: - movw $0x0e00 + 'B', 0xb8018 jmp bogus_magic diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S index 8b4357e1efe0..55608ec2ed72 100644 --- a/arch/x86/kernel/acpi/wakeup_64.S +++ b/arch/x86/kernel/acpi/wakeup_64.S @@ -41,7 +41,6 @@ wakeup_code: # Running in *copy* of this code, somewhere in low 1MB. - movb $0xa1, %al ; outb %al, $0x80 cli cld # setup data segment @@ -65,11 +64,6 @@ wakeup_code: cmpl $0x12345678, %eax jne bogus_real_magic - call verify_cpu # Verify the cpu supports long - # mode - testl %eax, %eax - jnz no_longmode - testl $1, realmode_flags - wakeup_code jz 1f lcall $0xc000,$3 @@ -84,12 +78,6 @@ wakeup_code: call mode_set 1: - movw $0xb800, %ax - movw %ax,%fs - movw $0x0e00 + 'L', %fs:(0x10) - - movb $0xa2, %al ; outb %al, $0x80 - mov %ds, %ax # Find 32bit wakeup_code addr movzx %ax, %esi # (Convert %ds:gdt to a liner ptr) shll $4, %esi @@ -117,14 +105,10 @@ wakeup_32_vector: .code32 wakeup_32: # Running in this code, but at low address; paging is not yet turned on. 
- movb $0xa5, %al ; outb %al, $0x80 movl $__KERNEL_DS, %eax movl %eax, %ds - movw $0x0e00 + 'i', %ds:(0xb8012) - movb $0xa8, %al ; outb %al, $0x80; - /* * Prepare for entering 64bits mode */ @@ -200,16 +184,11 @@ wakeup_long64: */ lgdt cpu_gdt_descr - movw $0x0e00 + 'n', %ds:(0xb8014) - movb $0xa9, %al ; outb %al, $0x80 - movq saved_magic, %rax movq $0x123456789abcdef0, %rdx cmpq %rdx, %rax jne bogus_64_magic - movw $0x0e00 + 'u', %ds:(0xb8016) - nop nop movw $__KERNEL_DS, %ax @@ -220,13 +199,11 @@ wakeup_long64: movw %ax, %gs movq saved_rsp, %rsp - movw $0x0e00 + 'x', %ds:(0xb8018) movq saved_rbx, %rbx movq saved_rdi, %rdi movq saved_rsi, %rsi movq saved_rbp, %rbp - movw $0x0e00 + '!', %ds:(0xb801a) movq saved_rip, %rax jmp *%rax @@ -256,21 +233,12 @@ realmode_flags: .quad 0 .code16 bogus_real_magic: - movb $0xba,%al ; outb %al,$0x80 jmp bogus_real_magic .code64 bogus_64_magic: - movb $0xb3,%al ; outb %al,$0x80 jmp bogus_64_magic -.code16 -no_longmode: - movb $0xbc,%al ; outb %al,$0x80 - jmp no_longmode - -#include "../verify_cpu_64.S" - /* This code uses an extended set of video mode numbers. These include: * Aliases for standard modes * NORMAL_VGA (-1) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 11b03d3c6fda..3bd2688bd443 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -63,11 +63,11 @@ __setup("noreplace-paravirt", setup_noreplace_paravirt); /* Use inline assembly to define this because the nops are defined as inline assembly strings in the include files and we cannot get them easily into strings. 
*/ -asm("\t.data\nintelnops: " +asm("\t.section .rodata, \"a\"\nintelnops: " GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4 GENERIC_NOP5 GENERIC_NOP6 GENERIC_NOP7 GENERIC_NOP8); -extern unsigned char intelnops[]; -static unsigned char *intel_nops[ASM_NOP_MAX+1] = { +extern const unsigned char intelnops[]; +static const unsigned char *const intel_nops[ASM_NOP_MAX+1] = { NULL, intelnops, intelnops + 1, @@ -81,11 +81,11 @@ static unsigned char *intel_nops[ASM_NOP_MAX+1] = { #endif #ifdef K8_NOP1 -asm("\t.data\nk8nops: " +asm("\t.section .rodata, \"a\"\nk8nops: " K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6 K8_NOP7 K8_NOP8); -extern unsigned char k8nops[]; -static unsigned char *k8_nops[ASM_NOP_MAX+1] = { +extern const unsigned char k8nops[]; +static const unsigned char *const k8_nops[ASM_NOP_MAX+1] = { NULL, k8nops, k8nops + 1, @@ -99,11 +99,11 @@ static unsigned char *k8_nops[ASM_NOP_MAX+1] = { #endif #ifdef K7_NOP1 -asm("\t.data\nk7nops: " +asm("\t.section .rodata, \"a\"\nk7nops: " K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6 K7_NOP7 K7_NOP8); -extern unsigned char k7nops[]; -static unsigned char *k7_nops[ASM_NOP_MAX+1] = { +extern const unsigned char k7nops[]; +static const unsigned char *const k7_nops[ASM_NOP_MAX+1] = { NULL, k7nops, k7nops + 1, @@ -116,28 +116,49 @@ static unsigned char *k7_nops[ASM_NOP_MAX+1] = { }; #endif +#ifdef P6_NOP1 +asm("\t.section .rodata, \"a\"\np6nops: " + P6_NOP1 P6_NOP2 P6_NOP3 P6_NOP4 P6_NOP5 P6_NOP6 + P6_NOP7 P6_NOP8); +extern const unsigned char p6nops[]; +static const unsigned char *const p6_nops[ASM_NOP_MAX+1] = { + NULL, + p6nops, + p6nops + 1, + p6nops + 1 + 2, + p6nops + 1 + 2 + 3, + p6nops + 1 + 2 + 3 + 4, + p6nops + 1 + 2 + 3 + 4 + 5, + p6nops + 1 + 2 + 3 + 4 + 5 + 6, + p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, +}; +#endif + #ifdef CONFIG_X86_64 extern char __vsyscall_0; -static inline unsigned char** find_nop_table(void) +static inline const unsigned char*const * find_nop_table(void) { - return k8_nops; + return 
boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || + boot_cpu_data.x86 < 6 ? k8_nops : p6_nops; } #else /* CONFIG_X86_64 */ -static struct nop { +static const struct nop { int cpuid; - unsigned char **noptable; + const unsigned char *const *noptable; } noptypes[] = { { X86_FEATURE_K8, k8_nops }, { X86_FEATURE_K7, k7_nops }, + { X86_FEATURE_P4, p6_nops }, + { X86_FEATURE_P3, p6_nops }, { -1, NULL } }; -static unsigned char** find_nop_table(void) +static const unsigned char*const * find_nop_table(void) { - unsigned char **noptable = intel_nops; + const unsigned char *const *noptable = intel_nops; int i; for (i = 0; noptypes[i].cpuid >= 0; i++) { @@ -154,7 +175,7 @@ static unsigned char** find_nop_table(void) /* Use this to add nops to a buffer, then text_poke the whole buffer. */ static void add_nops(void *insns, unsigned int len) { - unsigned char **noptable = find_nop_table(); + const unsigned char *const *noptable = find_nop_table(); while (len > 0) { unsigned int noplen = len; @@ -369,8 +390,8 @@ void apply_paravirt(struct paravirt_patch_site *start, BUG_ON(p->len > MAX_PATCH_LEN); /* prep the buffer with the original instructions */ memcpy(insnbuf, p->instr, p->len); - used = paravirt_ops.patch(p->instrtype, p->clobbers, insnbuf, - (unsigned long)p->instr, p->len); + used = pv_init_ops.patch(p->instrtype, p->clobbers, insnbuf, + (unsigned long)p->instr, p->len); BUG_ON(used > p->len); @@ -415,9 +436,6 @@ void __init alternative_instructions(void) alternatives_smp_unlock(__smp_locks, __smp_locks_end, _text, _etext); } - free_init_pages("SMP alternatives", - (unsigned long)__smp_locks, - (unsigned long)__smp_locks_end); } else { alternatives_smp_module_add(NULL, "core kernel", __smp_locks, __smp_locks_end, @@ -428,6 +446,11 @@ void __init alternative_instructions(void) apply_paravirt(__parainstructions, __parainstructions_end); local_irq_restore(flags); + if (smp_alt_once) + free_init_pages("SMP alternatives", + (unsigned long)__smp_locks, + (unsigned 
long)__smp_locks_end); + restart_nmi(); #ifdef CONFIG_X86_MCE restart_mce(); diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 3d67ae18d762..793341fffc81 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -1277,6 +1277,7 @@ void smp_spurious_interrupt(struct pt_regs *regs) /* see sw-dev-man vol 3, chapter 7.4.13.5 */ printk(KERN_INFO "spurious APIC interrupt on CPU#%d, " "should never happen.\n", smp_processor_id()); + __get_cpu_var(irq_stat).irq_spurious_count++; irq_exit(); } diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 09b82093bc75..f47bc493dba9 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -974,15 +974,12 @@ void __init setup_boot_APIC_clock (void) */ void __cpuinit check_boot_apic_timer_broadcast(void) { - struct clock_event_device *levt = &per_cpu(lapic_events, boot_cpu_id); - if (!disable_apic_timer || (lapic_clockevent.features & CLOCK_EVT_FEAT_DUMMY)) return; printk(KERN_INFO "AMD C1E detected late. 
Force timer broadcast.\n"); lapic_clockevent.features |= CLOCK_EVT_FEAT_DUMMY; - levt->features |= CLOCK_EVT_FEAT_DUMMY; local_irq_enable(); clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE, &boot_cpu_id); @@ -1143,6 +1140,7 @@ asmlinkage void smp_spurious_interrupt(void) if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) ack_APIC_irq(); + add_pda(irq_spurious_count, 1); irq_exit(); } diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 8029742c0fc1..f1b7cdda82b3 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -116,12 +116,14 @@ void foo(void) #ifdef CONFIG_PARAVIRT BLANK(); - OFFSET(PARAVIRT_enabled, paravirt_ops, paravirt_enabled); - OFFSET(PARAVIRT_irq_disable, paravirt_ops, irq_disable); - OFFSET(PARAVIRT_irq_enable, paravirt_ops, irq_enable); - OFFSET(PARAVIRT_irq_enable_sysexit, paravirt_ops, irq_enable_sysexit); - OFFSET(PARAVIRT_iret, paravirt_ops, iret); - OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0); + OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled); + OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops); + OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops); + OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable); + OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable); + OFFSET(PV_CPU_iret, pv_cpu_ops, iret); + OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit); + OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0); #endif #ifdef CONFIG_XEN diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index dcf6bbb1c7c0..5f8af875f457 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -4,6 +4,7 @@ #include <asm/io.h> #include <asm/processor.h> #include <asm/apic.h> +#include <asm/mach_apic.h> #include "cpu.h" @@ -45,13 +46,17 @@ static __cpuinit int amd_apic_timer_broken(void) case CPUID_XFAM_10H: case CPUID_XFAM_11H: rdmsr(MSR_K8_ENABLE_C1E, lo, hi); - if (lo & ENABLE_C1E_MASK) + if (lo & ENABLE_C1E_MASK) { 
+ if (smp_processor_id() != boot_cpu_physical_apicid) + printk(KERN_INFO "AMD C1E detected late. " + " Force timer broadcast.\n"); return 1; - break; - default: - /* err on the side of caution */ + } + break; + default: + /* err on the side of caution */ return 1; - } + } return 0; } #endif diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index ffd01e5dcb52..2ca43ba32bc0 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -595,7 +595,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) dmi_check_system(sw_any_bug_dmi_table); if (bios_with_sw_any_bug && cpus_weight(policy->cpus) == 1) { policy->shared_type = CPUFREQ_SHARED_TYPE_ALL; - policy->cpus = cpu_core_map[cpu]; + policy->cpus = per_cpu(cpu_core_map, cpu); } #endif diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c index 8eb414b906d2..793eae854f4f 100644 --- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c +++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c @@ -200,7 +200,7 @@ static int cpufreq_p4_cpu_init(struct cpufreq_policy *policy) unsigned int i; #ifdef CONFIG_SMP - policy->cpus = cpu_sibling_map[policy->cpu]; + policy->cpus = per_cpu(cpu_sibling_map, policy->cpu); #endif /* Errata workaround */ diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c index 7decd6a50ffa..f3686a5f2308 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c @@ -565,7 +565,7 @@ static unsigned int powernow_get(unsigned int cpu) } -static int __init acer_cpufreq_pst(struct dmi_system_id *d) +static int __init acer_cpufreq_pst(const struct dmi_system_id *d) { printk(KERN_WARNING "%s laptop with broken PST tables in BIOS detected.\n", d->ident); printk(KERN_WARNING "You need to downgrade to 3A21 (09/09/2002), or try a newer BIOS than 3A71 (01/20/2003)\n"); diff --git 
a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index b273b69cfddf..c06ac680c9ca 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -57,7 +57,7 @@ static struct powernow_k8_data *powernow_data[NR_CPUS]; static int cpu_family = CPU_OPTERON; #ifndef CONFIG_SMP -static cpumask_t cpu_core_map[1]; +DEFINE_PER_CPU(cpumask_t, cpu_core_map); #endif /* Return a frequency in MHz, given an input fid */ @@ -667,7 +667,7 @@ static int fill_powernow_table(struct powernow_k8_data *data, struct pst_s *pst, dprintk("cfid 0x%x, cvid 0x%x\n", data->currfid, data->currvid); data->powernow_table = powernow_table; - if (first_cpu(cpu_core_map[data->cpu]) == data->cpu) + if (first_cpu(per_cpu(cpu_core_map, data->cpu)) == data->cpu) print_basics(data); for (j = 0; j < data->numps; j++) @@ -821,7 +821,7 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) /* fill in data */ data->numps = data->acpi_data.state_count; - if (first_cpu(cpu_core_map[data->cpu]) == data->cpu) + if (first_cpu(per_cpu(cpu_core_map, data->cpu)) == data->cpu) print_basics(data); powernow_k8_acpi_pst_values(data, 0); @@ -1214,7 +1214,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) if (cpu_family == CPU_HW_PSTATE) pol->cpus = cpumask_of_cpu(pol->cpu); else - pol->cpus = cpu_core_map[pol->cpu]; + pol->cpus = per_cpu(cpu_core_map, pol->cpu); data->available_cores = &(pol->cpus); /* Take a crude guess here. 
@@ -1281,7 +1281,7 @@ static unsigned int powernowk8_get (unsigned int cpu) cpumask_t oldmask = current->cpus_allowed; unsigned int khz = 0; - data = powernow_data[first_cpu(cpu_core_map[cpu])]; + data = powernow_data[first_cpu(per_cpu(cpu_core_map, cpu))]; if (!data) return -EINVAL; diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c index 36685e8f7be1..14d68aa301ee 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c @@ -322,7 +322,7 @@ static int speedstep_cpu_init(struct cpufreq_policy *policy) /* only run on CPU to be set, or on its sibling */ #ifdef CONFIG_SMP - policy->cpus = cpu_sibling_map[policy->cpu]; + policy->cpus = per_cpu(cpu_sibling_map, policy->cpu); #endif cpus_allowed = current->cpus_allowed; diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index dc4e08147b1f..cc8c501b9f39 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -8,6 +8,7 @@ #include <linux/module.h> #include <asm/processor.h> +#include <asm/pgtable.h> #include <asm/msr.h> #include <asm/uaccess.h> @@ -19,8 +20,6 @@ #include <mach_apic.h> #endif -extern int trap_init_f00f_bug(void); - #ifdef CONFIG_X86_INTEL_USERCOPY /* * Alignment at which movsl is preferred for bulk memory copies. @@ -95,6 +94,20 @@ static int __cpuinit num_cpu_cores(struct cpuinfo_x86 *c) return 1; } +#ifdef CONFIG_X86_F00F_BUG +static void __cpuinit trap_init_f00f_bug(void) +{ + __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO); + + /* + * Update the IDT descriptor and reload the IDT so that + * it uses the read-only mapped virtual address. 
+ */ + idt_descr.address = fix_to_virt(FIX_F00F_IDT); + load_idt(&idt_descr); +} +#endif + static void __cpuinit init_intel(struct cpuinfo_x86 *c) { unsigned int l2 = 0; diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index db6c25aa5776..297a24116949 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -170,15 +170,15 @@ union l3_cache { unsigned val; }; -static const unsigned short assocs[] = { +static unsigned short assocs[] __cpuinitdata = { [1] = 1, [2] = 2, [4] = 4, [6] = 8, [8] = 16, [0xa] = 32, [0xb] = 48, [0xc] = 64, [0xf] = 0xffff // ?? }; -static const unsigned char levels[] = { 1, 1, 2, 3 }; -static const unsigned char types[] = { 1, 2, 3, 3 }; +static unsigned char levels[] __cpuinitdata = { 1, 1, 2, 3 }; +static unsigned char types[] __cpuinitdata = { 1, 2, 3, 3 }; static void __cpuinit amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, union _cpuid4_leaf_ebx *ebx, @@ -493,12 +493,17 @@ static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index) } } #else -static void __init cache_shared_cpu_map_setup(unsigned int cpu, int index) {} -static void __init cache_remove_shared_cpu_map(unsigned int cpu, int index) {} +static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) {} +static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index) {} #endif static void free_cache_attributes(unsigned int cpu) { + int i; + + for (i = 0; i < num_cache_leaves; i++) + cache_remove_shared_cpu_map(cpu, i); + kfree(cpuid4_info[cpu]); cpuid4_info[cpu] = NULL; } @@ -506,8 +511,8 @@ static void free_cache_attributes(unsigned int cpu) static int __cpuinit detect_cache_attributes(unsigned int cpu) { struct _cpuid4_info *this_leaf; - unsigned long j; - int retval; + unsigned long j; + int retval; cpumask_t oldmask; if (num_cache_leaves == 0) @@ -524,19 +529,26 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu) goto out; /* 
Do cpuid and store the results */ - retval = 0; for (j = 0; j < num_cache_leaves; j++) { this_leaf = CPUID4_INFO_IDX(cpu, j); retval = cpuid4_cache_lookup(j, this_leaf); - if (unlikely(retval < 0)) + if (unlikely(retval < 0)) { + int i; + + for (i = 0; i < j; i++) + cache_remove_shared_cpu_map(cpu, i); break; + } cache_shared_cpu_map_setup(cpu, j); } set_cpus_allowed(current, oldmask); out: - if (retval) - free_cache_attributes(cpu); + if (retval) { + kfree(cpuid4_info[cpu]); + cpuid4_info[cpu] = NULL; + } + return retval; } @@ -669,7 +681,7 @@ static struct kobj_type ktype_percpu_entry = { .sysfs_ops = &sysfs_ops, }; -static void cpuid4_cache_sysfs_exit(unsigned int cpu) +static void __cpuinit cpuid4_cache_sysfs_exit(unsigned int cpu) { kfree(cache_kobject[cpu]); kfree(index_kobject[cpu]); @@ -680,13 +692,14 @@ static void cpuid4_cache_sysfs_exit(unsigned int cpu) static int __cpuinit cpuid4_cache_sysfs_init(unsigned int cpu) { + int err; if (num_cache_leaves == 0) return -ENOENT; - detect_cache_attributes(cpu); - if (cpuid4_info[cpu] == NULL) - return -ENOENT; + err = detect_cache_attributes(cpu); + if (err) + return err; /* Allocate all required memory */ cache_kobject[cpu] = kzalloc(sizeof(struct kobject), GFP_KERNEL); @@ -705,13 +718,15 @@ err_out: return -ENOMEM; } +static cpumask_t cache_dev_map = CPU_MASK_NONE; + /* Add/Remove cache interface for CPU device */ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) { unsigned int cpu = sys_dev->id; unsigned long i, j; struct _index_kobject *this_object; - int retval = 0; + int retval; retval = cpuid4_cache_sysfs_init(cpu); if (unlikely(retval < 0)) @@ -721,6 +736,10 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) kobject_set_name(cache_kobject[cpu], "%s", "cache"); cache_kobject[cpu]->ktype = &ktype_percpu_entry; retval = kobject_register(cache_kobject[cpu]); + if (retval < 0) { + cpuid4_cache_sysfs_exit(cpu); + return retval; + } for (i = 0; i < num_cache_leaves; i++) { 
this_object = INDEX_KOBJECT_PTR(cpu,i); @@ -740,6 +759,9 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) break; } } + if (!retval) + cpu_set(cpu, cache_dev_map); + return retval; } @@ -750,13 +772,14 @@ static void __cpuinit cache_remove_dev(struct sys_device * sys_dev) if (cpuid4_info[cpu] == NULL) return; - for (i = 0; i < num_cache_leaves; i++) { - cache_remove_shared_cpu_map(cpu, i); + if (!cpu_isset(cpu, cache_dev_map)) + return; + cpu_clear(cpu, cache_dev_map); + + for (i = 0; i < num_cache_leaves; i++) kobject_unregister(&(INDEX_KOBJECT_PTR(cpu,i)->kobj)); - } kobject_unregister(cache_kobject[cpu]); cpuid4_cache_sysfs_exit(cpu); - return; } static int __cpuinit cacheinfo_cpu_callback(struct notifier_block *nfb, @@ -781,7 +804,7 @@ static int __cpuinit cacheinfo_cpu_callback(struct notifier_block *nfb, static struct notifier_block __cpuinitdata cacheinfo_cpu_notifier = { - .notifier_call = cacheinfo_cpu_callback, + .notifier_call = cacheinfo_cpu_callback, }; static int __cpuinit cache_sysfs_init(void) @@ -791,13 +814,15 @@ static int __cpuinit cache_sysfs_init(void) if (num_cache_leaves == 0) return 0; - register_hotcpu_notifier(&cacheinfo_cpu_notifier); - for_each_online_cpu(i) { - cacheinfo_cpu_callback(&cacheinfo_cpu_notifier, CPU_ONLINE, - (void *)(long)i); - } + int err; + struct sys_device *sys_dev = get_cpu_sysdev(i); + err = cache_add_dev(sys_dev); + if (err) + return err; + } + register_hotcpu_notifier(&cacheinfo_cpu_notifier); return 0; } diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c index 1509edfb2313..be4dabfee1f5 100644 --- a/arch/x86/kernel/cpu/mcheck/p4.c +++ b/arch/x86/kernel/cpu/mcheck/p4.c @@ -61,6 +61,7 @@ fastcall void smp_thermal_interrupt(struct pt_regs *regs) { irq_enter(); vendor_thermal_interrupt(regs); + __get_cpu_var(irq_stat).irq_thermal_count++; irq_exit(); } diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 
1203dc5ab87a..24885be5c48c 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -131,17 +131,19 @@ static __cpuinit int thermal_throttle_cpu_callback(struct notifier_block *nfb, { unsigned int cpu = (unsigned long)hcpu; struct sys_device *sys_dev; - int err; + int err = 0; sys_dev = get_cpu_sysdev(cpu); switch (action) { - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: mutex_lock(&therm_cpu_lock); err = thermal_throttle_add_dev(sys_dev); mutex_unlock(&therm_cpu_lock); WARN_ON(err); break; + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: case CPU_DEAD: case CPU_DEAD_FROZEN: mutex_lock(&therm_cpu_lock); @@ -149,10 +151,10 @@ static __cpuinit int thermal_throttle_cpu_callback(struct notifier_block *nfb, mutex_unlock(&therm_cpu_lock); break; } - return NOTIFY_OK; + return err ? NOTIFY_BAD : NOTIFY_OK; } -static struct notifier_block thermal_throttle_cpu_notifier = +static struct notifier_block thermal_throttle_cpu_notifier __cpuinitdata = { .notifier_call = thermal_throttle_cpu_callback, }; diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index c48b6fea5ab4..5e4be30ff903 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -738,13 +738,7 @@ void mtrr_ap_init(void) */ void mtrr_save_state(void) { - int cpu = get_cpu(); - - if (cpu == 0) - mtrr_save_fixed_ranges(NULL); - else - smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1, 1); - put_cpu(); + smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1, 1); } static int __init mtrr_init_finialize(void) diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index 93fecd4b03de..54cdbf1a40f1 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -34,7 +34,7 @@ struct wd_ops { u64 checkbit; }; -static struct wd_ops *wd_ops; +static const struct wd_ops *wd_ops; /* 
this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's * offset from MSR_P4_BSU_ESCR0. It will be the max for all platforms (for now) @@ -317,7 +317,7 @@ static void single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz); } -static struct wd_ops k7_wd_ops = { +static const struct wd_ops k7_wd_ops = { .reserve = single_msr_reserve, .unreserve = single_msr_unreserve, .setup = setup_k7_watchdog, @@ -380,7 +380,7 @@ static void p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) write_watchdog_counter32(wd->perfctr_msr, NULL,nmi_hz); } -static struct wd_ops p6_wd_ops = { +static const struct wd_ops p6_wd_ops = { .reserve = single_msr_reserve, .unreserve = single_msr_unreserve, .setup = setup_p6_watchdog, @@ -532,7 +532,7 @@ static void p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz); } -static struct wd_ops p4_wd_ops = { +static const struct wd_ops p4_wd_ops = { .reserve = p4_reserve, .unreserve = p4_unreserve, .setup = setup_p4_watchdog, @@ -550,6 +550,8 @@ static struct wd_ops p4_wd_ops = { #define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL #define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK +static struct wd_ops intel_arch_wd_ops; + static int setup_intel_arch_watchdog(unsigned nmi_hz) { unsigned int ebx; @@ -591,11 +593,11 @@ static int setup_intel_arch_watchdog(unsigned nmi_hz) wd->perfctr_msr = perfctr_msr; wd->evntsel_msr = evntsel_msr; wd->cccr_msr = 0; //unused - wd_ops->checkbit = 1ULL << (eax.split.bit_width - 1); + intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1); return 1; } -static struct wd_ops intel_arch_wd_ops = { +static struct wd_ops intel_arch_wd_ops __read_mostly = { .reserve = single_msr_reserve, .unreserve = single_msr_unreserve, .setup = setup_intel_arch_watchdog, diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c 
index 1e31b6caffb1..879a0f789b1e 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -122,7 +122,8 @@ static int show_cpuinfo(struct seq_file *m, void *v) #ifdef CONFIG_X86_HT if (c->x86_max_cores * smp_num_siblings > 1) { seq_printf(m, "physical id\t: %d\n", c->phys_proc_id); - seq_printf(m, "siblings\t: %d\n", cpus_weight(cpu_core_map[n])); + seq_printf(m, "siblings\t: %d\n", + cpus_weight(per_cpu(cpu_core_map, n))); seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id); seq_printf(m, "cpu cores\t: %d\n", c->booted_cores); } diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index f4548c93ccf5..70dcf912d9fb 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -43,8 +43,6 @@ static struct class *cpuid_class; -#ifdef CONFIG_SMP - struct cpuid_command { u32 reg; u32 *data; @@ -62,25 +60,11 @@ static inline void do_cpuid(int cpu, u32 reg, u32 * data) { struct cpuid_command cmd; - preempt_disable(); - if (cpu == smp_processor_id()) { - cpuid(reg, &data[0], &data[1], &data[2], &data[3]); - } else { - cmd.reg = reg; - cmd.data = data; + cmd.reg = reg; + cmd.data = data; - smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1, 1); - } - preempt_enable(); + smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1, 1); } -#else /* ! CONFIG_SMP */ - -static inline void do_cpuid(int cpu, u32 reg, u32 * data) -{ - cpuid(reg, &data[0], &data[1], &data[2], &data[3]); -} - -#endif /* ! 
CONFIG_SMP */ static loff_t cpuid_seek(struct file *file, loff_t offset, int orig) { @@ -150,7 +134,7 @@ static const struct file_operations cpuid_fops = { .open = cpuid_open, }; -static int cpuid_device_create(int i) +static int __cpuinit cpuid_device_create(int i) { int err = 0; struct device *dev; @@ -161,7 +145,9 @@ static int cpuid_device_create(int i) return err; } -static int cpuid_class_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) +static int __cpuinit cpuid_class_cpu_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) { unsigned int cpu = (unsigned long)hcpu; diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c index 0f4d5e209e9b..e422b8159f69 100644 --- a/arch/x86/kernel/e820_64.c +++ b/arch/x86/kernel/e820_64.c @@ -24,7 +24,7 @@ #include <asm/page.h> #include <asm/e820.h> #include <asm/proto.h> -#include <asm/bootsetup.h> +#include <asm/setup.h> #include <asm/sections.h> struct e820map e820; @@ -68,10 +68,15 @@ static inline int bad_addr(unsigned long *addrp, unsigned long size) /* initrd */ #ifdef CONFIG_BLK_DEV_INITRD - if (LOADER_TYPE && INITRD_START && last >= INITRD_START && - addr < INITRD_START+INITRD_SIZE) { - *addrp = PAGE_ALIGN(INITRD_START + INITRD_SIZE); - return 1; + if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) { + unsigned long ramdisk_image = boot_params.hdr.ramdisk_image; + unsigned long ramdisk_size = boot_params.hdr.ramdisk_size; + unsigned long ramdisk_end = ramdisk_image+ramdisk_size; + + if (last >= ramdisk_image && addr < ramdisk_end) { + *addrp = PAGE_ALIGN(ramdisk_end); + return 1; + } } #endif /* kernel code */ @@ -594,8 +599,8 @@ void __init setup_memory_region(void) * Otherwise fake a memory map; one section from 0k->640k, * the next section from 1mb->appropriate_mem_k */ - sanitize_e820_map(E820_MAP, &E820_MAP_NR); - if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0) + sanitize_e820_map(boot_params.e820_map, &boot_params.e820_entries); + if 
(copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0) early_panic("Cannot find a valid memory map"); printk(KERN_INFO "BIOS-provided physical RAM map:\n"); e820_print_map("BIOS-e820"); diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index fd9aff3f3890..b7d6c23f2871 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -6,15 +6,10 @@ #include <asm/io.h> #include <asm/processor.h> #include <asm/fcntl.h> +#include <asm/setup.h> #include <xen/hvc-console.h> /* Simple VGA output */ - -#ifdef __i386__ -#include <asm/setup.h> -#else -#include <asm/bootsetup.h> -#endif #define VGABASE (__ISA_IO_base + 0xb8000) static int max_ypos = 25, max_xpos = 80; @@ -234,10 +229,10 @@ static int __init setup_early_printk(char *buf) early_serial_init(buf); early_console = &early_serial_console; } else if (!strncmp(buf, "vga", 3) - && SCREEN_INFO.orig_video_isVGA == 1) { - max_xpos = SCREEN_INFO.orig_video_cols; - max_ypos = SCREEN_INFO.orig_video_lines; - current_ypos = SCREEN_INFO.orig_y; + && boot_params.screen_info.orig_video_isVGA == 1) { + max_xpos = boot_params.screen_info.orig_video_cols; + max_ypos = boot_params.screen_info.orig_video_lines; + current_ypos = boot_params.screen_info.orig_y; early_console = &early_vga_console; } else if (!strncmp(buf, "simnow", 6)) { simnow_init(buf + 6); diff --git a/arch/x86/kernel/efi_32.c b/arch/x86/kernel/efi_32.c index 2452c6fbe992..b42558c48e9d 100644 --- a/arch/x86/kernel/efi_32.c +++ b/arch/x86/kernel/efi_32.c @@ -331,11 +331,13 @@ void __init efi_init(void) memset(&efi, 0, sizeof(efi) ); memset(&efi_phys, 0, sizeof(efi_phys)); - efi_phys.systab = EFI_SYSTAB; - memmap.phys_map = EFI_MEMMAP; - memmap.nr_map = EFI_MEMMAP_SIZE/EFI_MEMDESC_SIZE; - memmap.desc_version = EFI_MEMDESC_VERSION; - memmap.desc_size = EFI_MEMDESC_SIZE; + efi_phys.systab = + (efi_system_table_t *)boot_params.efi_info.efi_systab; + memmap.phys_map = (void *)boot_params.efi_info.efi_memmap; + 
memmap.nr_map = boot_params.efi_info.efi_memmap_size/ + boot_params.efi_info.efi_memdesc_size; + memmap.desc_version = boot_params.efi_info.efi_memdesc_version; + memmap.desc_size = boot_params.efi_info.efi_memdesc_size; efi.systab = (efi_system_table_t *) boot_ioremap((unsigned long) efi_phys.systab, @@ -446,7 +448,8 @@ void __init efi_init(void) printk(KERN_ERR PFX "Could not map the runtime service table!\n"); /* Map the EFI memory map for use until paging_init() */ - memmap.map = boot_ioremap((unsigned long) EFI_MEMMAP, EFI_MEMMAP_SIZE); + memmap.map = boot_ioremap(boot_params.efi_info.efi_memmap, + boot_params.efi_info.efi_memmap_size); if (memmap.map == NULL) printk(KERN_ERR PFX "Could not map the EFI memory map!\n"); diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 290b7bc82da3..dc7f938e5015 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -251,6 +251,7 @@ check_userspace: jb resume_kernel # not returning to v8086 or userspace ENTRY(resume_userspace) + LOCKDEP_SYS_EXIT DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret @@ -338,6 +339,7 @@ sysenter_past_esp: jae syscall_badsys call *sys_call_table(,%eax,4) movl %eax,PT_EAX(%esp) + LOCKDEP_SYS_EXIT DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx @@ -377,6 +379,7 @@ syscall_call: call *sys_call_table(,%eax,4) movl %eax,PT_EAX(%esp) # store the return value syscall_exit: + LOCKDEP_SYS_EXIT DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret @@ -434,7 +437,7 @@ ldt_ss: * is still available to implement the setting of the high * 16-bits in the INTERRUPT_RETURN paravirt-op. 
*/ - cmpl $0, paravirt_ops+PARAVIRT_enabled + cmpl $0, pv_info+PARAVIRT_enabled jne restore_nocheck #endif @@ -467,6 +470,7 @@ work_pending: jz work_notifysig work_resched: call schedule + LOCKDEP_SYS_EXIT DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 1d232e5f5658..3a058bb16409 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -244,6 +244,7 @@ ret_from_sys_call: movl $_TIF_ALLWORK_MASK,%edi /* edi: flagmask */ sysret_check: + LOCKDEP_SYS_EXIT GET_THREAD_INFO(%rcx) cli TRACE_IRQS_OFF @@ -333,6 +334,7 @@ int_ret_from_sys_call: movl $_TIF_ALLWORK_MASK,%edi /* edi: mask to check */ int_with_check: + LOCKDEP_SYS_EXIT_IRQ GET_THREAD_INFO(%rcx) movl threadinfo_flags(%rcx),%edx andl %edi,%edx @@ -544,11 +546,13 @@ exit_intr: retint_with_reschedule: movl $_TIF_WORK_MASK,%edi retint_check: + LOCKDEP_SYS_EXIT_IRQ movl threadinfo_flags(%rcx),%edx andl %edi,%edx CFI_REMEMBER_STATE jnz retint_careful -retint_swapgs: + +retint_swapgs: /* return to user-space */ /* * The iretq could re-enable interrupts: */ @@ -557,7 +561,7 @@ retint_swapgs: swapgs jmp restore_args -retint_restore_args: +retint_restore_args: /* return to kernel space */ cli /* * The iretq could re-enable interrupts: @@ -866,26 +870,21 @@ error_sti: movq ORIG_RAX(%rsp),%rsi /* get error code */ movq $-1,ORIG_RAX(%rsp) call *%rax - /* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */ -error_exit: - movl %ebx,%eax + /* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */ +error_exit: + movl %ebx,%eax RESTORE_REST cli TRACE_IRQS_OFF GET_THREAD_INFO(%rcx) testl %eax,%eax jne retint_kernel + LOCKDEP_SYS_EXIT_IRQ movl threadinfo_flags(%rcx),%edx movl $_TIF_WORK_MASK,%edi andl %edi,%edx jnz retint_careful - /* - * The iret might restore flags: - */ - TRACE_IRQS_IRETQ - swapgs - RESTORE_ARGS 0,8,0 - jmp iret_label + jmp 
retint_swapgs CFI_ENDPROC error_kernelspace: @@ -989,7 +988,7 @@ child_rip: movq %rsi, %rdi call *%rax # exit - xorl %edi, %edi + mov %eax, %edi call do_exit CFI_ENDPROC ENDPROC(child_rip) diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c index 47496a40e84f..4ae03e3e8294 100644 --- a/arch/x86/kernel/genapic_64.c +++ b/arch/x86/kernel/genapic_64.c @@ -29,8 +29,6 @@ u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID }; EXPORT_SYMBOL(x86_cpu_to_apicid); -u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; - struct genapic __read_mostly *genapic = &apic_flat; /* diff --git a/arch/x86/kernel/genapic_flat_64.c b/arch/x86/kernel/genapic_flat_64.c index ecb01eefdd27..91c7526768ee 100644 --- a/arch/x86/kernel/genapic_flat_64.c +++ b/arch/x86/kernel/genapic_flat_64.c @@ -52,7 +52,6 @@ static void flat_init_apic_ldr(void) num = smp_processor_id(); id = 1UL << num; - x86_cpu_to_log_apicid[num] = id; apic_write(APIC_DFR, APIC_DFR_FLAT); val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; val |= SET_APIC_LOGICAL_ID(id); diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 8561f626edad..a7eee0a4751d 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -14,7 +14,6 @@ #include <asm/processor.h> #include <asm/proto.h> #include <asm/smp.h> -#include <asm/bootsetup.h> #include <asm/setup.h> #include <asm/desc.h> #include <asm/pgtable.h> @@ -36,26 +35,15 @@ static void __init clear_bss(void) (unsigned long) __bss_stop - (unsigned long) __bss_start); } -#define NEW_CL_POINTER 0x228 /* Relative to real mode data */ -#define OLD_CL_MAGIC_ADDR 0x20 -#define OLD_CL_MAGIC 0xA33F -#define OLD_CL_OFFSET 0x22 - static void __init copy_bootdata(char *real_mode_data) { - unsigned long new_data; char * command_line; - memcpy(x86_boot_params, real_mode_data, BOOT_PARAM_SIZE); - new_data = *(u32 *) (x86_boot_params + NEW_CL_POINTER); - if (!new_data) { - if (OLD_CL_MAGIC != *(u16 *)(real_mode_data + 
OLD_CL_MAGIC_ADDR)) { - return; - } - new_data = __pa(real_mode_data) + *(u16 *)(real_mode_data + OLD_CL_OFFSET); + memcpy(&boot_params, real_mode_data, sizeof boot_params); + if (boot_params.hdr.cmd_line_ptr) { + command_line = __va(boot_params.hdr.cmd_line_ptr); + memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE); } - command_line = __va(new_data); - memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE); } void __init x86_64_start_kernel(char * real_mode_data) diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 9150ca9b5f80..39677965e161 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -51,6 +51,15 @@ */ LOW_PAGES = 1<<(32-PAGE_SHIFT_asm) +/* + * To preserve the DMA pool in PAGEALLOC kernels, we'll allocate + * pagetables from above the 16MB DMA limit, so we'll have to set + * up pagetables 16MB more (worst-case): + */ +#ifdef CONFIG_DEBUG_PAGEALLOC +LOW_PAGES = LOW_PAGES + 0x1000000 +#endif + #if PTRS_PER_PMD > 1 PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PMD) + PTRS_PER_PGD #else @@ -443,6 +452,7 @@ early_page_fault: early_fault: cld #ifdef CONFIG_PRINTK + pusha movl $(__KERNEL_DS),%eax movl %eax,%ds movl %eax,%es @@ -534,8 +544,15 @@ int_msg: .asciz "Unknown interrupt or fault at EIP %p %p %p\n" fault_msg: - .ascii "Int %d: CR2 %p err %p EIP %p CS %p flags %p\n" - .asciz "Stack: %p %p %p %p %p %p %p %p\n" + .ascii \ +/* fault info: */ "BUG: Int %d: CR2 %p\n" \ +/* pusha regs: */ " EDI %p ESI %p EBP %p ESP %p\n" \ + " EBX %p EDX %p ECX %p EAX %p\n" \ +/* fault frame: */ " err %p EIP %p CS %p flg %p\n" \ + \ + "Stack: %p %p %p %p %p %p %p %p\n" \ + " %p %p %p %p %p %p %p %p\n" \ + " %p %p %p %p %p %p %p %p\n" #include "../../x86/xen/xen-head.S" diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c index e3d4b73bfdb0..edd39ccf139e 100644 --- a/arch/x86/kernel/i386_ksyms_32.c +++ b/arch/x86/kernel/i386_ksyms_32.c @@ -1,4 +1,5 @@ #include <linux/module.h> +#include <asm/semaphore.h> 
#include <asm/checksum.h> #include <asm/desc.h> diff --git a/arch/x86/kernel/i8259_32.c b/arch/x86/kernel/i8259_32.c index 679bb33acbf1..d34a10cc13a7 100644 --- a/arch/x86/kernel/i8259_32.c +++ b/arch/x86/kernel/i8259_32.c @@ -349,7 +349,11 @@ static irqreturn_t math_error_irq(int cpl, void *dev_id) * New motherboards sometimes make IRQ 13 be a PCI interrupt, * so allow interrupt sharing. */ -static struct irqaction fpu_irq = { math_error_irq, 0, CPU_MASK_NONE, "fpu", NULL, NULL }; +static struct irqaction fpu_irq = { + .handler = math_error_irq, + .mask = CPU_MASK_NONE, + .name = "fpu", +}; void __init init_ISA_irqs (void) { diff --git a/arch/x86/kernel/i8259_64.c b/arch/x86/kernel/i8259_64.c index eb72976cc13c..3f27ea0b9816 100644 --- a/arch/x86/kernel/i8259_64.c +++ b/arch/x86/kernel/i8259_64.c @@ -395,7 +395,11 @@ device_initcall(i8259A_init_sysfs); * IRQ2 is cascade interrupt to second interrupt controller */ -static struct irqaction irq2 = { no_action, 0, CPU_MASK_NONE, "cascade", NULL, NULL}; +static struct irqaction irq2 = { + .handler = no_action, + .mask = CPU_MASK_NONE, + .name = "cascade", +}; DEFINE_PER_CPU(vector_irq_t, vector_irq) = { [0 ... IRQ0_VECTOR - 1] = -1, [IRQ0_VECTOR] = 0, diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index e2f4a1c68547..5f10c7189534 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -378,7 +378,7 @@ static struct irq_cpu_info { #define IRQ_ALLOWED(cpu, allowed_mask) cpu_isset(cpu, allowed_mask) -#define CPU_TO_PACKAGEINDEX(i) (first_cpu(cpu_sibling_map[i])) +#define CPU_TO_PACKAGEINDEX(i) (first_cpu(per_cpu(cpu_sibling_map, i))) static cpumask_t balance_irq_affinity[NR_IRQS] = { [0 ... 
NR_IRQS-1] = CPU_MASK_ALL @@ -598,7 +598,7 @@ tryanotherirq: * (A+B)/2 vs B */ load = CPU_IRQ(min_loaded) >> 1; - for_each_cpu_mask(j, cpu_sibling_map[min_loaded]) { + for_each_cpu_mask(j, per_cpu(cpu_sibling_map, min_loaded)) { if (load > CPU_IRQ(j)) { /* This won't change cpu_sibling_map[min_loaded] */ load = CPU_IRQ(j); @@ -1296,6 +1296,11 @@ static void __init setup_IO_APIC_irqs(void) continue; } + if (!first_notcon) { + apic_printk(APIC_VERBOSE, " not connected.\n"); + first_notcon = 1; + } + entry.trigger = irq_trigger(idx); entry.polarity = irq_polarity(idx); diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 966fa1062491..1c2c7bf6a9d3 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -875,6 +875,10 @@ static void __init setup_IO_APIC_irqs(void) apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mpc_apicid, pin); continue; } + if (!first_notcon) { + apic_printk(APIC_VERBOSE, " not connected.\n"); + first_notcon = 1; + } irq = pin_2_irq(idx, apic, pin); add_pin_to_irq(irq, apic, pin); @@ -885,7 +889,7 @@ static void __init setup_IO_APIC_irqs(void) } if (!first_notcon) - apic_printk(APIC_VERBOSE," not connected.\n"); + apic_printk(APIC_VERBOSE, " not connected.\n"); } /* @@ -1845,7 +1849,7 @@ static struct sysdev_class ioapic_sysdev_class = { static int __init ioapic_init_sysfs(void) { struct sys_device * dev; - int i, size, error = 0; + int i, size, error; error = sysdev_class_register(&ioapic_sysdev_class); if (error) @@ -1854,12 +1858,11 @@ static int __init ioapic_init_sysfs(void) for (i = 0; i < nr_ioapics; i++ ) { size = sizeof(struct sys_device) + nr_ioapic_registers[i] * sizeof(struct IO_APIC_route_entry); - mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL); + mp_ioapic_data[i] = kzalloc(size, GFP_KERNEL); if (!mp_ioapic_data[i]) { printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); continue; } - memset(mp_ioapic_data[i], 0, size); dev = &mp_ioapic_data[i]->dev; dev->id = i; dev->cls = 
&ioapic_sysdev_class; diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index e173b763f148..d3fde94f7345 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -255,9 +255,17 @@ int show_interrupts(struct seq_file *p, void *v) } if (i < NR_IRQS) { + unsigned any_count = 0; + spin_lock_irqsave(&irq_desc[i].lock, flags); +#ifndef CONFIG_SMP + any_count = kstat_irqs(i); +#else + for_each_online_cpu(j) + any_count |= kstat_cpu(j).irqs[i]; +#endif action = irq_desc[i].action; - if (!action) + if (!action && !any_count) goto skip; seq_printf(p, "%3d: ",i); #ifndef CONFIG_SMP @@ -268,10 +276,12 @@ int show_interrupts(struct seq_file *p, void *v) #endif seq_printf(p, " %8s", irq_desc[i].chip->name); seq_printf(p, "-%-8s", irq_desc[i].name); - seq_printf(p, " %s", action->name); - for (action=action->next; action; action = action->next) - seq_printf(p, ", %s", action->name); + if (action) { + seq_printf(p, " %s", action->name); + while ((action = action->next) != NULL) + seq_printf(p, ", %s", action->name); + } seq_putc(p, '\n'); skip: @@ -280,14 +290,41 @@ skip: seq_printf(p, "NMI: "); for_each_online_cpu(j) seq_printf(p, "%10u ", nmi_count(j)); - seq_putc(p, '\n'); + seq_printf(p, " Non-maskable interrupts\n"); #ifdef CONFIG_X86_LOCAL_APIC seq_printf(p, "LOC: "); for_each_online_cpu(j) seq_printf(p, "%10u ", per_cpu(irq_stat,j).apic_timer_irqs); - seq_putc(p, '\n'); + seq_printf(p, " Local timer interrupts\n"); #endif +#ifdef CONFIG_SMP + seq_printf(p, "RES: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", + per_cpu(irq_stat,j).irq_resched_count); + seq_printf(p, " Rescheduling interrupts\n"); + seq_printf(p, "CAL: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", + per_cpu(irq_stat,j).irq_call_count); + seq_printf(p, " function call interrupts\n"); + seq_printf(p, "TLB: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", + per_cpu(irq_stat,j).irq_tlb_count); + seq_printf(p, " TLB shootdowns\n"); +#endif + seq_printf(p, "TRM: "); 
+ for_each_online_cpu(j) + seq_printf(p, "%10u ", + per_cpu(irq_stat,j).irq_thermal_count); + seq_printf(p, " Thermal event interrupts\n"); + seq_printf(p, "SPU: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", + per_cpu(irq_stat,j).irq_spurious_count); + seq_printf(p, " Spurious interrupts\n"); seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); #if defined(CONFIG_X86_IO_APIC) seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count)); diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 865669efc540..6b5c730d67b9 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -62,9 +62,17 @@ int show_interrupts(struct seq_file *p, void *v) } if (i < NR_IRQS) { + unsigned any_count = 0; + spin_lock_irqsave(&irq_desc[i].lock, flags); +#ifndef CONFIG_SMP + any_count = kstat_irqs(i); +#else + for_each_online_cpu(j) + any_count |= kstat_cpu(j).irqs[i]; +#endif action = irq_desc[i].action; - if (!action) + if (!action && !any_count) goto skip; seq_printf(p, "%3d: ",i); #ifndef CONFIG_SMP @@ -76,9 +84,11 @@ int show_interrupts(struct seq_file *p, void *v) seq_printf(p, " %8s", irq_desc[i].chip->name); seq_printf(p, "-%-8s", irq_desc[i].name); - seq_printf(p, " %s", action->name); - for (action=action->next; action; action = action->next) - seq_printf(p, ", %s", action->name); + if (action) { + seq_printf(p, " %s", action->name); + while ((action = action->next) != NULL) + seq_printf(p, ", %s", action->name); + } seq_putc(p, '\n'); skip: spin_unlock_irqrestore(&irq_desc[i].lock, flags); @@ -86,11 +96,37 @@ skip: seq_printf(p, "NMI: "); for_each_online_cpu(j) seq_printf(p, "%10u ", cpu_pda(j)->__nmi_count); - seq_putc(p, '\n'); + seq_printf(p, " Non-maskable interrupts\n"); seq_printf(p, "LOC: "); for_each_online_cpu(j) seq_printf(p, "%10u ", cpu_pda(j)->apic_timer_irqs); - seq_putc(p, '\n'); + seq_printf(p, " Local timer interrupts\n"); +#ifdef CONFIG_SMP + seq_printf(p, "RES: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", 
cpu_pda(j)->irq_resched_count); + seq_printf(p, " Rescheduling interrupts\n"); + seq_printf(p, "CAL: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", cpu_pda(j)->irq_call_count); + seq_printf(p, " function call interrupts\n"); + seq_printf(p, "TLB: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", cpu_pda(j)->irq_tlb_count); + seq_printf(p, " TLB shootdowns\n"); +#endif + seq_printf(p, "TRM: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", cpu_pda(j)->irq_thermal_count); + seq_printf(p, " Thermal event interrupts\n"); + seq_printf(p, "THR: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", cpu_pda(j)->irq_threshold_count); + seq_printf(p, " Threshold APIC interrupts\n"); + seq_printf(p, "SPU: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", cpu_pda(j)->irq_spurious_count); + seq_printf(p, " Spurious interrupts\n"); seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); } return 0; diff --git a/arch/x86/kernel/kprobes_32.c b/arch/x86/kernel/kprobes_32.c index c2d03e96ae9f..90f778c04b3f 100644 --- a/arch/x86/kernel/kprobes_32.c +++ b/arch/x86/kernel/kprobes_32.c @@ -41,6 +41,13 @@ void jprobe_return_end(void); DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); +struct kretprobe_blackpoint kretprobe_blacklist[] = { + {"__switch_to", }, /* This function switches only current task, but + doesn't switch kernel stack.*/ + {NULL, NULL} /* Terminator */ +}; +const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist); + /* insert a jmp code */ static __always_inline void set_jmp_op(void *from, void *to) { @@ -557,6 +564,12 @@ static int __kprobes post_kprobe_handler(struct pt_regs *regs) resume_execution(cur, regs, kcb); regs->eflags |= kcb->kprobe_saved_eflags; +#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT + if (raw_irqs_disabled_flags(regs->eflags)) + trace_hardirqs_off(); + else + trace_hardirqs_on(); +#endif /*Restore back the original saved kprobes variables and continue. 
*/ if (kcb->kprobe_status == KPROBE_REENTER) { @@ -578,7 +591,7 @@ out: return 1; } -static int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr) +int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr) { struct kprobe *cur = kprobe_running(); struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); @@ -660,7 +673,6 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self, ret = NOTIFY_STOP; break; case DIE_GPF: - case DIE_PAGE_FAULT: /* kprobe_running() needs smp_processor_id() */ preempt_disable(); if (kprobe_running() && @@ -694,6 +706,7 @@ int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr, MIN_STACK_SIZE(addr)); regs->eflags &= ~IF_MASK; + trace_hardirqs_off(); regs->eip = (unsigned long)(jp->entry); return 1; } diff --git a/arch/x86/kernel/kprobes_64.c b/arch/x86/kernel/kprobes_64.c index 1df17a0ec0c9..681b801c5e26 100644 --- a/arch/x86/kernel/kprobes_64.c +++ b/arch/x86/kernel/kprobes_64.c @@ -48,6 +48,13 @@ static void __kprobes arch_copy_kprobe(struct kprobe *p); DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); +struct kretprobe_blackpoint kretprobe_blacklist[] = { + {"__switch_to", }, /* This function switches only current task, but + doesn't switch kernel stack.*/ + {NULL, NULL} /* Terminator */ +}; +const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist); + /* * returns non-zero if opcode modifies the interrupt flag. */ @@ -544,6 +551,12 @@ int __kprobes post_kprobe_handler(struct pt_regs *regs) resume_execution(cur, regs, kcb); regs->eflags |= kcb->kprobe_saved_rflags; +#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT + if (raw_irqs_disabled_flags(regs->eflags)) + trace_hardirqs_off(); + else + trace_hardirqs_on(); +#endif /* Restore the original saved kprobes variables and continue. 
*/ if (kcb->kprobe_status == KPROBE_REENTER) { @@ -651,7 +664,6 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self, ret = NOTIFY_STOP; break; case DIE_GPF: - case DIE_PAGE_FAULT: /* kprobe_running() needs smp_processor_id() */ preempt_disable(); if (kprobe_running() && @@ -684,6 +696,7 @@ int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr, MIN_STACK_SIZE(addr)); regs->eflags &= ~IF_MASK; + trace_hardirqs_off(); regs->rip = (unsigned long)(jp->entry); return 1; } diff --git a/arch/x86/kernel/ldt_32.c b/arch/x86/kernel/ldt_32.c index a8b18421863a..9ff90a27c45f 100644 --- a/arch/x86/kernel/ldt_32.c +++ b/arch/x86/kernel/ldt_32.c @@ -92,13 +92,13 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm) struct mm_struct * old_mm; int retval = 0; - init_MUTEX(&mm->context.sem); + mutex_init(&mm->context.lock); mm->context.size = 0; old_mm = current->mm; if (old_mm && old_mm->context.size > 0) { - down(&old_mm->context.sem); + mutex_lock(&old_mm->context.lock); retval = copy_ldt(&mm->context, &old_mm->context); - up(&old_mm->context.sem); + mutex_unlock(&old_mm->context.lock); } return retval; } @@ -130,7 +130,7 @@ static int read_ldt(void __user * ptr, unsigned long bytecount) if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES) bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES; - down(&mm->context.sem); + mutex_lock(&mm->context.lock); size = mm->context.size*LDT_ENTRY_SIZE; if (size > bytecount) size = bytecount; @@ -138,7 +138,7 @@ static int read_ldt(void __user * ptr, unsigned long bytecount) err = 0; if (copy_to_user(ptr, mm->context.ldt, size)) err = -EFAULT; - up(&mm->context.sem); + mutex_unlock(&mm->context.lock); if (err < 0) goto error_return; if (size != bytecount) { @@ -194,7 +194,7 @@ static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode) goto out; } - down(&mm->context.sem); + mutex_lock(&mm->context.lock); if (ldt_info.entry_number >= 
mm->context.size) { error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1); if (error < 0) @@ -221,7 +221,7 @@ install: error = 0; out_unlock: - up(&mm->context.sem); + mutex_unlock(&mm->context.lock); out: return error; } diff --git a/arch/x86/kernel/ldt_64.c b/arch/x86/kernel/ldt_64.c index 3796523d616a..60e57abb8e90 100644 --- a/arch/x86/kernel/ldt_64.c +++ b/arch/x86/kernel/ldt_64.c @@ -96,13 +96,13 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm) struct mm_struct * old_mm; int retval = 0; - init_MUTEX(&mm->context.sem); + mutex_init(&mm->context.lock); mm->context.size = 0; old_mm = current->mm; if (old_mm && old_mm->context.size > 0) { - down(&old_mm->context.sem); + mutex_lock(&old_mm->context.lock); retval = copy_ldt(&mm->context, &old_mm->context); - up(&old_mm->context.sem); + mutex_unlock(&old_mm->context.lock); } return retval; } @@ -133,7 +133,7 @@ static int read_ldt(void __user * ptr, unsigned long bytecount) if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES) bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES; - down(&mm->context.sem); + mutex_lock(&mm->context.lock); size = mm->context.size*LDT_ENTRY_SIZE; if (size > bytecount) size = bytecount; @@ -141,7 +141,7 @@ static int read_ldt(void __user * ptr, unsigned long bytecount) err = 0; if (copy_to_user(ptr, mm->context.ldt, size)) err = -EFAULT; - up(&mm->context.sem); + mutex_unlock(&mm->context.lock); if (err < 0) goto error_return; if (size != bytecount) { @@ -193,7 +193,7 @@ static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode) goto out; } - down(&mm->context.sem); + mutex_lock(&mm->context.lock); if (ldt_info.entry_number >= (unsigned)mm->context.size) { error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1); if (error < 0) @@ -223,7 +223,7 @@ install: error = 0; out_unlock: - up(&mm->context.sem); + mutex_unlock(&mm->context.lock); out: return error; } diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index
deda9a221cf2..8459ca64bc2f 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -10,6 +10,7 @@ #include <linux/kexec.h> #include <linux/delay.h> #include <linux/init.h> +#include <linux/numa.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> #include <asm/tlbflush.h> @@ -169,3 +170,15 @@ static int __init parse_crashkernel(char *arg) return 0; } early_param("crashkernel", parse_crashkernel); + +void arch_crash_save_vmcoreinfo(void) +{ +#ifdef CONFIG_ARCH_DISCONTIGMEM_ENABLE + VMCOREINFO_SYMBOL(node_data); + VMCOREINFO_LENGTH(node_data, MAX_NUMNODES); +#endif +#ifdef CONFIG_X86_PAE + VMCOREINFO_CONFIG(X86_PAE); +#endif +} + diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index cd1899a2f0c5..7450b69710b5 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -10,6 +10,7 @@ #include <linux/kexec.h> #include <linux/string.h> #include <linux/reboot.h> +#include <linux/numa.h> #include <asm/pgtable.h> #include <asm/tlbflush.h> #include <asm/mmu_context.h> @@ -257,3 +258,11 @@ static int __init setup_crashkernel(char *arg) } early_param("crashkernel", setup_crashkernel); +void arch_crash_save_vmcoreinfo(void) +{ +#ifdef CONFIG_ARCH_DISCONTIGMEM_ENABLE + VMCOREINFO_SYMBOL(node_data); + VMCOREINFO_LENGTH(node_data, MAX_NUMNODES); +#endif +} + diff --git a/arch/x86/kernel/mce_64.c b/arch/x86/kernel/mce_64.c index a66d607f5b92..66e6b797b2cb 100644 --- a/arch/x86/kernel/mce_64.c +++ b/arch/x86/kernel/mce_64.c @@ -76,9 +76,6 @@ void mce_log(struct mce *mce) wmb(); for (;;) { entry = rcu_dereference(mcelog.next); - /* The rmb forces the compiler to reload next in each - iteration */ - rmb(); for (;;) { /* When the buffer fills up discard new entries. Assume that the earlier errors are the more interesting. */ @@ -698,8 +695,6 @@ static int __init mcheck_disable(char *str) mce=nobootlog Don't log MCEs from before booting. 
*/ static int __init mcheck_enable(char *str) { - if (*str == '=') - str++; if (!strcmp(str, "off")) mce_dont_init = 1; else if (!strcmp(str, "bootlog") || !strcmp(str,"nobootlog")) @@ -712,7 +707,7 @@ static int __init mcheck_enable(char *str) } __setup("nomce", mcheck_disable); -__setup("mce", mcheck_enable); +__setup("mce=", mcheck_enable); /* * Sysfs support @@ -807,16 +802,29 @@ static __cpuinit int mce_create_device(unsigned int cpu) if (!mce_available(&cpu_data[cpu])) return -EIO; + memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject)); per_cpu(device_mce,cpu).id = cpu; per_cpu(device_mce,cpu).cls = &mce_sysclass; err = sysdev_register(&per_cpu(device_mce,cpu)); + if (err) + return err; + + for (i = 0; mce_attributes[i]; i++) { + err = sysdev_create_file(&per_cpu(device_mce,cpu), + mce_attributes[i]); + if (err) + goto error; + } - if (!err) { - for (i = 0; mce_attributes[i]; i++) - sysdev_create_file(&per_cpu(device_mce,cpu), - mce_attributes[i]); + return 0; +error: + while (i--) { + sysdev_remove_file(&per_cpu(device_mce,cpu), + mce_attributes[i]); } + sysdev_unregister(&per_cpu(device_mce,cpu)); + return err; } @@ -828,7 +836,6 @@ static void mce_remove_device(unsigned int cpu) sysdev_remove_file(&per_cpu(device_mce,cpu), mce_attributes[i]); sysdev_unregister(&per_cpu(device_mce,cpu)); - memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject)); } /* Get notified when a cpu comes on/off. Be hotplug friendly. */ @@ -836,18 +843,21 @@ static int mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; + int err = 0; switch (action) { - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - mce_create_device(cpu); + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + err = mce_create_device(cpu); break; + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: case CPU_DEAD: case CPU_DEAD_FROZEN: mce_remove_device(cpu); break; } - return NOTIFY_OK; + return err ? 
NOTIFY_BAD : NOTIFY_OK; } static struct notifier_block mce_cpu_notifier = { @@ -862,9 +872,13 @@ static __init int mce_init_device(void) if (!mce_available(&boot_cpu_data)) return -EIO; err = sysdev_class_register(&mce_sysclass); + if (err) + return err; for_each_online_cpu(i) { - mce_create_device(i); + err = mce_create_device(i); + if (err) + return err; } register_hotcpu_notifier(&mce_cpu_notifier); diff --git a/arch/x86/kernel/mce_amd_64.c b/arch/x86/kernel/mce_amd_64.c index 2f8a7f18b0fe..0d2afd96aca4 100644 --- a/arch/x86/kernel/mce_amd_64.c +++ b/arch/x86/kernel/mce_amd_64.c @@ -237,6 +237,7 @@ asmlinkage void mce_threshold_interrupt(void) } } out: + add_pda(irq_threshold_count, 1); irq_exit(); } @@ -472,7 +473,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) #ifdef CONFIG_SMP if (cpu_data[cpu].cpu_core_id && shared_bank[bank]) { /* symlink */ - i = first_cpu(cpu_core_map[cpu]); + i = first_cpu(per_cpu(cpu_core_map, cpu)); /* first core not up yet */ if (cpu_data[i].cpu_core_id) @@ -492,7 +493,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) if (err) goto out; - b->cpus = cpu_core_map[cpu]; + b->cpus = per_cpu(cpu_core_map, cpu); per_cpu(threshold_banks, cpu)[bank] = b; goto out; } @@ -509,7 +510,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) #ifndef CONFIG_SMP b->cpus = CPU_MASK_ALL; #else - b->cpus = cpu_core_map[cpu]; + b->cpus = per_cpu(cpu_core_map, cpu); #endif err = kobject_register(&b->kobj); if (err) diff --git a/arch/x86/kernel/mce_intel_64.c b/arch/x86/kernel/mce_intel_64.c index 6551505d8a2c..c17eaf5dd6dd 100644 --- a/arch/x86/kernel/mce_intel_64.c +++ b/arch/x86/kernel/mce_intel_64.c @@ -26,6 +26,7 @@ asmlinkage void smp_thermal_interrupt(void) if (therm_throt_process(msr_val & 1)) mce_log_therm_throt_event(smp_processor_id(), msr_val); + add_pda(irq_thermal_count, 1); irq_exit(); } diff --git a/arch/x86/kernel/msr.c 
b/arch/x86/kernel/msr.c index c044de310b69..e18e516cf549 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -133,37 +133,42 @@ static const struct file_operations msr_fops = { .open = msr_open, }; -static int msr_device_create(int i) +static int __cpuinit msr_device_create(int cpu) { - int err = 0; struct device *dev; - dev = device_create(msr_class, NULL, MKDEV(MSR_MAJOR, i), "msr%d",i); - if (IS_ERR(dev)) - err = PTR_ERR(dev); - return err; + dev = device_create(msr_class, NULL, MKDEV(MSR_MAJOR, cpu), + "msr%d", cpu); + return IS_ERR(dev) ? PTR_ERR(dev) : 0; } -static int msr_class_cpu_callback(struct notifier_block *nfb, +static void msr_device_destroy(int cpu) +{ + device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu)); +} + +static int __cpuinit msr_class_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; + int err = 0; switch (action) { - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - msr_device_create(cpu); + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + err = msr_device_create(cpu); break; + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: case CPU_DEAD: case CPU_DEAD_FROZEN: - device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu)); + msr_device_destroy(cpu); break; } - return NOTIFY_OK; + return err ? 
NOTIFY_BAD : NOTIFY_OK; } -static struct notifier_block __cpuinitdata msr_class_cpu_notifier = -{ +static struct notifier_block __cpuinitdata msr_class_cpu_notifier = { .notifier_call = msr_class_cpu_callback, }; @@ -196,7 +201,7 @@ static int __init msr_init(void) out_class: i = 0; for_each_online_cpu(i) - device_destroy(msr_class, MKDEV(MSR_MAJOR, i)); + msr_device_destroy(i); class_destroy(msr_class); out_chrdev: unregister_chrdev(MSR_MAJOR, "cpu/msr"); @@ -208,7 +213,7 @@ static void __exit msr_exit(void) { int cpu = 0; for_each_online_cpu(cpu) - device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu)); + msr_device_destroy(cpu); class_destroy(msr_class); unregister_chrdev(MSR_MAJOR, "cpu/msr"); unregister_hotcpu_notifier(&msr_class_cpu_notifier); diff --git a/arch/x86/kernel/paravirt_32.c b/arch/x86/kernel/paravirt_32.c index 739cfb207dd7..6a80d67c2121 100644 --- a/arch/x86/kernel/paravirt_32.c +++ b/arch/x86/kernel/paravirt_32.c @@ -42,32 +42,33 @@ void _paravirt_nop(void) static void __init default_banner(void) { printk(KERN_INFO "Booting paravirtualized kernel on %s\n", - paravirt_ops.name); + pv_info.name); } char *memory_setup(void) { - return paravirt_ops.memory_setup(); + return pv_init_ops.memory_setup(); } /* Simple instruction patching code. 
*/ -#define DEF_NATIVE(name, code) \ - extern const char start_##name[], end_##name[]; \ - asm("start_" #name ": " code "; end_" #name ":") - -DEF_NATIVE(irq_disable, "cli"); -DEF_NATIVE(irq_enable, "sti"); -DEF_NATIVE(restore_fl, "push %eax; popf"); -DEF_NATIVE(save_fl, "pushf; pop %eax"); -DEF_NATIVE(iret, "iret"); -DEF_NATIVE(irq_enable_sysexit, "sti; sysexit"); -DEF_NATIVE(read_cr2, "mov %cr2, %eax"); -DEF_NATIVE(write_cr3, "mov %eax, %cr3"); -DEF_NATIVE(read_cr3, "mov %cr3, %eax"); -DEF_NATIVE(clts, "clts"); -DEF_NATIVE(read_tsc, "rdtsc"); - -DEF_NATIVE(ud2a, "ud2a"); +#define DEF_NATIVE(ops, name, code) \ + extern const char start_##ops##_##name[], end_##ops##_##name[]; \ + asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":") + +DEF_NATIVE(pv_irq_ops, irq_disable, "cli"); +DEF_NATIVE(pv_irq_ops, irq_enable, "sti"); +DEF_NATIVE(pv_irq_ops, restore_fl, "push %eax; popf"); +DEF_NATIVE(pv_irq_ops, save_fl, "pushf; pop %eax"); +DEF_NATIVE(pv_cpu_ops, iret, "iret"); +DEF_NATIVE(pv_cpu_ops, irq_enable_sysexit, "sti; sysexit"); +DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax"); +DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3"); +DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax"); +DEF_NATIVE(pv_cpu_ops, clts, "clts"); +DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc"); + +/* Undefined instruction for dealing with missing ops pointers. 
*/ +static const unsigned char ud2a[] = { 0x0f, 0x0b }; static unsigned native_patch(u8 type, u16 clobbers, void *ibuf, unsigned long addr, unsigned len) @@ -76,37 +77,29 @@ static unsigned native_patch(u8 type, u16 clobbers, void *ibuf, unsigned ret; switch(type) { -#define SITE(x) case PARAVIRT_PATCH(x): start = start_##x; end = end_##x; goto patch_site - SITE(irq_disable); - SITE(irq_enable); - SITE(restore_fl); - SITE(save_fl); - SITE(iret); - SITE(irq_enable_sysexit); - SITE(read_cr2); - SITE(read_cr3); - SITE(write_cr3); - SITE(clts); - SITE(read_tsc); +#define SITE(ops, x) \ + case PARAVIRT_PATCH(ops.x): \ + start = start_##ops##_##x; \ + end = end_##ops##_##x; \ + goto patch_site + + SITE(pv_irq_ops, irq_disable); + SITE(pv_irq_ops, irq_enable); + SITE(pv_irq_ops, restore_fl); + SITE(pv_irq_ops, save_fl); + SITE(pv_cpu_ops, iret); + SITE(pv_cpu_ops, irq_enable_sysexit); + SITE(pv_mmu_ops, read_cr2); + SITE(pv_mmu_ops, read_cr3); + SITE(pv_mmu_ops, write_cr3); + SITE(pv_cpu_ops, clts); + SITE(pv_cpu_ops, read_tsc); #undef SITE patch_site: ret = paravirt_patch_insns(ibuf, len, start, end); break; - case PARAVIRT_PATCH(make_pgd): - case PARAVIRT_PATCH(make_pte): - case PARAVIRT_PATCH(pgd_val): - case PARAVIRT_PATCH(pte_val): -#ifdef CONFIG_X86_PAE - case PARAVIRT_PATCH(make_pmd): - case PARAVIRT_PATCH(pmd_val): -#endif - /* These functions end up returning exactly what - they're passed, in the same registers. 
*/ - ret = paravirt_patch_nop(); - break; - default: ret = paravirt_patch_default(type, clobbers, ibuf, addr, len); break; @@ -150,7 +143,7 @@ unsigned paravirt_patch_call(void *insnbuf, return 5; } -unsigned paravirt_patch_jmp(const void *target, void *insnbuf, +unsigned paravirt_patch_jmp(void *insnbuf, const void *target, unsigned long addr, unsigned len) { struct branch *b = insnbuf; @@ -165,22 +158,37 @@ unsigned paravirt_patch_jmp(const void *target, void *insnbuf, return 5; } +/* Neat trick to map patch type back to the call within the + * corresponding structure. */ +static void *get_call_destination(u8 type) +{ + struct paravirt_patch_template tmpl = { + .pv_init_ops = pv_init_ops, + .pv_time_ops = pv_time_ops, + .pv_cpu_ops = pv_cpu_ops, + .pv_irq_ops = pv_irq_ops, + .pv_apic_ops = pv_apic_ops, + .pv_mmu_ops = pv_mmu_ops, + }; + return *((void **)&tmpl + type); +} + unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf, unsigned long addr, unsigned len) { - void *opfunc = *((void **)&paravirt_ops + type); + void *opfunc = get_call_destination(type); unsigned ret; if (opfunc == NULL) /* If there's no function, patch it with a ud2a (BUG) */ - ret = paravirt_patch_insns(insnbuf, len, start_ud2a, end_ud2a); + ret = paravirt_patch_insns(insnbuf, len, ud2a, ud2a+sizeof(ud2a)); else if (opfunc == paravirt_nop) /* If the operation is a nop, then nop the callsite */ ret = paravirt_patch_nop(); - else if (type == PARAVIRT_PATCH(iret) || - type == PARAVIRT_PATCH(irq_enable_sysexit)) + else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) || + type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit)) /* If operation requires a jmp, then jmp */ - ret = paravirt_patch_jmp(opfunc, insnbuf, addr, len); + ret = paravirt_patch_jmp(insnbuf, opfunc, addr, len); else /* Otherwise call the function; assume target could clobber any caller-save reg */ @@ -205,7 +213,7 @@ unsigned paravirt_patch_insns(void *insnbuf, unsigned len, void init_IRQ(void) { -
paravirt_ops.init_IRQ(); + pv_irq_ops.init_IRQ(); } static void native_flush_tlb(void) @@ -233,7 +241,7 @@ extern void native_irq_enable_sysexit(void); static int __init print_banner(void) { - paravirt_ops.banner(); + pv_init_ops.banner(); return 0; } core_initcall(print_banner); @@ -273,47 +281,96 @@ int paravirt_disable_iospace(void) return ret; } -struct paravirt_ops paravirt_ops = { +static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LAZY_NONE; + +static inline void enter_lazy(enum paravirt_lazy_mode mode) +{ + BUG_ON(x86_read_percpu(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE); + BUG_ON(preemptible()); + + x86_write_percpu(paravirt_lazy_mode, mode); +} + +void paravirt_leave_lazy(enum paravirt_lazy_mode mode) +{ + BUG_ON(x86_read_percpu(paravirt_lazy_mode) != mode); + BUG_ON(preemptible()); + + x86_write_percpu(paravirt_lazy_mode, PARAVIRT_LAZY_NONE); +} + +void paravirt_enter_lazy_mmu(void) +{ + enter_lazy(PARAVIRT_LAZY_MMU); +} + +void paravirt_leave_lazy_mmu(void) +{ + paravirt_leave_lazy(PARAVIRT_LAZY_MMU); +} + +void paravirt_enter_lazy_cpu(void) +{ + enter_lazy(PARAVIRT_LAZY_CPU); +} + +void paravirt_leave_lazy_cpu(void) +{ + paravirt_leave_lazy(PARAVIRT_LAZY_CPU); +} + +enum paravirt_lazy_mode paravirt_get_lazy_mode(void) +{ + return x86_read_percpu(paravirt_lazy_mode); +} + +struct pv_info pv_info = { .name = "bare hardware", .paravirt_enabled = 0, .kernel_rpl = 0, .shared_kernel_pmd = 1, /* Only used when CONFIG_X86_PAE is set */ +}; - .patch = native_patch, +struct pv_init_ops pv_init_ops = { + .patch = native_patch, .banner = default_banner, .arch_setup = paravirt_nop, .memory_setup = machine_specific_memory_setup, +}; + +struct pv_time_ops pv_time_ops = { + .time_init = hpet_time_init, .get_wallclock = native_get_wallclock, .set_wallclock = native_set_wallclock, - .time_init = hpet_time_init, + .sched_clock = native_sched_clock, + .get_cpu_khz = native_calculate_cpu_khz, +}; + +struct pv_irq_ops pv_irq_ops = { .init_IRQ = 
native_init_IRQ, + .save_fl = native_save_fl, + .restore_fl = native_restore_fl, + .irq_disable = native_irq_disable, + .irq_enable = native_irq_enable, + .safe_halt = native_safe_halt, + .halt = native_halt, +}; +struct pv_cpu_ops pv_cpu_ops = { .cpuid = native_cpuid, .get_debugreg = native_get_debugreg, .set_debugreg = native_set_debugreg, .clts = native_clts, .read_cr0 = native_read_cr0, .write_cr0 = native_write_cr0, - .read_cr2 = native_read_cr2, - .write_cr2 = native_write_cr2, - .read_cr3 = native_read_cr3, - .write_cr3 = native_write_cr3, .read_cr4 = native_read_cr4, .read_cr4_safe = native_read_cr4_safe, .write_cr4 = native_write_cr4, - .save_fl = native_save_fl, - .restore_fl = native_restore_fl, - .irq_disable = native_irq_disable, - .irq_enable = native_irq_enable, - .safe_halt = native_safe_halt, - .halt = native_halt, .wbinvd = native_wbinvd, .read_msr = native_read_msr_safe, .write_msr = native_write_msr_safe, .read_tsc = native_read_tsc, .read_pmc = native_read_pmc, - .sched_clock = native_sched_clock, - .get_cpu_khz = native_calculate_cpu_khz, .load_tr_desc = native_load_tr_desc, .set_ldt = native_set_ldt, .load_gdt = native_load_gdt, @@ -327,9 +384,19 @@ struct paravirt_ops paravirt_ops = { .write_idt_entry = write_dt_entry, .load_esp0 = native_load_esp0, + .irq_enable_sysexit = native_irq_enable_sysexit, + .iret = native_iret, + .set_iopl_mask = native_set_iopl_mask, .io_delay = native_io_delay, + .lazy_mode = { + .enter = paravirt_nop, + .leave = paravirt_nop, + }, +}; + +struct pv_apic_ops pv_apic_ops = { #ifdef CONFIG_X86_LOCAL_APIC .apic_write = native_apic_write, .apic_write_atomic = native_apic_write_atomic, @@ -338,11 +405,17 @@ struct paravirt_ops paravirt_ops = { .setup_secondary_clock = setup_secondary_APIC_clock, .startup_ipi_hook = paravirt_nop, #endif - .set_lazy_mode = paravirt_nop, +}; +struct pv_mmu_ops pv_mmu_ops = { .pagetable_setup_start = native_pagetable_setup_start, .pagetable_setup_done = native_pagetable_setup_done, + 
.read_cr2 = native_read_cr2, + .write_cr2 = native_write_cr2, + .read_cr3 = native_read_cr3, + .write_cr3 = native_write_cr3, + .flush_tlb_user = native_flush_tlb, .flush_tlb_kernel = native_flush_tlb_global, .flush_tlb_single = native_flush_tlb_single, @@ -381,12 +454,19 @@ struct paravirt_ops paravirt_ops = { .make_pte = native_make_pte, .make_pgd = native_make_pgd, - .irq_enable_sysexit = native_irq_enable_sysexit, - .iret = native_iret, - .dup_mmap = paravirt_nop, .exit_mmap = paravirt_nop, .activate_mm = paravirt_nop, + + .lazy_mode = { + .enter = paravirt_nop, + .leave = paravirt_nop, + }, }; -EXPORT_SYMBOL(paravirt_ops); +EXPORT_SYMBOL_GPL(pv_time_ops); +EXPORT_SYMBOL_GPL(pv_cpu_ops); +EXPORT_SYMBOL_GPL(pv_mmu_ops); +EXPORT_SYMBOL_GPL(pv_apic_ops); +EXPORT_SYMBOL_GPL(pv_info); +EXPORT_SYMBOL (pv_irq_ops); diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 71da01e73f03..5098f58063a5 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -35,6 +35,7 @@ #include <linux/pci_ids.h> #include <linux/pci.h> #include <linux/delay.h> +#include <linux/scatterlist.h> #include <asm/iommu.h> #include <asm/calgary.h> #include <asm/tce.h> @@ -221,10 +222,10 @@ static inline unsigned int num_dma_pages(unsigned long dma, unsigned int dmalen) return npages; } -static inline int translate_phb(struct pci_dev* dev) +static inline int translation_enabled(struct iommu_table *tbl) { - int disabled = bus_info[dev->bus->number].translation_disabled; - return !disabled; + /* only PHBs with translation enabled have an IOMMU table */ + return (tbl != NULL); } static void iommu_range_reserve(struct iommu_table *tbl, @@ -384,31 +385,32 @@ static void calgary_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems, int direction) { struct iommu_table *tbl = find_iommu_table(dev); + struct scatterlist *s; + int i; - if (!translate_phb(to_pci_dev(dev))) + if (!translation_enabled(tbl)) return; - while (nelems--) 
{ + for_each_sg(sglist, s, nelems, i) { unsigned int npages; - dma_addr_t dma = sglist->dma_address; - unsigned int dmalen = sglist->dma_length; + dma_addr_t dma = s->dma_address; + unsigned int dmalen = s->dma_length; if (dmalen == 0) break; npages = num_dma_pages(dma, dmalen); iommu_free(tbl, dma, npages); - sglist++; } } static int calgary_nontranslate_map_sg(struct device* dev, struct scatterlist *sg, int nelems, int direction) { + struct scatterlist *s; int i; - for (i = 0; i < nelems; i++ ) { - struct scatterlist *s = &sg[i]; + for_each_sg(sg, s, nelems, i) { BUG_ON(!s->page); s->dma_address = virt_to_bus(page_address(s->page) +s->offset); s->dma_length = s->length; @@ -420,16 +422,16 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg, int nelems, int direction) { struct iommu_table *tbl = find_iommu_table(dev); + struct scatterlist *s; unsigned long vaddr; unsigned int npages; unsigned long entry; int i; - if (!translate_phb(to_pci_dev(dev))) + if (!translation_enabled(tbl)) return calgary_nontranslate_map_sg(dev, sg, nelems, direction); - for (i = 0; i < nelems; i++ ) { - struct scatterlist *s = &sg[i]; + for_each_sg(sg, s, nelems, i) { BUG_ON(!s->page); vaddr = (unsigned long)page_address(s->page) + s->offset; @@ -454,9 +456,9 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg, return nelems; error: calgary_unmap_sg(dev, sg, nelems, direction); - for (i = 0; i < nelems; i++) { - sg[i].dma_address = bad_dma_address; - sg[i].dma_length = 0; + for_each_sg(sg, s, nelems, i) { + sg->dma_address = bad_dma_address; + sg->dma_length = 0; } return 0; } @@ -472,7 +474,7 @@ static dma_addr_t calgary_map_single(struct device *dev, void *vaddr, uaddr = (unsigned long)vaddr; npages = num_dma_pages(uaddr, size); - if (translate_phb(to_pci_dev(dev))) + if (translation_enabled(tbl)) dma_handle = iommu_alloc(tbl, vaddr, npages, direction); else dma_handle = virt_to_bus(vaddr); @@ -486,7 +488,7 @@ static void 
calgary_unmap_single(struct device *dev, dma_addr_t dma_handle, struct iommu_table *tbl = find_iommu_table(dev); unsigned int npages; - if (!translate_phb(to_pci_dev(dev))) + if (!translation_enabled(tbl)) return; npages = num_dma_pages(dma_handle, size); @@ -511,7 +513,7 @@ static void* calgary_alloc_coherent(struct device *dev, size_t size, goto error; memset(ret, 0, size); - if (translate_phb(to_pci_dev(dev))) { + if (translation_enabled(tbl)) { /* set up tces to cover the allocated range */ mapping = iommu_alloc(tbl, ret, npages, DMA_BIDIRECTIONAL); if (mapping == bad_dma_address) @@ -1192,7 +1194,7 @@ static int __init calgary_init(void) { int ret; struct pci_dev *dev = NULL; - void *tce_space; + struct calgary_bus_info *info; ret = calgary_locate_bbars(); if (ret) @@ -1204,12 +1206,14 @@ static int __init calgary_init(void) break; if (!is_cal_pci_dev(dev->device)) continue; - if (!translate_phb(dev)) { + + info = &bus_info[dev->bus->number]; + if (info->translation_disabled) { calgary_init_one_nontraslated(dev); continue; } - tce_space = bus_info[dev->bus->number].tce_space; - if (!tce_space && !translate_empty_slots) + + if (!info->tce_space && !translate_empty_slots) continue; ret = calgary_init_one(dev); @@ -1227,11 +1231,13 @@ error: break; if (!is_cal_pci_dev(dev->device)) continue; - if (!translate_phb(dev)) { + + info = &bus_info[dev->bus->number]; + if (info->translation_disabled) { pci_dev_put(dev); continue; } - if (!bus_info[dev->bus->number].tce_space && !translate_empty_slots) + if (!info->tce_space && !translate_empty_slots) continue; calgary_disable_translation(dev); @@ -1544,7 +1550,7 @@ static void __init calgary_fixup_one_tce_space(struct pci_dev *dev) static int __init calgary_fixup_tce_spaces(void) { struct pci_dev *dev = NULL; - void *tce_space; + struct calgary_bus_info *info; if (no_iommu || swiotlb || !calgary_detected) return -ENODEV; @@ -1557,11 +1563,12 @@ static int __init calgary_fixup_tce_spaces(void) break; if 
(!is_cal_pci_dev(dev->device)) continue; - if (!translate_phb(dev)) + + info = &bus_info[dev->bus->number]; + if (info->translation_disabled) continue; - tce_space = bus_info[dev->bus->number].tce_space; - if (!tce_space) + if (!info->tce_space) continue; calgary_fixup_one_tce_space(dev); diff --git a/arch/x86/kernel/pci-dma_32.c b/arch/x86/kernel/pci-dma_32.c index 0aae2f3847a5..51330321a5d3 100644 --- a/arch/x86/kernel/pci-dma_32.c +++ b/arch/x86/kernel/pci-dma_32.c @@ -12,7 +12,6 @@ #include <linux/string.h> #include <linux/pci.h> #include <linux/module.h> -#include <linux/pci.h> #include <asm/io.h> struct dma_coherent_mem { diff --git a/arch/x86/kernel/pci-dma_64.c b/arch/x86/kernel/pci-dma_64.c index 9576a2eb375e..b2b42bdb0a15 100644 --- a/arch/x86/kernel/pci-dma_64.c +++ b/arch/x86/kernel/pci-dma_64.c @@ -51,11 +51,9 @@ dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order) { struct page *page; int node; -#ifdef CONFIG_PCI - if (dev->bus == &pci_bus_type) - node = pcibus_to_node(to_pci_dev(dev)->bus); - else -#endif + + node = dev_to_node(dev); + if (node == -1) node = numa_node_id(); if (node < first_node(node_online_map)) diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 4918c575d582..5cdfab65e93f 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -8,6 +8,7 @@ * See Documentation/DMA-mapping.txt for the interface specification. * * Copyright 2002 Andi Kleen, SuSE Labs. + * Subject to the GNU General Public License v2 only. 
*/ #include <linux/types.h> @@ -23,6 +24,7 @@ #include <linux/interrupt.h> #include <linux/bitops.h> #include <linux/kdebug.h> +#include <linux/scatterlist.h> #include <asm/atomic.h> #include <asm/io.h> #include <asm/mtrr.h> @@ -278,10 +280,10 @@ static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr, */ static void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) { + struct scatterlist *s; int i; - for (i = 0; i < nents; i++) { - struct scatterlist *s = &sg[i]; + for_each_sg(sg, s, nents, i) { if (!s->dma_length || !s->length) break; gart_unmap_single(dev, s->dma_address, s->dma_length, dir); @@ -292,14 +294,14 @@ static void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg, int nents, int dir) { + struct scatterlist *s; int i; #ifdef CONFIG_IOMMU_DEBUG printk(KERN_DEBUG "dma_map_sg overflow\n"); #endif - for (i = 0; i < nents; i++ ) { - struct scatterlist *s = &sg[i]; + for_each_sg(sg, s, nents, i) { unsigned long addr = page_to_phys(s->page) + s->offset; if (nonforced_iommu(dev, addr, s->length)) { addr = dma_map_area(dev, addr, s->length, dir); @@ -319,23 +321,23 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg, } /* Map multiple scatterlist entries continuous into the first. 
*/ -static int __dma_map_cont(struct scatterlist *sg, int start, int stopat, +static int __dma_map_cont(struct scatterlist *start, int nelems, struct scatterlist *sout, unsigned long pages) { unsigned long iommu_start = alloc_iommu(pages); unsigned long iommu_page = iommu_start; + struct scatterlist *s; int i; if (iommu_start == -1) return -1; - - for (i = start; i < stopat; i++) { - struct scatterlist *s = &sg[i]; + + for_each_sg(start, s, nelems, i) { unsigned long pages, addr; unsigned long phys_addr = s->dma_address; - BUG_ON(i > start && s->offset); - if (i == start) { + BUG_ON(s != start && s->offset); + if (s == start) { *sout = *s; sout->dma_address = iommu_bus_base; sout->dma_address += iommu_page*PAGE_SIZE + s->offset; @@ -357,30 +359,32 @@ static int __dma_map_cont(struct scatterlist *sg, int start, int stopat, return 0; } -static inline int dma_map_cont(struct scatterlist *sg, int start, int stopat, +static inline int dma_map_cont(struct scatterlist *start, int nelems, struct scatterlist *sout, unsigned long pages, int need) { - if (!need) { - BUG_ON(stopat - start != 1); - *sout = sg[start]; - sout->dma_length = sg[start].length; + if (!need) { + BUG_ON(nelems != 1); + *sout = *start; + sout->dma_length = start->length; return 0; - } - return __dma_map_cont(sg, start, stopat, sout, pages); + } + return __dma_map_cont(start, nelems, sout, pages); } /* * DMA map all entries in a scatterlist. * Merge chunks that have page aligned sizes into a continuous mapping. 
*/ -int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) +static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, + int dir) { int i; int out; int start; unsigned long pages = 0; int need = 0, nextneed; + struct scatterlist *s, *ps, *start_sg, *sgmap; if (nents == 0) return 0; @@ -390,8 +394,9 @@ int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) out = 0; start = 0; - for (i = 0; i < nents; i++) { - struct scatterlist *s = &sg[i]; + start_sg = sgmap = sg; + ps = NULL; /* shut up gcc */ + for_each_sg(sg, s, nents, i) { dma_addr_t addr = page_to_phys(s->page) + s->offset; s->dma_address = addr; BUG_ON(s->length == 0); @@ -400,29 +405,33 @@ int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) /* Handle the previous not yet processed entries */ if (i > start) { - struct scatterlist *ps = &sg[i-1]; /* Can only merge when the last chunk ends on a page boundary and the new one doesn't have an offset. 
*/ if (!iommu_merge || !nextneed || !need || s->offset || - (ps->offset + ps->length) % PAGE_SIZE) { - if (dma_map_cont(sg, start, i, sg+out, pages, - need) < 0) + (ps->offset + ps->length) % PAGE_SIZE) { + if (dma_map_cont(start_sg, i - start, sgmap, + pages, need) < 0) goto error; out++; + sgmap = sg_next(sgmap); pages = 0; - start = i; + start = i; + start_sg = s; } } need = nextneed; pages += to_pages(s->offset, s->length); + ps = s; } - if (dma_map_cont(sg, start, i, sg+out, pages, need) < 0) + if (dma_map_cont(start_sg, i - start, sgmap, pages, need) < 0) goto error; out++; flush_gart(); - if (out < nents) - sg[out].dma_length = 0; + if (out < nents) { + sgmap = sg_next(sgmap); + sgmap->dma_length = 0; + } return out; error: @@ -437,8 +446,8 @@ error: if (panic_on_overflow) panic("dma_map_sg: overflow on %lu pages\n", pages); iommu_full(dev, pages << PAGE_SHIFT, dir); - for (i = 0; i < nents; i++) - sg[i].dma_address = bad_dma_address; + for_each_sg(sg, s, nents, i) + s->dma_address = bad_dma_address; return 0; } diff --git a/arch/x86/kernel/pci-nommu_64.c b/arch/x86/kernel/pci-nommu_64.c index 2a34c6c025a9..e85d4360360c 100644 --- a/arch/x86/kernel/pci-nommu_64.c +++ b/arch/x86/kernel/pci-nommu_64.c @@ -5,6 +5,7 @@ #include <linux/pci.h> #include <linux/string.h> #include <linux/dma-mapping.h> +#include <linux/scatterlist.h> #include <asm/iommu.h> #include <asm/processor.h> @@ -57,10 +58,10 @@ static void nommu_unmap_single(struct device *dev, dma_addr_t addr,size_t size, static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg, int nents, int direction) { + struct scatterlist *s; int i; - for (i = 0; i < nents; i++ ) { - struct scatterlist *s = &sg[i]; + for_each_sg(sg, s, nents, i) { BUG_ON(!s->page); s->dma_address = virt_to_bus(page_address(s->page) +s->offset); if (!check_addr("map_sg", hwdev, s->dma_address, s->length)) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 7352d4b377e6..6309b275cb9c 100644 --- 
a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -581,7 +581,7 @@ static inline void __switch_to_xtra(struct task_struct *prev_p, * * Kprobes not supported here. Set the probe on schedule instead. */ -__kprobes struct task_struct * +struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread, diff --git a/arch/x86/kernel/ptrace_32.c b/arch/x86/kernel/ptrace_32.c index 0cecd7513c97..99102ec5fade 100644 --- a/arch/x86/kernel/ptrace_32.c +++ b/arch/x86/kernel/ptrace_32.c @@ -165,7 +165,7 @@ static unsigned long convert_eip_to_linear(struct task_struct *child, struct pt_ seg &= ~7UL; - down(&child->mm->context.sem); + mutex_lock(&child->mm->context.lock); if (unlikely((seg >> 3) >= child->mm->context.size)) addr = -1L; /* bogus selector, access would fault */ else { @@ -179,7 +179,7 @@ static unsigned long convert_eip_to_linear(struct task_struct *child, struct pt_ addr &= 0xffff; addr += base; } - up(&child->mm->context.sem); + mutex_unlock(&child->mm->context.lock); } return addr; } @@ -524,11 +524,6 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) ret = 0; break; - case PTRACE_DETACH: - /* detach a process that was attached. */ - ret = ptrace_detach(child, data); - break; - case PTRACE_GETREGS: { /* Get all gp regs from the child. 
*/ if (!access_ok(VERIFY_WRITE, datap, FRAME_SIZE*sizeof(long))) { ret = -EIO; diff --git a/arch/x86/kernel/ptrace_64.c b/arch/x86/kernel/ptrace_64.c index c0cac42df3b6..607085f3f08a 100644 --- a/arch/x86/kernel/ptrace_64.c +++ b/arch/x86/kernel/ptrace_64.c @@ -103,7 +103,7 @@ unsigned long convert_rip_to_linear(struct task_struct *child, struct pt_regs *r seg &= ~7UL; - down(&child->mm->context.sem); + mutex_lock(&child->mm->context.lock); if (unlikely((seg >> 3) >= child->mm->context.size)) addr = -1L; /* bogus selector, access would fault */ else { @@ -117,7 +117,7 @@ unsigned long convert_rip_to_linear(struct task_struct *child, struct pt_regs *r addr &= 0xffff; addr += base; } - up(&child->mm->context.sem); + mutex_unlock(&child->mm->context.lock); } return addr; @@ -500,11 +500,6 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) ret = 0; break; - case PTRACE_DETACH: - /* detach a process that was attached. */ - ret = ptrace_detach(child, data); - break; - case PTRACE_GETREGS: { /* Get all gp regs from the child. 
*/ if (!access_ok(VERIFY_WRITE, (unsigned __user *)data, sizeof(struct user_regs_struct))) { diff --git a/arch/x86/kernel/setup64.c b/arch/x86/kernel/setup64.c index 1200aaac403e..ba9188235057 100644 --- a/arch/x86/kernel/setup64.c +++ b/arch/x86/kernel/setup64.c @@ -11,7 +11,6 @@ #include <linux/bootmem.h> #include <linux/bitops.h> #include <linux/module.h> -#include <asm/bootsetup.h> #include <asm/pda.h> #include <asm/pgtable.h> #include <asm/processor.h> @@ -23,8 +22,9 @@ #include <asm/percpu.h> #include <asm/proto.h> #include <asm/sections.h> +#include <asm/setup.h> -char x86_boot_params[BOOT_PARAM_SIZE] __initdata; +struct boot_params __initdata boot_params; cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index c8e1bc38d421..b87a6fd5ba48 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -137,10 +137,11 @@ EXPORT_SYMBOL(edd); */ static inline void copy_edd(void) { - memcpy(edd.mbr_signature, EDD_MBR_SIGNATURE, sizeof(edd.mbr_signature)); - memcpy(edd.edd_info, EDD_BUF, sizeof(edd.edd_info)); - edd.mbr_signature_nr = EDD_MBR_SIG_NR; - edd.edd_info_nr = EDD_NR; + memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer, + sizeof(edd.mbr_signature)); + memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info)); + edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries; + edd.edd_info_nr = boot_params.eddbuf_entries; } #else static inline void copy_edd(void) @@ -434,17 +435,20 @@ void __init setup_bootmem_allocator(void) #endif numa_kva_reserve(); #ifdef CONFIG_BLK_DEV_INITRD - if (LOADER_TYPE && INITRD_START) { - if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) { - reserve_bootmem(INITRD_START, INITRD_SIZE); - initrd_start = INITRD_START + PAGE_OFFSET; - initrd_end = initrd_start+INITRD_SIZE; - } - else { + if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) { + unsigned long ramdisk_image = boot_params.hdr.ramdisk_image; + 
unsigned long ramdisk_size = boot_params.hdr.ramdisk_size; + unsigned long ramdisk_end = ramdisk_image + ramdisk_size; + unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT; + + if (ramdisk_end <= end_of_lowmem) { + reserve_bootmem(ramdisk_image, ramdisk_size); + initrd_start = ramdisk_image + PAGE_OFFSET; + initrd_end = initrd_start+ramdisk_size; + } else { printk(KERN_ERR "initrd extends beyond end of memory " - "(0x%08lx > 0x%08lx)\ndisabling initrd\n", - INITRD_START + INITRD_SIZE, - max_low_pfn << PAGE_SHIFT); + "(0x%08lx > 0x%08lx)\ndisabling initrd\n", + ramdisk_end, end_of_lowmem); initrd_start = 0; } } @@ -512,28 +516,29 @@ void __init setup_arch(char **cmdline_p) * the system table is valid. If not, then initialize normally. */ #ifdef CONFIG_EFI - if ((LOADER_TYPE == 0x50) && EFI_SYSTAB) + if ((boot_params.hdr.type_of_loader == 0x50) && + boot_params.efi_info.efi_systab) efi_enabled = 1; #endif - ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV); - screen_info = SCREEN_INFO; - edid_info = EDID_INFO; - apm_info.bios = APM_BIOS_INFO; - ist_info = IST_INFO; - saved_videomode = VIDEO_MODE; - if( SYS_DESC_TABLE.length != 0 ) { - set_mca_bus(SYS_DESC_TABLE.table[3] & 0x2); - machine_id = SYS_DESC_TABLE.table[0]; - machine_submodel_id = SYS_DESC_TABLE.table[1]; - BIOS_revision = SYS_DESC_TABLE.table[2]; + ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev); + screen_info = boot_params.screen_info; + edid_info = boot_params.edid_info; + apm_info.bios = boot_params.apm_bios_info; + ist_info = boot_params.ist_info; + saved_videomode = boot_params.hdr.vid_mode; + if( boot_params.sys_desc_table.length != 0 ) { + set_mca_bus(boot_params.sys_desc_table.table[3] & 0x2); + machine_id = boot_params.sys_desc_table.table[0]; + machine_submodel_id = boot_params.sys_desc_table.table[1]; + BIOS_revision = boot_params.sys_desc_table.table[2]; } - bootloader_type = LOADER_TYPE; + bootloader_type = boot_params.hdr.type_of_loader; #ifdef CONFIG_BLK_DEV_RAM - rd_image_start = 
RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK; - rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0); - rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0); + rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK; + rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0); + rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0); #endif ARCH_SETUP if (efi_enabled) @@ -545,7 +550,7 @@ void __init setup_arch(char **cmdline_p) copy_edd(); - if (!MOUNT_ROOT_RDONLY) + if (!boot_params.hdr.root_flags) root_mountflags &= ~MS_RDONLY; init_mm.start_code = (unsigned long) _text; init_mm.end_code = (unsigned long) _etext; diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index b7da90e79c78..5a19f0cc5b67 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -52,7 +52,6 @@ #include <asm/dma.h> #include <asm/mpspec.h> #include <asm/mmu_context.h> -#include <asm/bootsetup.h> #include <asm/proto.h> #include <asm/setup.h> #include <asm/mach_apic.h> @@ -180,10 +179,11 @@ EXPORT_SYMBOL(edd); */ static inline void copy_edd(void) { - memcpy(edd.mbr_signature, EDD_MBR_SIGNATURE, sizeof(edd.mbr_signature)); - memcpy(edd.edd_info, EDD_BUF, sizeof(edd.edd_info)); - edd.mbr_signature_nr = EDD_MBR_SIG_NR; - edd.edd_info_nr = EDD_NR; + memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer, + sizeof(edd.mbr_signature)); + memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info)); + edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries; + edd.edd_info_nr = boot_params.eddbuf_entries; } #else static inline void copy_edd(void) @@ -220,21 +220,21 @@ void __init setup_arch(char **cmdline_p) { printk(KERN_INFO "Command line: %s\n", boot_command_line); - ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV); - screen_info = SCREEN_INFO; - edid_info = EDID_INFO; - saved_video_mode = SAVED_VIDEO_MODE; - bootloader_type = LOADER_TYPE; + ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev); + screen_info = 
boot_params.screen_info; + edid_info = boot_params.edid_info; + saved_video_mode = boot_params.hdr.vid_mode; + bootloader_type = boot_params.hdr.type_of_loader; #ifdef CONFIG_BLK_DEV_RAM - rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK; - rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0); - rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0); + rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK; + rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0); + rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0); #endif setup_memory_region(); copy_edd(); - if (!MOUNT_ROOT_RDONLY) + if (!boot_params.hdr.root_flags) root_mountflags &= ~MS_RDONLY; init_mm.start_code = (unsigned long) &_text; init_mm.end_code = (unsigned long) &_etext; @@ -339,17 +339,20 @@ void __init setup_arch(char **cmdline_p) */ find_smp_config(); #ifdef CONFIG_BLK_DEV_INITRD - if (LOADER_TYPE && INITRD_START) { - if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) { - reserve_bootmem_generic(INITRD_START, INITRD_SIZE); - initrd_start = INITRD_START + PAGE_OFFSET; - initrd_end = initrd_start+INITRD_SIZE; - } - else { + if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) { + unsigned long ramdisk_image = boot_params.hdr.ramdisk_image; + unsigned long ramdisk_size = boot_params.hdr.ramdisk_size; + unsigned long ramdisk_end = ramdisk_image + ramdisk_size; + unsigned long end_of_mem = end_pfn << PAGE_SHIFT; + + if (ramdisk_end <= end_of_mem) { + reserve_bootmem_generic(ramdisk_image, ramdisk_size); + initrd_start = ramdisk_image + PAGE_OFFSET; + initrd_end = initrd_start+ramdisk_size; + } else { printk(KERN_ERR "initrd extends beyond end of memory " - "(0x%08lx > 0x%08lx)\ndisabling initrd\n", - (unsigned long)(INITRD_START + INITRD_SIZE), - (unsigned long)(end_pfn << PAGE_SHIFT)); + "(0x%08lx > 0x%08lx)\ndisabling initrd\n", + ramdisk_end, end_of_mem); initrd_start = 0; } } @@ -601,7 +604,7 @@ static void __cpuinit 
init_amd(struct cpuinfo_x86 *c) level = cpuid_eax(1); if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)) set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability); - if (c->x86 == 0x10) + if (c->x86 == 0x10 || c->x86 == 0x11) set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability); /* Enable workaround for FXSAVE leak */ @@ -965,7 +968,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) * applications want to get the raw CPUID data, they should access * /dev/cpu/<cpu_nr>/cpuid instead. */ - static char *x86_cap_flags[] = { + static const char *const x86_cap_flags[] = { /* Intel-defined */ "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce", "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov", @@ -1019,7 +1022,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, }; - static char *x86_power_flags[] = { + static const char *const x86_power_flags[] = { "ts", /* temperature sensor */ "fid", /* frequency id control */ "vid", /* voltage id control */ @@ -1070,7 +1073,8 @@ static int show_cpuinfo(struct seq_file *m, void *v) if (smp_num_siblings * c->x86_max_cores > 1) { int cpu = c - cpu_data; seq_printf(m, "physical id\t: %d\n", c->phys_proc_id); - seq_printf(m, "siblings\t: %d\n", cpus_weight(cpu_core_map[cpu])); + seq_printf(m, "siblings\t: %d\n", + cpus_weight(per_cpu(cpu_core_map, cpu))); seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id); seq_printf(m, "cpu cores\t: %d\n", c->booted_cores); } diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index d01d51fcce2a..0d79df3c5631 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -385,7 +385,6 @@ static int setup_frame(int sig, struct k_sigaction *ka, regs->edx = (unsigned long) 0; regs->ecx = (unsigned long) 0; - set_fs(USER_DS); regs->xds = __USER_DS; regs->xes = __USER_DS; regs->xss = __USER_DS; @@ -479,7 +478,6 @@ static int setup_rt_frame(int 
sig, struct k_sigaction *ka, siginfo_t *info, regs->edx = (unsigned long) &frame->info; regs->ecx = (unsigned long) &frame->uc; - set_fs(USER_DS); regs->xds = __USER_DS; regs->xes = __USER_DS; regs->xss = __USER_DS; diff --git a/arch/x86/kernel/smp_32.c b/arch/x86/kernel/smp_32.c index 2d35d8502029..791d9f8036ae 100644 --- a/arch/x86/kernel/smp_32.c +++ b/arch/x86/kernel/smp_32.c @@ -342,6 +342,7 @@ fastcall void smp_invalidate_interrupt(struct pt_regs *regs) smp_mb__after_clear_bit(); out: put_cpu_no_resched(); + __get_cpu_var(irq_stat).irq_tlb_count++; } void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm, @@ -640,6 +641,7 @@ static void native_smp_send_stop(void) fastcall void smp_reschedule_interrupt(struct pt_regs *regs) { ack_APIC_irq(); + __get_cpu_var(irq_stat).irq_resched_count++; } fastcall void smp_call_function_interrupt(struct pt_regs *regs) @@ -660,6 +662,7 @@ fastcall void smp_call_function_interrupt(struct pt_regs *regs) */ irq_enter(); (*func)(info); + __get_cpu_var(irq_stat).irq_call_count++; irq_exit(); if (wait) { @@ -705,3 +708,10 @@ struct smp_ops smp_ops = { .smp_send_reschedule = native_smp_send_reschedule, .smp_call_function_mask = native_smp_call_function_mask, }; + +int smp_call_function_mask(cpumask_t mask, void (*func) (void *info), + void *info, int wait) +{ + return smp_ops.smp_call_function_mask(mask, func, info, wait); +} +EXPORT_SYMBOL(smp_call_function_mask); diff --git a/arch/x86/kernel/smp_64.c b/arch/x86/kernel/smp_64.c index df4a82812adb..5c2964727d19 100644 --- a/arch/x86/kernel/smp_64.c +++ b/arch/x86/kernel/smp_64.c @@ -163,6 +163,7 @@ asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs) out: ack_APIC_irq(); cpu_clear(cpu, f->flush_cpumask); + add_pda(irq_tlb_count, 1); } static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, @@ -493,6 +494,7 @@ void smp_send_stop(void) asmlinkage void smp_reschedule_interrupt(void) { ack_APIC_irq(); + add_pda(irq_resched_count, 1); } 
asmlinkage void smp_call_function_interrupt(void) @@ -514,6 +516,7 @@ asmlinkage void smp_call_function_interrupt(void) exit_idle(); irq_enter(); (*func)(info); + add_pda(irq_call_count, 1); irq_exit(); if (wait) { mb(); diff --git a/arch/x86/kernel/smpboot_32.c b/arch/x86/kernel/smpboot_32.c index e4f61d1c6248..be3faac04719 100644 --- a/arch/x86/kernel/smpboot_32.c +++ b/arch/x86/kernel/smpboot_32.c @@ -70,12 +70,12 @@ EXPORT_SYMBOL(smp_num_siblings); int cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... NR_CPUS-1] = BAD_APICID}; /* representing HT siblings of each logical CPU */ -cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly; -EXPORT_SYMBOL(cpu_sibling_map); +DEFINE_PER_CPU(cpumask_t, cpu_sibling_map); +EXPORT_PER_CPU_SYMBOL(cpu_sibling_map); /* representing HT and core siblings of each logical CPU */ -cpumask_t cpu_core_map[NR_CPUS] __read_mostly; -EXPORT_SYMBOL(cpu_core_map); +DEFINE_PER_CPU(cpumask_t, cpu_core_map); +EXPORT_PER_CPU_SYMBOL(cpu_core_map); /* bitmap of online cpus */ cpumask_t cpu_online_map __read_mostly; @@ -102,8 +102,8 @@ u8 apicid_2_node[MAX_APICID]; * Trampoline 80x86 program as an array. */ -extern unsigned char trampoline_data []; -extern unsigned char trampoline_end []; +extern const unsigned char trampoline_data []; +extern const unsigned char trampoline_end []; static unsigned char *trampoline_base; static int trampoline_exec; @@ -118,7 +118,7 @@ DEFINE_PER_CPU(int, cpu_state) = { 0 }; * has made sure it's suitably aligned. 
*/ -static unsigned long __devinit setup_trampoline(void) +static unsigned long __cpuinit setup_trampoline(void) { memcpy(trampoline_base, trampoline_data, trampoline_end - trampoline_data); return virt_to_phys(trampoline_base); @@ -300,7 +300,7 @@ cpumask_t cpu_coregroup_map(int cpu) * And for power savings, we return cpu_core_map */ if (sched_mc_power_savings || sched_smt_power_savings) - return cpu_core_map[cpu]; + return per_cpu(cpu_core_map, cpu); else return c->llc_shared_map; } @@ -319,22 +319,22 @@ void __cpuinit set_cpu_sibling_map(int cpu) for_each_cpu_mask(i, cpu_sibling_setup_map) { if (c[cpu].phys_proc_id == c[i].phys_proc_id && c[cpu].cpu_core_id == c[i].cpu_core_id) { - cpu_set(i, cpu_sibling_map[cpu]); - cpu_set(cpu, cpu_sibling_map[i]); - cpu_set(i, cpu_core_map[cpu]); - cpu_set(cpu, cpu_core_map[i]); + cpu_set(i, per_cpu(cpu_sibling_map, cpu)); + cpu_set(cpu, per_cpu(cpu_sibling_map, i)); + cpu_set(i, per_cpu(cpu_core_map, cpu)); + cpu_set(cpu, per_cpu(cpu_core_map, i)); cpu_set(i, c[cpu].llc_shared_map); cpu_set(cpu, c[i].llc_shared_map); } } } else { - cpu_set(cpu, cpu_sibling_map[cpu]); + cpu_set(cpu, per_cpu(cpu_sibling_map, cpu)); } cpu_set(cpu, c[cpu].llc_shared_map); if (current_cpu_data.x86_max_cores == 1) { - cpu_core_map[cpu] = cpu_sibling_map[cpu]; + per_cpu(cpu_core_map, cpu) = per_cpu(cpu_sibling_map, cpu); c[cpu].booted_cores = 1; return; } @@ -346,17 +346,17 @@ void __cpuinit set_cpu_sibling_map(int cpu) cpu_set(cpu, c[i].llc_shared_map); } if (c[cpu].phys_proc_id == c[i].phys_proc_id) { - cpu_set(i, cpu_core_map[cpu]); - cpu_set(cpu, cpu_core_map[i]); + cpu_set(i, per_cpu(cpu_core_map, cpu)); + cpu_set(cpu, per_cpu(cpu_core_map, i)); /* * Does this new cpu bringup a new core? 
*/ - if (cpus_weight(cpu_sibling_map[cpu]) == 1) { + if (cpus_weight(per_cpu(cpu_sibling_map, cpu)) == 1) { /* * for each core in package, increment * the booted_cores for this new cpu */ - if (first_cpu(cpu_sibling_map[i]) == i) + if (first_cpu(per_cpu(cpu_sibling_map, i)) == i) c[cpu].booted_cores++; /* * increment the core count for all @@ -983,8 +983,8 @@ static void __init smp_boot_cpus(unsigned int max_cpus) printk(KERN_NOTICE "Local APIC not detected." " Using dummy APIC emulation.\n"); map_cpu_to_logical_apicid(); - cpu_set(0, cpu_sibling_map[0]); - cpu_set(0, cpu_core_map[0]); + cpu_set(0, per_cpu(cpu_sibling_map, 0)); + cpu_set(0, per_cpu(cpu_core_map, 0)); return; } @@ -1008,8 +1008,8 @@ static void __init smp_boot_cpus(unsigned int max_cpus) printk(KERN_ERR "... forcing use of dummy APIC emulation. (tell your hw vendor)\n"); smpboot_clear_io_apic_irqs(); phys_cpu_present_map = physid_mask_of_physid(0); - cpu_set(0, cpu_sibling_map[0]); - cpu_set(0, cpu_core_map[0]); + cpu_set(0, per_cpu(cpu_sibling_map, 0)); + cpu_set(0, per_cpu(cpu_core_map, 0)); return; } @@ -1021,10 +1021,16 @@ static void __init smp_boot_cpus(unsigned int max_cpus) if (!max_cpus) { smp_found_config = 0; printk(KERN_INFO "SMP mode deactivated, forcing use of dummy APIC emulation.\n"); + + if (nmi_watchdog == NMI_LOCAL_APIC) { + printk(KERN_INFO "activating minimal APIC for NMI watchdog use.\n"); + connect_bsp_APIC(); + setup_local_APIC(); + } smpboot_clear_io_apic_irqs(); phys_cpu_present_map = physid_mask_of_physid(0); - cpu_set(0, cpu_sibling_map[0]); - cpu_set(0, cpu_core_map[0]); + cpu_set(0, per_cpu(cpu_sibling_map, 0)); + cpu_set(0, per_cpu(cpu_core_map, 0)); return; } @@ -1102,16 +1108,16 @@ static void __init smp_boot_cpus(unsigned int max_cpus) Dprintk("Boot done.\n"); /* - * construct cpu_sibling_map[], so that we can tell sibling CPUs + * construct cpu_sibling_map, so that we can tell sibling CPUs * efficiently. 
*/ for (cpu = 0; cpu < NR_CPUS; cpu++) { - cpus_clear(cpu_sibling_map[cpu]); - cpus_clear(cpu_core_map[cpu]); + cpus_clear(per_cpu(cpu_sibling_map, cpu)); + cpus_clear(per_cpu(cpu_core_map, cpu)); } - cpu_set(0, cpu_sibling_map[0]); - cpu_set(0, cpu_core_map[0]); + cpu_set(0, per_cpu(cpu_sibling_map, 0)); + cpu_set(0, per_cpu(cpu_core_map, 0)); smpboot_setup_io_apic(); @@ -1148,19 +1154,19 @@ void remove_siblinginfo(int cpu) int sibling; struct cpuinfo_x86 *c = cpu_data; - for_each_cpu_mask(sibling, cpu_core_map[cpu]) { - cpu_clear(cpu, cpu_core_map[sibling]); - /* + for_each_cpu_mask(sibling, per_cpu(cpu_core_map, cpu)) { + cpu_clear(cpu, per_cpu(cpu_core_map, sibling)); + /*/ * last thread sibling in this cpu core going down */ - if (cpus_weight(cpu_sibling_map[cpu]) == 1) + if (cpus_weight(per_cpu(cpu_sibling_map, cpu)) == 1) c[sibling].booted_cores--; } - for_each_cpu_mask(sibling, cpu_sibling_map[cpu]) - cpu_clear(cpu, cpu_sibling_map[sibling]); - cpus_clear(cpu_sibling_map[cpu]); - cpus_clear(cpu_core_map[cpu]); + for_each_cpu_mask(sibling, per_cpu(cpu_sibling_map, cpu)) + cpu_clear(cpu, per_cpu(cpu_sibling_map, sibling)); + cpus_clear(per_cpu(cpu_sibling_map, cpu)); + cpus_clear(per_cpu(cpu_core_map, cpu)); c[cpu].phys_proc_id = 0; c[cpu].cpu_core_id = 0; cpu_clear(cpu, cpu_sibling_setup_map); diff --git a/arch/x86/kernel/smpboot_64.c b/arch/x86/kernel/smpboot_64.c index 720a7d1f8862..e351ac4ab5b1 100644 --- a/arch/x86/kernel/smpboot_64.c +++ b/arch/x86/kernel/smpboot_64.c @@ -91,19 +91,19 @@ EXPORT_SYMBOL(cpu_data); int smp_threads_ready; /* representing HT siblings of each logical CPU */ -cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly; -EXPORT_SYMBOL(cpu_sibling_map); +DEFINE_PER_CPU(cpumask_t, cpu_sibling_map); +EXPORT_PER_CPU_SYMBOL(cpu_sibling_map); /* representing HT and core siblings of each logical CPU */ -cpumask_t cpu_core_map[NR_CPUS] __read_mostly; -EXPORT_SYMBOL(cpu_core_map); +DEFINE_PER_CPU(cpumask_t, cpu_core_map); 
+EXPORT_PER_CPU_SYMBOL(cpu_core_map); /* * Trampoline 80x86 program as an array. */ -extern unsigned char trampoline_data[]; -extern unsigned char trampoline_end[]; +extern const unsigned char trampoline_data[]; +extern const unsigned char trampoline_end[]; /* State of each CPU */ DEFINE_PER_CPU(int, cpu_state) = { 0 }; @@ -243,7 +243,7 @@ cpumask_t cpu_coregroup_map(int cpu) * And for power savings, we return cpu_core_map */ if (sched_mc_power_savings || sched_smt_power_savings) - return cpu_core_map[cpu]; + return per_cpu(cpu_core_map, cpu); else return c->llc_shared_map; } @@ -262,22 +262,22 @@ static inline void set_cpu_sibling_map(int cpu) for_each_cpu_mask(i, cpu_sibling_setup_map) { if (c[cpu].phys_proc_id == c[i].phys_proc_id && c[cpu].cpu_core_id == c[i].cpu_core_id) { - cpu_set(i, cpu_sibling_map[cpu]); - cpu_set(cpu, cpu_sibling_map[i]); - cpu_set(i, cpu_core_map[cpu]); - cpu_set(cpu, cpu_core_map[i]); + cpu_set(i, per_cpu(cpu_sibling_map, cpu)); + cpu_set(cpu, per_cpu(cpu_sibling_map, i)); + cpu_set(i, per_cpu(cpu_core_map, cpu)); + cpu_set(cpu, per_cpu(cpu_core_map, i)); cpu_set(i, c[cpu].llc_shared_map); cpu_set(cpu, c[i].llc_shared_map); } } } else { - cpu_set(cpu, cpu_sibling_map[cpu]); + cpu_set(cpu, per_cpu(cpu_sibling_map, cpu)); } cpu_set(cpu, c[cpu].llc_shared_map); if (current_cpu_data.x86_max_cores == 1) { - cpu_core_map[cpu] = cpu_sibling_map[cpu]; + per_cpu(cpu_core_map, cpu) = per_cpu(cpu_sibling_map, cpu); c[cpu].booted_cores = 1; return; } @@ -289,17 +289,17 @@ static inline void set_cpu_sibling_map(int cpu) cpu_set(cpu, c[i].llc_shared_map); } if (c[cpu].phys_proc_id == c[i].phys_proc_id) { - cpu_set(i, cpu_core_map[cpu]); - cpu_set(cpu, cpu_core_map[i]); + cpu_set(i, per_cpu(cpu_core_map, cpu)); + cpu_set(cpu, per_cpu(cpu_core_map, i)); /* * Does this new cpu bringup a new core? 
*/ - if (cpus_weight(cpu_sibling_map[cpu]) == 1) { + if (cpus_weight(per_cpu(cpu_sibling_map, cpu)) == 1) { /* * for each core in package, increment * the booted_cores for this new cpu */ - if (first_cpu(cpu_sibling_map[i]) == i) + if (first_cpu(per_cpu(cpu_sibling_map, i)) == i) c[cpu].booted_cores++; /* * increment the core count for all @@ -695,7 +695,6 @@ do_rest: cpu_clear(cpu, cpu_present_map); cpu_clear(cpu, cpu_possible_map); x86_cpu_to_apicid[cpu] = BAD_APICID; - x86_cpu_to_log_apicid[cpu] = BAD_APICID; return -EIO; } @@ -735,8 +734,8 @@ static __init void disable_smp(void) phys_cpu_present_map = physid_mask_of_physid(boot_cpu_id); else phys_cpu_present_map = physid_mask_of_physid(0); - cpu_set(0, cpu_sibling_map[0]); - cpu_set(0, cpu_core_map[0]); + cpu_set(0, per_cpu(cpu_sibling_map, 0)); + cpu_set(0, per_cpu(cpu_core_map, 0)); } #ifdef CONFIG_HOTPLUG_CPU @@ -971,19 +970,19 @@ static void remove_siblinginfo(int cpu) int sibling; struct cpuinfo_x86 *c = cpu_data; - for_each_cpu_mask(sibling, cpu_core_map[cpu]) { - cpu_clear(cpu, cpu_core_map[sibling]); + for_each_cpu_mask(sibling, per_cpu(cpu_core_map, cpu)) { + cpu_clear(cpu, per_cpu(cpu_core_map, sibling)); /* * last thread sibling in this cpu core going down */ - if (cpus_weight(cpu_sibling_map[cpu]) == 1) + if (cpus_weight(per_cpu(cpu_sibling_map, cpu)) == 1) c[sibling].booted_cores--; } - for_each_cpu_mask(sibling, cpu_sibling_map[cpu]) - cpu_clear(cpu, cpu_sibling_map[sibling]); - cpus_clear(cpu_sibling_map[cpu]); - cpus_clear(cpu_core_map[cpu]); + for_each_cpu_mask(sibling, per_cpu(cpu_sibling_map, cpu)) + cpu_clear(cpu, per_cpu(cpu_sibling_map, sibling)); + cpus_clear(per_cpu(cpu_sibling_map, cpu)); + cpus_clear(per_cpu(cpu_core_map, cpu)); c[cpu].phys_proc_id = 0; c[cpu].cpu_core_id = 0; cpu_clear(cpu, cpu_sibling_setup_map); diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 413e527cdeb9..6fa6cf036c70 100644 --- a/arch/x86/kernel/stacktrace.c +++ 
b/arch/x86/kernel/stacktrace.c @@ -33,7 +33,7 @@ static void save_stack_address(void *data, unsigned long addr) trace->entries[trace->nr_entries++] = addr; } -static struct stacktrace_ops save_stack_ops = { +static const struct stacktrace_ops save_stack_ops = { .warning = save_stack_warning, .warning_symbol = save_stack_warning_symbol, .stack = save_stack_stack, diff --git a/arch/x86/kernel/suspend_64.c b/arch/x86/kernel/suspend_64.c index 573c0a6e0ac6..f8fafe527ff1 100644 --- a/arch/x86/kernel/suspend_64.c +++ b/arch/x86/kernel/suspend_64.c @@ -150,8 +150,22 @@ void fix_processor_context(void) /* Defined in arch/x86_64/kernel/suspend_asm.S */ extern int restore_image(void); +/* + * Address to jump to in the last phase of restore in order to get to the image + * kernel's text (this value is passed in the image header). + */ +unsigned long restore_jump_address; + +/* + * Value of the cr3 register from before the hibernation (this value is passed + * in the image header). + */ +unsigned long restore_cr3; + pgd_t *temp_level4_pgt; +void *relocated_restore_code; + static int res_phys_pud_init(pud_t *pud, unsigned long address, unsigned long end) { long i, j; @@ -175,7 +189,7 @@ static int res_phys_pud_init(pud_t *pud, unsigned long address, unsigned long en if (paddr >= end) break; - pe = _PAGE_NX | _PAGE_PSE | _KERNPG_TABLE | paddr; + pe = __PAGE_KERNEL_LARGE_EXEC | paddr; pe &= __supported_pte_mask; set_pmd(pmd, __pmd(pe)); } @@ -183,25 +197,42 @@ static int res_phys_pud_init(pud_t *pud, unsigned long address, unsigned long en return 0; } +static int res_kernel_text_pud_init(pud_t *pud, unsigned long start) +{ + pmd_t *pmd; + unsigned long paddr; + + pmd = (pmd_t *)get_safe_page(GFP_ATOMIC); + if (!pmd) + return -ENOMEM; + set_pud(pud + pud_index(start), __pud(__pa(pmd) | _KERNPG_TABLE)); + for (paddr = 0; paddr < KERNEL_TEXT_SIZE; pmd++, paddr += PMD_SIZE) { + unsigned long pe; + + pe = __PAGE_KERNEL_LARGE_EXEC | _PAGE_GLOBAL | paddr; + pe &= __supported_pte_mask; + 
set_pmd(pmd, __pmd(pe)); + } + + return 0; +} + static int set_up_temporary_mappings(void) { unsigned long start, end, next; + pud_t *pud; int error; temp_level4_pgt = (pgd_t *)get_safe_page(GFP_ATOMIC); if (!temp_level4_pgt) return -ENOMEM; - /* It is safe to reuse the original kernel mapping */ - set_pgd(temp_level4_pgt + pgd_index(__START_KERNEL_map), - init_level4_pgt[pgd_index(__START_KERNEL_map)]); - /* Set up the direct mapping from scratch */ start = (unsigned long)pfn_to_kaddr(0); end = (unsigned long)pfn_to_kaddr(end_pfn); for (; start < end; start = next) { - pud_t *pud = (pud_t *)get_safe_page(GFP_ATOMIC); + pud = (pud_t *)get_safe_page(GFP_ATOMIC); if (!pud) return -ENOMEM; next = start + PGDIR_SIZE; @@ -212,7 +243,17 @@ static int set_up_temporary_mappings(void) set_pgd(temp_level4_pgt + pgd_index(start), mk_kernel_pgd(__pa(pud))); } - return 0; + + /* Set up the kernel text mapping from scratch */ + pud = (pud_t *)get_safe_page(GFP_ATOMIC); + if (!pud) + return -ENOMEM; + error = res_kernel_text_pud_init(pud, __START_KERNEL_map); + if (!error) + set_pgd(temp_level4_pgt + pgd_index(__START_KERNEL_map), + __pgd(__pa(pud) | _PAGE_TABLE)); + + return error; } int swsusp_arch_resume(void) @@ -222,6 +263,13 @@ int swsusp_arch_resume(void) /* We have got enough memory and from now on we cannot recover */ if ((error = set_up_temporary_mappings())) return error; + + relocated_restore_code = (void *)get_safe_page(GFP_ATOMIC); + if (!relocated_restore_code) + return -ENOMEM; + memcpy(relocated_restore_code, &core_restore_code, + &restore_registers - &core_restore_code); + restore_image(); return 0; } @@ -236,4 +284,43 @@ int pfn_is_nosave(unsigned long pfn) unsigned long nosave_end_pfn = PAGE_ALIGN(__pa_symbol(&__nosave_end)) >> PAGE_SHIFT; return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); } + +struct restore_data_record { + unsigned long jump_address; + unsigned long cr3; + unsigned long magic; +}; + +#define RESTORE_MAGIC 0x0123456789ABCDEFUL + +/** 
+ * arch_hibernation_header_save - populate the architecture specific part + * of a hibernation image header + * @addr: address to save the data at + */ +int arch_hibernation_header_save(void *addr, unsigned int max_size) +{ + struct restore_data_record *rdr = addr; + + if (max_size < sizeof(struct restore_data_record)) + return -EOVERFLOW; + rdr->jump_address = restore_jump_address; + rdr->cr3 = restore_cr3; + rdr->magic = RESTORE_MAGIC; + return 0; +} + +/** + * arch_hibernation_header_restore - read the architecture specific data + * from the hibernation image header + * @addr: address to read the data from + */ +int arch_hibernation_header_restore(void *addr) +{ + struct restore_data_record *rdr = addr; + + restore_jump_address = rdr->jump_address; + restore_cr3 = rdr->cr3; + return (rdr->magic == RESTORE_MAGIC) ? 0 : -EINVAL; +} #endif /* CONFIG_HIBERNATION */ diff --git a/arch/x86/kernel/suspend_asm_64.S b/arch/x86/kernel/suspend_asm_64.S index 16d183f67bc1..48344b666d2c 100644 --- a/arch/x86/kernel/suspend_asm_64.S +++ b/arch/x86/kernel/suspend_asm_64.S @@ -2,8 +2,8 @@ * * Distribute under GPLv2. * - * swsusp_arch_resume may not use any stack, nor any variable that is - * not "NoSave" during copying pages: + * swsusp_arch_resume must not use any stack or any nonlocal variables while + * copying pages: * * Its rewriting one kernel image with another. 
What is stack in "old" * image could very well be data page in "new" image, and overwriting @@ -36,6 +36,13 @@ ENTRY(swsusp_arch_suspend) movq %r15, saved_context_r15(%rip) pushfq ; popq saved_context_eflags(%rip) + /* save the address of restore_registers */ + movq $restore_registers, %rax + movq %rax, restore_jump_address(%rip) + /* save cr3 */ + movq %cr3, %rax + movq %rax, restore_cr3(%rip) + call swsusp_save ret @@ -54,7 +61,17 @@ ENTRY(restore_image) movq %rcx, %cr3; movq %rax, %cr4; # turn PGE back on + /* prepare to jump to the image kernel */ + movq restore_jump_address(%rip), %rax + movq restore_cr3(%rip), %rbx + + /* prepare to copy image data to their original locations */ movq restore_pblist(%rip), %rdx + movq relocated_restore_code(%rip), %rcx + jmpq *%rcx + + /* code below has been relocated to a safe page */ +ENTRY(core_restore_code) loop: testq %rdx, %rdx jz done @@ -62,7 +79,7 @@ loop: /* get addresses from the pbe and copy the page */ movq pbe_address(%rdx), %rsi movq pbe_orig_address(%rdx), %rdi - movq $512, %rcx + movq $(PAGE_SIZE >> 3), %rcx rep movsq @@ -70,10 +87,22 @@ loop: movq pbe_next(%rdx), %rdx jmp loop done: + /* jump to the restore_registers address from the image header */ + jmpq *%rax + /* + * NOTE: This assumes that the boot kernel's text mapping covers the + * image kernel's page containing restore_registers and the address of + * this page is the same as in the image kernel's text mapping (it + * should always be true, because the text mapping is linear, starting + * from 0, and is supposed to cover the entire kernel text for every + * kernel). 
+ * + * code below belongs to the image kernel + */ + +ENTRY(restore_registers) /* go back to the original page tables */ - movq $(init_level4_pgt - __START_KERNEL_map), %rax - addq phys_base(%rip), %rax - movq %rax, %cr3 + movq %rbx, %cr3 /* Flush TLB, including "global" things (vmalloc) */ movq mmu_cr4_features(%rip), %rax @@ -84,12 +113,9 @@ done: movq %rcx, %cr3 movq %rax, %cr4; # turn PGE back on - movl $24, %eax - movl %eax, %ds - movq saved_context_esp(%rip), %rsp movq saved_context_ebp(%rip), %rbp - /* Don't restore %rax, it must be 0 anyway */ + /* restore GPRs (we don't restore %rax, it must be 0 anyway) */ movq saved_context_ebx(%rip), %rbx movq saved_context_ecx(%rip), %rcx movq saved_context_edx(%rip), %rdx @@ -107,4 +133,7 @@ done: xorq %rax, %rax + /* tell the hibernation core that we've just restored the memory */ + movq %rax, in_suspend(%rip) + ret diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c index f8bae9ba0324..a86d26f036e1 100644 --- a/arch/x86/kernel/sys_i386_32.c +++ b/arch/x86/kernel/sys_i386_32.c @@ -17,10 +17,10 @@ #include <linux/mman.h> #include <linux/file.h> #include <linux/utsname.h> +#include <linux/ipc.h> #include <asm/uaccess.h> #include <asm/unistd.h> -#include <asm/ipc.h> /* * sys_pipe() is the normal C calling standard for creating diff --git a/arch/x86/kernel/tce_64.c b/arch/x86/kernel/tce_64.c index e3f2569b2c44..9e540fee7009 100644 --- a/arch/x86/kernel/tce_64.c +++ b/arch/x86/kernel/tce_64.c @@ -40,9 +40,9 @@ static inline void flush_tce(void* tceaddr) { /* a single tce can't cross a cache line */ if (cpu_has_clflush) - asm volatile("clflush (%0)" :: "r" (tceaddr)); + clflush(tceaddr); else - asm volatile("wbinvd":::"memory"); + wbinvd(); } void tce_build(struct iommu_table *tbl, unsigned long index, diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c index c25f23eb397c..8caa0b777466 100644 --- a/arch/x86/kernel/topology.c +++ b/arch/x86/kernel/topology.c @@ -44,15 +44,15 @@ int 
arch_register_cpu(int num) * Also certain PCI quirks require not to enable hotplug control * for all CPU's. */ - if (num && enable_cpu_hotplug) +#ifdef CONFIG_HOTPLUG_CPU + if (num) cpu_devices[num].cpu.hotpluggable = 1; +#endif return register_cpu(&cpu_devices[num].cpu, num); } #ifdef CONFIG_HOTPLUG_CPU -int enable_cpu_hotplug = 1; - void arch_unregister_cpu(int num) { return unregister_cpu(&cpu_devices[num].cpu); } diff --git a/arch/x86/kernel/trampoline_32.S b/arch/x86/kernel/trampoline_32.S index f62815f8d06a..9bcc1c6aca3d 100644 --- a/arch/x86/kernel/trampoline_32.S +++ b/arch/x86/kernel/trampoline_32.S @@ -36,11 +36,11 @@ #include <asm/segment.h> #include <asm/page.h> -.data - /* We can free up trampoline after bootup if cpu hotplug is not supported. */ #ifndef CONFIG_HOTPLUG_CPU .section ".init.data","aw",@progbits +#else +.section .rodata,"a",@progbits #endif .code16 diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S index 607983b0d27b..e30b67c6a9f5 100644 --- a/arch/x86/kernel/trampoline_64.S +++ b/arch/x86/kernel/trampoline_64.S @@ -33,7 +33,12 @@ #include <asm/msr.h> #include <asm/segment.h> -.data +/* We can free up trampoline after bootup if cpu hotplug is not supported. 
*/ +#ifndef CONFIG_HOTPLUG_CPU +.section .init.data, "aw", @progbits +#else +.section .rodata, "a", @progbits +#endif .code16 diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 05c27ecaf2a7..b132d3957dfc 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -112,7 +112,7 @@ struct stack_frame { static inline unsigned long print_context_stack(struct thread_info *tinfo, unsigned long *stack, unsigned long ebp, - struct stacktrace_ops *ops, void *data) + const struct stacktrace_ops *ops, void *data) { #ifdef CONFIG_FRAME_POINTER struct stack_frame *frame = (struct stack_frame *)ebp; @@ -149,7 +149,7 @@ static inline unsigned long print_context_stack(struct thread_info *tinfo, void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, - struct stacktrace_ops *ops, void *data) + const struct stacktrace_ops *ops, void *data) { unsigned long ebp = 0; @@ -221,7 +221,7 @@ static void print_trace_address(void *data, unsigned long addr) touch_nmi_watchdog(); } -static struct stacktrace_ops print_trace_ops = { +static const struct stacktrace_ops print_trace_ops = { .warning = print_trace_warning, .warning_symbol = print_trace_warning_symbol, .stack = print_trace_stack, @@ -398,31 +398,24 @@ void die(const char * str, struct pt_regs * regs, long err) local_save_flags(flags); if (++die.lock_owner_depth < 3) { - int nl = 0; unsigned long esp; unsigned short ss; report_bug(regs->eip, regs); - printk(KERN_EMERG "%s: %04lx [#%d]\n", str, err & 0xffff, ++die_counter); + printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, + ++die_counter); #ifdef CONFIG_PREEMPT - printk(KERN_EMERG "PREEMPT "); - nl = 1; + printk("PREEMPT "); #endif #ifdef CONFIG_SMP - if (!nl) - printk(KERN_EMERG); printk("SMP "); - nl = 1; #endif #ifdef CONFIG_DEBUG_PAGEALLOC - if (!nl) - printk(KERN_EMERG); printk("DEBUG_PAGEALLOC"); - nl = 1; #endif - if (nl) - printk("\n"); + printk("\n"); + if (notify_die(DIE_OOPS, str, regs, err, 
current->thread.trap_no, SIGSEGV) != NOTIFY_STOP) { @@ -1112,20 +1105,6 @@ asmlinkage void math_emulate(long arg) #endif /* CONFIG_MATH_EMULATION */ -#ifdef CONFIG_X86_F00F_BUG -void __init trap_init_f00f_bug(void) -{ - __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO); - - /* - * Update the IDT descriptor and reload the IDT so that - * it uses the read-only mapped virtual address. - */ - idt_descr.address = fix_to_virt(FIX_F00F_IDT); - load_idt(&idt_descr); -} -#endif - /* * This needs to use 'idt_table' rather than 'idt', and * thus use the _nonmapped_ version of the IDT, as the diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index bc7116acf8ff..b4a9b3db1994 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -215,7 +215,7 @@ static inline int valid_stack_ptr(struct thread_info *tinfo, void *p) void dump_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack, - struct stacktrace_ops *ops, void *data) + const struct stacktrace_ops *ops, void *data) { const unsigned cpu = get_cpu(); unsigned long *irqstack_end = (unsigned long*)cpu_pda(cpu)->irqstackptr; @@ -336,7 +336,7 @@ static void print_trace_address(void *data, unsigned long addr) printk_address(addr); } -static struct stacktrace_ops print_trace_ops = { +static const struct stacktrace_ops print_trace_ops = { .warning = print_trace_warning, .warning_symbol = print_trace_warning_symbol, .stack = print_trace_stack, diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c index b85ad754f70e..e87a3939ed40 100644 --- a/arch/x86/kernel/tsc_32.c +++ b/arch/x86/kernel/tsc_32.c @@ -349,10 +349,10 @@ __cpuinit int unsynchronized_tsc(void) static void __init check_geode_tsc_reliable(void) { - unsigned long val; + unsigned long res_low, res_high; - rdmsrl(MSR_GEODE_BUSCONT_CONF0, val); - if ((val & RTSC_SUSP)) + rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high); + if (res_low & RTSC_SUSP) clocksource_tsc.flags &= 
~CLOCK_SOURCE_MUST_VERIFY; } #else diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 18673e0f193b..f02bad68abaa 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -134,21 +134,21 @@ static unsigned vmi_patch(u8 type, u16 clobbers, void *insns, unsigned long eip, unsigned len) { switch (type) { - case PARAVIRT_PATCH(irq_disable): + case PARAVIRT_PATCH(pv_irq_ops.irq_disable): return patch_internal(VMI_CALL_DisableInterrupts, len, insns, eip); - case PARAVIRT_PATCH(irq_enable): + case PARAVIRT_PATCH(pv_irq_ops.irq_enable): return patch_internal(VMI_CALL_EnableInterrupts, len, insns, eip); - case PARAVIRT_PATCH(restore_fl): + case PARAVIRT_PATCH(pv_irq_ops.restore_fl): return patch_internal(VMI_CALL_SetInterruptMask, len, insns, eip); - case PARAVIRT_PATCH(save_fl): + case PARAVIRT_PATCH(pv_irq_ops.save_fl): return patch_internal(VMI_CALL_GetInterruptMask, len, insns, eip); - case PARAVIRT_PATCH(iret): + case PARAVIRT_PATCH(pv_cpu_ops.iret): return patch_internal(VMI_CALL_IRET, len, insns, eip); - case PARAVIRT_PATCH(irq_enable_sysexit): + case PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit): return patch_internal(VMI_CALL_SYSEXIT, len, insns, eip); default: break; @@ -552,24 +552,22 @@ vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip, } #endif -static void vmi_set_lazy_mode(enum paravirt_lazy_mode mode) +static void vmi_enter_lazy_cpu(void) { - static DEFINE_PER_CPU(enum paravirt_lazy_mode, lazy_mode); - - if (!vmi_ops.set_lazy_mode) - return; + paravirt_enter_lazy_cpu(); + vmi_ops.set_lazy_mode(2); +} - /* Modes should never nest or overlap */ - BUG_ON(__get_cpu_var(lazy_mode) && !(mode == PARAVIRT_LAZY_NONE || - mode == PARAVIRT_LAZY_FLUSH)); +static void vmi_enter_lazy_mmu(void) +{ + paravirt_enter_lazy_mmu(); + vmi_ops.set_lazy_mode(1); +} - if (mode == PARAVIRT_LAZY_FLUSH) { - vmi_ops.set_lazy_mode(0); - vmi_ops.set_lazy_mode(__get_cpu_var(lazy_mode)); - } else { - vmi_ops.set_lazy_mode(mode); - 
__get_cpu_var(lazy_mode) = mode; - } +static void vmi_leave_lazy(void) +{ + paravirt_leave_lazy(paravirt_get_lazy_mode()); + vmi_ops.set_lazy_mode(0); } static inline int __init check_vmi_rom(struct vrom_header *rom) @@ -690,9 +688,9 @@ do { \ reloc = call_vrom_long_func(vmi_rom, get_reloc, \ VMI_CALL_##vmicall); \ if (rel->type == VMI_RELOCATION_CALL_REL) \ - paravirt_ops.opname = (void *)rel->eip; \ + opname = (void *)rel->eip; \ else if (rel->type == VMI_RELOCATION_NOP) \ - paravirt_ops.opname = (void *)vmi_nop; \ + opname = (void *)vmi_nop; \ else if (rel->type != VMI_RELOCATION_NONE) \ printk(KERN_WARNING "VMI: Unknown relocation " \ "type %d for " #vmicall"\n",\ @@ -712,7 +710,7 @@ do { \ VMI_CALL_##vmicall); \ BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL); \ if (rel->type == VMI_RELOCATION_CALL_REL) { \ - paravirt_ops.opname = wrapper; \ + opname = wrapper; \ vmi_ops.cache = (void *)rel->eip; \ } \ } while (0) @@ -732,11 +730,11 @@ static inline int __init activate_vmi(void) } savesegment(cs, kernel_cs); - paravirt_ops.paravirt_enabled = 1; - paravirt_ops.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK; + pv_info.paravirt_enabled = 1; + pv_info.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK; + pv_info.name = "vmi"; - paravirt_ops.patch = vmi_patch; - paravirt_ops.name = "vmi"; + pv_init_ops.patch = vmi_patch; /* * Many of these operations are ABI compatible with VMI. 
@@ -754,26 +752,26 @@ static inline int __init activate_vmi(void) */ /* CPUID is special, so very special it gets wrapped like a present */ - para_wrap(cpuid, vmi_cpuid, cpuid, CPUID); - - para_fill(clts, CLTS); - para_fill(get_debugreg, GetDR); - para_fill(set_debugreg, SetDR); - para_fill(read_cr0, GetCR0); - para_fill(read_cr2, GetCR2); - para_fill(read_cr3, GetCR3); - para_fill(read_cr4, GetCR4); - para_fill(write_cr0, SetCR0); - para_fill(write_cr2, SetCR2); - para_fill(write_cr3, SetCR3); - para_fill(write_cr4, SetCR4); - para_fill(save_fl, GetInterruptMask); - para_fill(restore_fl, SetInterruptMask); - para_fill(irq_disable, DisableInterrupts); - para_fill(irq_enable, EnableInterrupts); - - para_fill(wbinvd, WBINVD); - para_fill(read_tsc, RDTSC); + para_wrap(pv_cpu_ops.cpuid, vmi_cpuid, cpuid, CPUID); + + para_fill(pv_cpu_ops.clts, CLTS); + para_fill(pv_cpu_ops.get_debugreg, GetDR); + para_fill(pv_cpu_ops.set_debugreg, SetDR); + para_fill(pv_cpu_ops.read_cr0, GetCR0); + para_fill(pv_mmu_ops.read_cr2, GetCR2); + para_fill(pv_mmu_ops.read_cr3, GetCR3); + para_fill(pv_cpu_ops.read_cr4, GetCR4); + para_fill(pv_cpu_ops.write_cr0, SetCR0); + para_fill(pv_mmu_ops.write_cr2, SetCR2); + para_fill(pv_mmu_ops.write_cr3, SetCR3); + para_fill(pv_cpu_ops.write_cr4, SetCR4); + para_fill(pv_irq_ops.save_fl, GetInterruptMask); + para_fill(pv_irq_ops.restore_fl, SetInterruptMask); + para_fill(pv_irq_ops.irq_disable, DisableInterrupts); + para_fill(pv_irq_ops.irq_enable, EnableInterrupts); + + para_fill(pv_cpu_ops.wbinvd, WBINVD); + para_fill(pv_cpu_ops.read_tsc, RDTSC); /* The following we emulate with trap and emulate for now */ /* paravirt_ops.read_msr = vmi_rdmsr */ @@ -781,29 +779,38 @@ static inline int __init activate_vmi(void) /* paravirt_ops.rdpmc = vmi_rdpmc */ /* TR interface doesn't pass TR value, wrap */ - para_wrap(load_tr_desc, vmi_set_tr, set_tr, SetTR); + para_wrap(pv_cpu_ops.load_tr_desc, vmi_set_tr, set_tr, SetTR); /* LDT is special, too */ - 
para_wrap(set_ldt, vmi_set_ldt, _set_ldt, SetLDT); - - para_fill(load_gdt, SetGDT); - para_fill(load_idt, SetIDT); - para_fill(store_gdt, GetGDT); - para_fill(store_idt, GetIDT); - para_fill(store_tr, GetTR); - paravirt_ops.load_tls = vmi_load_tls; - para_fill(write_ldt_entry, WriteLDTEntry); - para_fill(write_gdt_entry, WriteGDTEntry); - para_fill(write_idt_entry, WriteIDTEntry); - para_wrap(load_esp0, vmi_load_esp0, set_kernel_stack, UpdateKernelStack); - para_fill(set_iopl_mask, SetIOPLMask); - para_fill(io_delay, IODelay); - para_wrap(set_lazy_mode, vmi_set_lazy_mode, set_lazy_mode, SetLazyMode); + para_wrap(pv_cpu_ops.set_ldt, vmi_set_ldt, _set_ldt, SetLDT); + + para_fill(pv_cpu_ops.load_gdt, SetGDT); + para_fill(pv_cpu_ops.load_idt, SetIDT); + para_fill(pv_cpu_ops.store_gdt, GetGDT); + para_fill(pv_cpu_ops.store_idt, GetIDT); + para_fill(pv_cpu_ops.store_tr, GetTR); + pv_cpu_ops.load_tls = vmi_load_tls; + para_fill(pv_cpu_ops.write_ldt_entry, WriteLDTEntry); + para_fill(pv_cpu_ops.write_gdt_entry, WriteGDTEntry); + para_fill(pv_cpu_ops.write_idt_entry, WriteIDTEntry); + para_wrap(pv_cpu_ops.load_esp0, vmi_load_esp0, set_kernel_stack, UpdateKernelStack); + para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask); + para_fill(pv_cpu_ops.io_delay, IODelay); + + para_wrap(pv_cpu_ops.lazy_mode.enter, vmi_enter_lazy_cpu, + set_lazy_mode, SetLazyMode); + para_wrap(pv_cpu_ops.lazy_mode.leave, vmi_leave_lazy, + set_lazy_mode, SetLazyMode); + + para_wrap(pv_mmu_ops.lazy_mode.enter, vmi_enter_lazy_mmu, + set_lazy_mode, SetLazyMode); + para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy, + set_lazy_mode, SetLazyMode); /* user and kernel flush are just handled with different flags to FlushTLB */ - para_wrap(flush_tlb_user, vmi_flush_tlb_user, _flush_tlb, FlushTLB); - para_wrap(flush_tlb_kernel, vmi_flush_tlb_kernel, _flush_tlb, FlushTLB); - para_fill(flush_tlb_single, InvalPage); + para_wrap(pv_mmu_ops.flush_tlb_user, vmi_flush_tlb_user, _flush_tlb, FlushTLB); + 
para_wrap(pv_mmu_ops.flush_tlb_kernel, vmi_flush_tlb_kernel, _flush_tlb, FlushTLB); + para_fill(pv_mmu_ops.flush_tlb_single, InvalPage); /* * Until a standard flag format can be agreed on, we need to @@ -819,41 +826,41 @@ static inline int __init activate_vmi(void) #endif if (vmi_ops.set_pte) { - paravirt_ops.set_pte = vmi_set_pte; - paravirt_ops.set_pte_at = vmi_set_pte_at; - paravirt_ops.set_pmd = vmi_set_pmd; + pv_mmu_ops.set_pte = vmi_set_pte; + pv_mmu_ops.set_pte_at = vmi_set_pte_at; + pv_mmu_ops.set_pmd = vmi_set_pmd; #ifdef CONFIG_X86_PAE - paravirt_ops.set_pte_atomic = vmi_set_pte_atomic; - paravirt_ops.set_pte_present = vmi_set_pte_present; - paravirt_ops.set_pud = vmi_set_pud; - paravirt_ops.pte_clear = vmi_pte_clear; - paravirt_ops.pmd_clear = vmi_pmd_clear; + pv_mmu_ops.set_pte_atomic = vmi_set_pte_atomic; + pv_mmu_ops.set_pte_present = vmi_set_pte_present; + pv_mmu_ops.set_pud = vmi_set_pud; + pv_mmu_ops.pte_clear = vmi_pte_clear; + pv_mmu_ops.pmd_clear = vmi_pmd_clear; #endif } if (vmi_ops.update_pte) { - paravirt_ops.pte_update = vmi_update_pte; - paravirt_ops.pte_update_defer = vmi_update_pte_defer; + pv_mmu_ops.pte_update = vmi_update_pte; + pv_mmu_ops.pte_update_defer = vmi_update_pte_defer; } vmi_ops.allocate_page = vmi_get_function(VMI_CALL_AllocatePage); if (vmi_ops.allocate_page) { - paravirt_ops.alloc_pt = vmi_allocate_pt; - paravirt_ops.alloc_pd = vmi_allocate_pd; - paravirt_ops.alloc_pd_clone = vmi_allocate_pd_clone; + pv_mmu_ops.alloc_pt = vmi_allocate_pt; + pv_mmu_ops.alloc_pd = vmi_allocate_pd; + pv_mmu_ops.alloc_pd_clone = vmi_allocate_pd_clone; } vmi_ops.release_page = vmi_get_function(VMI_CALL_ReleasePage); if (vmi_ops.release_page) { - paravirt_ops.release_pt = vmi_release_pt; - paravirt_ops.release_pd = vmi_release_pd; + pv_mmu_ops.release_pt = vmi_release_pt; + pv_mmu_ops.release_pd = vmi_release_pd; } /* Set linear is needed in all cases */ vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping); #ifdef 
CONFIG_HIGHPTE if (vmi_ops.set_linear_mapping) - paravirt_ops.kmap_atomic_pte = vmi_kmap_atomic_pte; + pv_mmu_ops.kmap_atomic_pte = vmi_kmap_atomic_pte; #endif /* @@ -863,17 +870,17 @@ static inline int __init activate_vmi(void) * the backend. They are performance critical anyway, so requiring * a patch is not a big problem. */ - paravirt_ops.irq_enable_sysexit = (void *)0xfeedbab0; - paravirt_ops.iret = (void *)0xbadbab0; + pv_cpu_ops.irq_enable_sysexit = (void *)0xfeedbab0; + pv_cpu_ops.iret = (void *)0xbadbab0; #ifdef CONFIG_SMP - para_wrap(startup_ipi_hook, vmi_startup_ipi_hook, set_initial_ap_state, SetInitialAPState); + para_wrap(pv_apic_ops.startup_ipi_hook, vmi_startup_ipi_hook, set_initial_ap_state, SetInitialAPState); #endif #ifdef CONFIG_X86_LOCAL_APIC - para_fill(apic_read, APICRead); - para_fill(apic_write, APICWrite); - para_fill(apic_write_atomic, APICWrite); + para_fill(pv_apic_ops.apic_read, APICRead); + para_fill(pv_apic_ops.apic_write, APICWrite); + para_fill(pv_apic_ops.apic_write_atomic, APICWrite); #endif /* @@ -891,15 +898,15 @@ static inline int __init activate_vmi(void) vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm); vmi_timer_ops.cancel_alarm = vmi_get_function(VMI_CALL_CancelAlarm); - paravirt_ops.time_init = vmi_time_init; - paravirt_ops.get_wallclock = vmi_get_wallclock; - paravirt_ops.set_wallclock = vmi_set_wallclock; + pv_time_ops.time_init = vmi_time_init; + pv_time_ops.get_wallclock = vmi_get_wallclock; + pv_time_ops.set_wallclock = vmi_set_wallclock; #ifdef CONFIG_X86_LOCAL_APIC - paravirt_ops.setup_boot_clock = vmi_time_bsp_init; - paravirt_ops.setup_secondary_clock = vmi_time_ap_init; + pv_apic_ops.setup_boot_clock = vmi_time_bsp_init; + pv_apic_ops.setup_secondary_clock = vmi_time_ap_init; #endif - paravirt_ops.sched_clock = vmi_sched_clock; - paravirt_ops.get_cpu_khz = vmi_cpu_khz; + pv_time_ops.sched_clock = vmi_sched_clock; + pv_time_ops.get_cpu_khz = vmi_cpu_khz; /* We have true wallclock functions; disable 
CMOS clock sync */ no_sync_cmos_clock = 1; @@ -908,7 +915,7 @@ static inline int __init activate_vmi(void) disable_vmi_timer = 1; } - para_fill(safe_halt, Halt); + para_fill(pv_irq_ops.safe_halt, Halt); /* * Alternative instruction rewriting doesn't happen soon enough diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 93847d848157..585541ca1a7e 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -64,6 +64,16 @@ struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data = .sysctl_enabled = 1, }; +void update_vsyscall_tz(void) +{ + unsigned long flags; + + write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags); + /* sys_tz has changed */ + vsyscall_gtod_data.sys_tz = sys_tz; + write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); +} + void update_vsyscall(struct timespec *wall_time, struct clocksource *clock) { unsigned long flags; @@ -77,8 +87,6 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock) vsyscall_gtod_data.clock.shift = clock->shift; vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; - vsyscall_gtod_data.sys_tz = sys_tz; - vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic; write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); } @@ -164,7 +172,7 @@ time_t __vsyscall(1) vtime(time_t *t) if (unlikely(!__vsyscall_gtod_data.sysctl_enabled)) return time_syscall(t); - vgettimeofday(&tv, 0); + vgettimeofday(&tv, NULL); result = tv.tv_sec; if (t) *t = result; @@ -258,18 +266,10 @@ out: return ret; } -static int vsyscall_sysctl_nostrat(ctl_table *t, int __user *name, int nlen, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - return -ENOSYS; -} - static ctl_table kernel_table2[] = { - { .ctl_name = 99, .procname = "vsyscall64", + { .procname = "vsyscall64", .data = 
&vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int), .mode = 0644, - .strategy = vsyscall_sysctl_nostrat, .proc_handler = vsyscall_sysctl_change }, {} }; @@ -289,7 +289,7 @@ static void __cpuinit vsyscall_set_cpu(int cpu) unsigned long *d; unsigned long node = 0; #ifdef CONFIG_NUMA - node = cpu_to_node[cpu]; + node = cpu_to_node(cpu); #endif if (cpu_has(&cpu_data[cpu], X86_FEATURE_RDTSCP)) write_rdtscp_aux((node << 12) | cpu); diff --git a/arch/x86/lib/bitstr_64.c b/arch/x86/lib/bitstr_64.c index 24676609a6ac..7445caf1b5de 100644 --- a/arch/x86/lib/bitstr_64.c +++ b/arch/x86/lib/bitstr_64.c @@ -14,7 +14,7 @@ find_next_zero_string(unsigned long *bitmap, long start, long nbits, int len) /* could test bitsliced, but it's hardly worth it */ end = n+len; - if (end >= nbits) + if (end > nbits) return -1; for (i = n+1; i < end; i++) { if (test_bit(i, bitmap)) { diff --git a/arch/x86/lib/msr-on-cpu.c b/arch/x86/lib/msr-on-cpu.c index 7767962f25d3..57d043fa893e 100644 --- a/arch/x86/lib/msr-on-cpu.c +++ b/arch/x86/lib/msr-on-cpu.c @@ -26,27 +26,18 @@ static void __rdmsr_safe_on_cpu(void *info) static int _rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h, int safe) { int err = 0; - preempt_disable(); - if (smp_processor_id() == cpu) - if (safe) - err = rdmsr_safe(msr_no, l, h); - else - rdmsr(msr_no, *l, *h); - else { - struct msr_info rv; - - rv.msr_no = msr_no; - if (safe) { - smp_call_function_single(cpu, __rdmsr_safe_on_cpu, - &rv, 0, 1); - err = rv.err; - } else { - smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 0, 1); - } - *l = rv.l; - *h = rv.h; + struct msr_info rv; + + rv.msr_no = msr_no; + if (safe) { + smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 0, 1); + err = rv.err; + } else { + smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 0, 1); } - preempt_enable(); + *l = rv.l; + *h = rv.h; + return err; } @@ -67,27 +58,18 @@ static void __wrmsr_safe_on_cpu(void *info) static int _wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h, 
int safe) { int err = 0; - preempt_disable(); - if (smp_processor_id() == cpu) - if (safe) - err = wrmsr_safe(msr_no, l, h); - else - wrmsr(msr_no, l, h); - else { - struct msr_info rv; - - rv.msr_no = msr_no; - rv.l = l; - rv.h = h; - if (safe) { - smp_call_function_single(cpu, __wrmsr_safe_on_cpu, - &rv, 0, 1); - err = rv.err; - } else { - smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 0, 1); - } + struct msr_info rv; + + rv.msr_no = msr_no; + rv.l = l; + rv.h = h; + if (safe) { + smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 0, 1); + err = rv.err; + } else { + smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 0, 1); } - preempt_enable(); + return err; } diff --git a/arch/x86/lib/rwlock_64.S b/arch/x86/lib/rwlock_64.S index 0cde1f807314..05ea55f71405 100644 --- a/arch/x86/lib/rwlock_64.S +++ b/arch/x86/lib/rwlock_64.S @@ -2,7 +2,7 @@ #include <linux/linkage.h> #include <asm/rwlock.h> -#include <asm/alternative-asm.i> +#include <asm/alternative-asm.h> #include <asm/dwarf2.h> /* rdi: pointer to rwlock_t */ diff --git a/arch/x86/lib/semaphore_32.S b/arch/x86/lib/semaphore_32.S index c01eb39c0b43..444fba400983 100644 --- a/arch/x86/lib/semaphore_32.S +++ b/arch/x86/lib/semaphore_32.S @@ -15,8 +15,8 @@ #include <linux/linkage.h> #include <asm/rwlock.h> -#include <asm/alternative-asm.i> -#include <asm/frame.i> +#include <asm/alternative-asm.h> +#include <asm/frame.h> #include <asm/dwarf2.h> /* diff --git a/arch/x86/lib/string_32.c b/arch/x86/lib/string_32.c index 2c773fefa3dd..c2c0504a3071 100644 --- a/arch/x86/lib/string_32.c +++ b/arch/x86/lib/string_32.c @@ -160,26 +160,6 @@ char *strchr(const char * s, int c) EXPORT_SYMBOL(strchr); #endif -#ifdef __HAVE_ARCH_STRRCHR -char *strrchr(const char * s, int c) -{ - int d0, d1; - char * res; - asm volatile( "movb %%al,%%ah\n" - "1:\tlodsb\n\t" - "cmpb %%ah,%%al\n\t" - "jne 2f\n\t" - "leal -1(%%esi),%0\n" - "2:\ttestb %%al,%%al\n\t" - "jne 1b" - :"=g" (res), "=&S" (d0), "=&a" (d1) - :"0" (0),"1" (s),"2" (c) - 
:"memory"); - return res; -} -EXPORT_SYMBOL(strrchr); -#endif - #ifdef __HAVE_ARCH_STRLEN size_t strlen(const char * s) { diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S index 55e586d352d3..6ea73f3de567 100644 --- a/arch/x86/lib/thunk_64.S +++ b/arch/x86/lib/thunk_64.S @@ -50,6 +50,10 @@ thunk trace_hardirqs_on_thunk,trace_hardirqs_on thunk trace_hardirqs_off_thunk,trace_hardirqs_off #endif + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + thunk lockdep_sys_exit_thunk,lockdep_sys_exit +#endif /* SAVE_ARGS below is used only for the .cfi directives it contains. */ CFI_STARTPROC diff --git a/arch/x86/mach-default/setup.c b/arch/x86/mach-default/setup.c index 7f635c7a2381..3f08010f3517 100644 --- a/arch/x86/mach-default/setup.c +++ b/arch/x86/mach-default/setup.c @@ -35,7 +35,11 @@ void __init pre_intr_init_hook(void) /* * IRQ2 is cascade interrupt to second interrupt controller */ -static struct irqaction irq2 = { no_action, 0, CPU_MASK_NONE, "cascade", NULL, NULL}; +static struct irqaction irq2 = { + .handler = no_action, + .mask = CPU_MASK_NONE, + .name = "cascade", +}; /** * intr_init_hook - post gate setup interrupt initialisation @@ -159,16 +163,18 @@ char * __init machine_specific_memory_setup(void) * Otherwise fake a memory map; one section from 0k->640k, * the next section from 1mb->appropriate_mem_k */ - sanitize_e820_map(E820_MAP, &E820_MAP_NR); - if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0) { + sanitize_e820_map(boot_params.e820_map, &boot_params.e820_entries); + if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) + < 0) { unsigned long mem_size; /* compare results from other methods and take the greater */ - if (ALT_MEM_K < EXT_MEM_K) { - mem_size = EXT_MEM_K; + if (boot_params.alt_mem_k + < boot_params.screen_info.ext_mem_k) { + mem_size = boot_params.screen_info.ext_mem_k; who = "BIOS-88"; } else { - mem_size = ALT_MEM_K; + mem_size = boot_params.alt_mem_k; who = "BIOS-e801"; } diff --git a/arch/x86/mach-es7000/es7000plat.c 
b/arch/x86/mach-es7000/es7000plat.c index ab99072d3f9a..f5d6f7d8b86e 100644 --- a/arch/x86/mach-es7000/es7000plat.c +++ b/arch/x86/mach-es7000/es7000plat.c @@ -46,11 +46,11 @@ * ES7000 Globals */ -volatile unsigned long *psai = NULL; -struct mip_reg *mip_reg; -struct mip_reg *host_reg; -int mip_port; -unsigned long mip_addr, host_addr; +static volatile unsigned long *psai = NULL; +static struct mip_reg *mip_reg; +static struct mip_reg *host_reg; +static int mip_port; +static unsigned long mip_addr, host_addr; /* * GSI override for ES7000 platforms. @@ -288,28 +288,8 @@ es7000_start_cpu(int cpu, unsigned long eip) } -int -es7000_stop_cpu(int cpu) -{ - int startup; - - if (psai == NULL) - return -1; - - startup= (0x1000000 | cpu); - - while ((*psai & 0xff00ffff) != startup) - ; - - startup = (*psai & 0xff0000) >> 16; - *psai &= 0xffffff; - - return 0; - -} - void __init -es7000_sw_apic() +es7000_sw_apic(void) { if (es7000_plat) { int mip_status; diff --git a/arch/x86/mach-generic/probe.c b/arch/x86/mach-generic/probe.c index 74f3da634423..4121d1551800 100644 --- a/arch/x86/mach-generic/probe.c +++ b/arch/x86/mach-generic/probe.c @@ -22,7 +22,7 @@ extern struct genapic apic_default; struct genapic *genapic = &apic_default; -struct genapic *apic_probe[] __initdata = { +static struct genapic *apic_probe[] __initdata = { &apic_summit, &apic_bigsmp, &apic_es7000, diff --git a/arch/x86/mach-visws/setup.c b/arch/x86/mach-visws/setup.c index 1f81f10e03a0..de4c9dbd086f 100644 --- a/arch/x86/mach-visws/setup.c +++ b/arch/x86/mach-visws/setup.c @@ -152,7 +152,7 @@ char * __init machine_specific_memory_setup(void) { long long gfx_mem_size = 8 * MB; - mem_size = ALT_MEM_K; + mem_size = boot_params.alt_mem_k; if (!mem_size) { printk(KERN_WARNING "Bootloader didn't set memory size, upgrade it !\n"); diff --git a/arch/x86/mach-voyager/setup.c b/arch/x86/mach-voyager/setup.c index 2b55694e6400..3bef977cb29b 100644 --- a/arch/x86/mach-voyager/setup.c +++ 
b/arch/x86/mach-voyager/setup.c @@ -18,7 +18,11 @@ void __init pre_intr_init_hook(void) /* * IRQ2 is cascade interrupt to second interrupt controller */ -static struct irqaction irq2 = { no_action, 0, CPU_MASK_NONE, "cascade", NULL, NULL}; +static struct irqaction irq2 = { + .handler = no_action, + .mask = CPU_MASK_NONE, + .name = "cascade", +}; void __init intr_init_hook(void) { @@ -83,7 +87,7 @@ char * __init machine_specific_memory_setup(void) if(inb(catbase) != VOYAGER_DINO) { printk(KERN_ERR "Voyager: Failed to get DINO for L4, setting tom to EXT_MEM_K\n"); - tom = (EXT_MEM_K)<<10; + tom = (boot_params.screen_info.ext_mem_k)<<10; } who = "Voyager-TOM"; add_memory_region(0, 0x9f000, E820_RAM); @@ -104,16 +108,18 @@ char * __init machine_specific_memory_setup(void) * Otherwise fake a memory map; one section from 0k->640k, * the next section from 1mb->appropriate_mem_k */ - sanitize_e820_map(E820_MAP, &E820_MAP_NR); - if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0) { + sanitize_e820_map(boot_params.e820_map, &boot_params.e820_entries); + if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) + < 0) { unsigned long mem_size; /* compare results from other methods and take the greater */ - if (ALT_MEM_K < EXT_MEM_K) { - mem_size = EXT_MEM_K; + if (boot_params.alt_mem_k + < boot_params.screen_info.ext_mem_k) { + mem_size = boot_params.screen_info.ext_mem_k; who = "BIOS-88"; } else { - mem_size = ALT_MEM_K; + mem_size = boot_params.alt_mem_k; who = "BIOS-e801"; } diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c index b87f8548e75a..e4928aa6bdfb 100644 --- a/arch/x86/mach-voyager/voyager_smp.c +++ b/arch/x86/mach-voyager/voyager_smp.c @@ -442,8 +442,8 @@ static __u32 __init setup_trampoline(void) { /* these two are global symbols in trampoline.S */ - extern __u8 trampoline_end[]; - extern __u8 trampoline_data[]; + extern const __u8 trampoline_end[]; + extern const __u8 trampoline_data[]; memcpy((__u8 *)trampoline_base, 
trampoline_data, trampoline_end - trampoline_data); @@ -1037,6 +1037,7 @@ smp_call_function_interrupt(void) */ irq_enter(); (*func)(info); + __get_cpu_var(irq_stat).irq_call_count++; irq_exit(); if (wait) { mb(); diff --git a/arch/x86/math-emu/Makefile b/arch/x86/math-emu/Makefile index 9c943fa6ce6b..9b0c63b60302 100644 --- a/arch/x86/math-emu/Makefile +++ b/arch/x86/math-emu/Makefile @@ -5,8 +5,7 @@ #DEBUG = -DDEBUGGING DEBUG = PARANOID = -DPARANOID -CFLAGS := $(CFLAGS) $(PARANOID) $(DEBUG) -fno-builtin $(MATH_EMULATION) - +EXTRA_CFLAGS := $(PARANOID) $(DEBUG) -fno-builtin $(MATH_EMULATION) EXTRA_AFLAGS := $(PARANOID) # From 'C' language sources: diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 860e912a3fbb..13893772cc48 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -103,14 +103,14 @@ extern unsigned long highend_pfn, highstart_pfn; #define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) -unsigned long node_remap_start_pfn[MAX_NUMNODES]; +static unsigned long node_remap_start_pfn[MAX_NUMNODES]; unsigned long node_remap_size[MAX_NUMNODES]; -unsigned long node_remap_offset[MAX_NUMNODES]; -void *node_remap_start_vaddr[MAX_NUMNODES]; +static unsigned long node_remap_offset[MAX_NUMNODES]; +static void *node_remap_start_vaddr[MAX_NUMNODES]; void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); -void *node_remap_end_vaddr[MAX_NUMNODES]; -void *node_remap_alloc_vaddr[MAX_NUMNODES]; +static void *node_remap_end_vaddr[MAX_NUMNODES]; +static void *node_remap_alloc_vaddr[MAX_NUMNODES]; static unsigned long kva_start_pfn; static unsigned long kva_pages; /* @@ -288,8 +288,9 @@ unsigned long __init setup_memory(void) #ifdef CONFIG_BLK_DEV_INITRD /* Numa kva area is below the initrd */ - if (LOADER_TYPE && INITRD_START) - kva_start_pfn = PFN_DOWN(INITRD_START) - kva_pages; + if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) + kva_start_pfn = PFN_DOWN(boot_params.hdr.ramdisk_image) + - 
kva_pages; #endif kva_start_pfn -= kva_start_pfn & (PTRS_PER_PTE-1); diff --git a/arch/x86/mm/fault_32.c b/arch/x86/mm/fault_32.c index fcb38e7f3543..6555c3d14371 100644 --- a/arch/x86/mm/fault_32.c +++ b/arch/x86/mm/fault_32.c @@ -25,6 +25,7 @@ #include <linux/kprobes.h> #include <linux/uaccess.h> #include <linux/kdebug.h> +#include <linux/kprobes.h> #include <asm/system.h> #include <asm/desc.h> @@ -32,33 +33,27 @@ extern void die(const char *,struct pt_regs *,long); -static ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain); - -int register_page_fault_notifier(struct notifier_block *nb) +#ifdef CONFIG_KPROBES +static inline int notify_page_fault(struct pt_regs *regs) { - vmalloc_sync_all(); - return atomic_notifier_chain_register(&notify_page_fault_chain, nb); -} -EXPORT_SYMBOL_GPL(register_page_fault_notifier); + int ret = 0; + + /* kprobe_running() needs smp_processor_id() */ + if (!user_mode_vm(regs)) { + preempt_disable(); + if (kprobe_running() && kprobe_fault_handler(regs, 14)) + ret = 1; + preempt_enable(); + } -int unregister_page_fault_notifier(struct notifier_block *nb) -{ - return atomic_notifier_chain_unregister(&notify_page_fault_chain, nb); + return ret; } -EXPORT_SYMBOL_GPL(unregister_page_fault_notifier); - -static inline int notify_page_fault(struct pt_regs *regs, long err) +#else +static inline int notify_page_fault(struct pt_regs *regs) { - struct die_args args = { - .regs = regs, - .str = "page fault", - .err = err, - .trapnr = 14, - .signr = SIGSEGV - }; - return atomic_notifier_call_chain(&notify_page_fault_chain, - DIE_PAGE_FAULT, &args); + return 0; } +#endif /* * Return EIP plus the CS segment base. The segment limit is also @@ -110,7 +105,7 @@ static inline unsigned long get_segment_eip(struct pt_regs *regs, LDT and other horrors are only used in user space. */ if (seg & (1<<2)) { /* Must lock the LDT while reading it. 
*/ - down(&current->mm->context.sem); + mutex_lock(&current->mm->context.lock); desc = current->mm->context.ldt; desc = (void *)desc + (seg & ~7); } else { @@ -123,7 +118,7 @@ static inline unsigned long get_segment_eip(struct pt_regs *regs, base = get_desc_base((unsigned long *)desc); if (seg & (1<<2)) { - up(&current->mm->context.sem); + mutex_unlock(&current->mm->context.lock); } else put_cpu(); @@ -331,7 +326,7 @@ fastcall void __kprobes do_page_fault(struct pt_regs *regs, if (unlikely(address >= TASK_SIZE)) { if (!(error_code & 0x0000000d) && vmalloc_fault(address) >= 0) return; - if (notify_page_fault(regs, error_code) == NOTIFY_STOP) + if (notify_page_fault(regs)) return; /* * Don't take the mm semaphore here. If we fixup a prefetch @@ -340,7 +335,7 @@ fastcall void __kprobes do_page_fault(struct pt_regs *regs, goto bad_area_nosemaphore; } - if (notify_page_fault(regs, error_code) == NOTIFY_STOP) + if (notify_page_fault(regs)) return; /* It's safe to allow irq's after cr2 has been saved and the vmalloc @@ -544,23 +539,22 @@ no_context: printk(KERN_ALERT "BUG: unable to handle kernel paging" " request"); printk(" at virtual address %08lx\n",address); - printk(KERN_ALERT " printing eip:\n"); - printk("%08lx\n", regs->eip); + printk(KERN_ALERT "printing eip: %08lx ", regs->eip); page = read_cr3(); page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT]; #ifdef CONFIG_X86_PAE - printk(KERN_ALERT "*pdpt = %016Lx\n", page); + printk("*pdpt = %016Lx ", page); if ((page >> PAGE_SHIFT) < max_low_pfn && page & _PAGE_PRESENT) { page &= PAGE_MASK; page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT) & (PTRS_PER_PMD - 1)]; - printk(KERN_ALERT "*pde = %016Lx\n", page); + printk(KERN_ALERT "*pde = %016Lx ", page); page &= ~_PAGE_NX; } #else - printk(KERN_ALERT "*pde = %08lx\n", page); + printk("*pde = %08lx ", page); #endif /* @@ -574,8 +568,10 @@ no_context: page &= PAGE_MASK; page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)]; - 
printk(KERN_ALERT "*pte = %0*Lx\n", sizeof(page)*2, (u64)page); + printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page); } + + printk("\n"); } tsk->thread.cr2 = address; @@ -598,7 +594,7 @@ out_of_memory: } printk("VM: killing process %s\n", tsk->comm); if (error_code & 4) - do_exit(SIGKILL); + do_group_exit(SIGKILL); goto no_context; do_sigbus: diff --git a/arch/x86/mm/fault_64.c b/arch/x86/mm/fault_64.c index 54816adb8e93..5e0e54906c48 100644 --- a/arch/x86/mm/fault_64.c +++ b/arch/x86/mm/fault_64.c @@ -25,6 +25,7 @@ #include <linux/kprobes.h> #include <linux/uaccess.h> #include <linux/kdebug.h> +#include <linux/kprobes.h> #include <asm/system.h> #include <asm/pgalloc.h> @@ -40,34 +41,27 @@ #define PF_RSVD (1<<3) #define PF_INSTR (1<<4) -static ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain); - -/* Hook to register for page fault notifications */ -int register_page_fault_notifier(struct notifier_block *nb) +#ifdef CONFIG_KPROBES +static inline int notify_page_fault(struct pt_regs *regs) { - vmalloc_sync_all(); - return atomic_notifier_chain_register(&notify_page_fault_chain, nb); -} -EXPORT_SYMBOL_GPL(register_page_fault_notifier); + int ret = 0; + + /* kprobe_running() needs smp_processor_id() */ + if (!user_mode(regs)) { + preempt_disable(); + if (kprobe_running() && kprobe_fault_handler(regs, 14)) + ret = 1; + preempt_enable(); + } -int unregister_page_fault_notifier(struct notifier_block *nb) -{ - return atomic_notifier_chain_unregister(&notify_page_fault_chain, nb); + return ret; } -EXPORT_SYMBOL_GPL(unregister_page_fault_notifier); - -static inline int notify_page_fault(struct pt_regs *regs, long err) +#else +static inline int notify_page_fault(struct pt_regs *regs) { - struct die_args args = { - .regs = regs, - .str = "page fault", - .err = err, - .trapnr = 14, - .signr = SIGSEGV - }; - return atomic_notifier_call_chain(&notify_page_fault_chain, - DIE_PAGE_FAULT, &args); + return 0; } +#endif /* Sometimes the CPU reports invalid exceptions on prefetch. 
Check that here and ignore. @@ -345,7 +339,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, if (vmalloc_fault(address) >= 0) return; } - if (notify_page_fault(regs, error_code) == NOTIFY_STOP) + if (notify_page_fault(regs)) return; /* * Don't take the mm semaphore here. If we fixup a prefetch @@ -354,7 +348,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, goto bad_area_nosemaphore; } - if (notify_page_fault(regs, error_code) == NOTIFY_STOP) + if (notify_page_fault(regs)) return; if (likely(regs->eflags & X86_EFLAGS_IF)) diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 730a5b177b1f..c7d19471261d 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -85,13 +85,20 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd) static pte_t * __init one_page_table_init(pmd_t *pmd) { if (!(pmd_val(*pmd) & _PAGE_PRESENT)) { - pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE); + pte_t *page_table = NULL; + +#ifdef CONFIG_DEBUG_PAGEALLOC + page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE); +#endif + if (!page_table) + page_table = + (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE); paravirt_alloc_pt(&init_mm, __pa(page_table) >> PAGE_SHIFT); set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); BUG_ON(page_table != pte_offset_kernel(pmd, 0)); } - + return pte_offset_kernel(pmd, 0); } @@ -735,35 +742,18 @@ int arch_add_memory(int nid, u64 start, u64 size) return __add_pages(zone, start_pfn, nr_pages); } -int remove_memory(u64 start, u64 size) -{ - return -EINVAL; -} -EXPORT_SYMBOL_GPL(remove_memory); #endif struct kmem_cache *pmd_cache; void __init pgtable_cache_init(void) { - size_t pgd_size = PTRS_PER_PGD*sizeof(pgd_t); - - if (PTRS_PER_PMD > 1) { + if (PTRS_PER_PMD > 1) pmd_cache = kmem_cache_create("pmd", - PTRS_PER_PMD*sizeof(pmd_t), - PTRS_PER_PMD*sizeof(pmd_t), - SLAB_PANIC, - pmd_ctor); - if (!SHARED_KERNEL_PMD) { - /* If we're in PAE mode and have a non-shared - kernel pmd, then the pgd size must 
be a - page size. This is because the pgd_list - links through the page structure, so there - can only be one pgd per page for this to - work. */ - pgd_size = PAGE_SIZE; - } - } + PTRS_PER_PMD*sizeof(pmd_t), + PTRS_PER_PMD*sizeof(pmd_t), + SLAB_PANIC, + pmd_ctor); } /* diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 458893b376f8..1e3862e41065 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -474,12 +474,6 @@ error: } EXPORT_SYMBOL_GPL(arch_add_memory); -int remove_memory(u64 start, u64 size) -{ - return -EINVAL; -} -EXPORT_SYMBOL_GPL(remove_memory); - #if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA) int memory_add_physaddr_to_nid(u64 start) { @@ -748,3 +742,48 @@ const char *arch_vma_name(struct vm_area_struct *vma) return "[vsyscall]"; return NULL; } + +#ifdef CONFIG_SPARSEMEM_VMEMMAP +/* + * Initialise the sparsemem vmemmap using huge-pages at the PMD level. + */ +int __meminit vmemmap_populate(struct page *start_page, + unsigned long size, int node) +{ + unsigned long addr = (unsigned long)start_page; + unsigned long end = (unsigned long)(start_page + size); + unsigned long next; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + + for (; addr < end; addr = next) { + next = pmd_addr_end(addr, end); + + pgd = vmemmap_pgd_populate(addr, node); + if (!pgd) + return -ENOMEM; + pud = vmemmap_pud_populate(pgd, addr, node); + if (!pud) + return -ENOMEM; + + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) { + pte_t entry; + void *p = vmemmap_alloc_block(PMD_SIZE, node); + if (!p) + return -ENOMEM; + + entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); + mk_pte_huge(entry); + set_pmd(pmd, __pmd(pte_val(entry))); + + printk(KERN_DEBUG " [%lx-%lx] PMD ->%p on node %d\n", + addr, addr + PMD_SIZE - 1, p, node); + } else + vmemmap_verify((pte_t *)pmd, node, addr, next); + } + + return 0; +} +#endif diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 6da235522269..5eec5e56d07f 100644 --- a/arch/x86/mm/numa_64.c +++ 
b/arch/x86/mm/numa_64.c @@ -166,7 +166,7 @@ early_node_mem(int nodeid, unsigned long start, unsigned long end, return __va(mem); ptr = __alloc_bootmem_nopanic(size, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)); - if (ptr == 0) { + if (ptr == NULL) { printk(KERN_ERR "Cannot find %lu bytes in node %d\n", size, nodeid); return NULL; @@ -261,7 +261,7 @@ void __init numa_init_array(void) We round robin the existing nodes. */ rr = first_node(node_online_map); for (i = 0; i < NR_CPUS; i++) { - if (cpu_to_node[i] != NUMA_NO_NODE) + if (cpu_to_node(i) != NUMA_NO_NODE) continue; numa_set_node(i, rr); rr = next_node(rr, node_online_map); @@ -543,7 +543,7 @@ __cpuinit void numa_add_cpu(int cpu) void __cpuinit numa_set_node(int cpu, int node) { cpu_pda(cpu)->nodenumber = node; - cpu_to_node[cpu] = node; + cpu_to_node(cpu) = node; } unsigned long __init numa_free_all_bootmem(void) diff --git a/arch/x86/mm/pageattr_32.c b/arch/x86/mm/pageattr_32.c index 4241a74d16c8..260073c07600 100644 --- a/arch/x86/mm/pageattr_32.c +++ b/arch/x86/mm/pageattr_32.c @@ -70,10 +70,10 @@ static struct page *split_large_page(unsigned long address, pgprot_t prot, static void cache_flush_page(struct page *p) { - unsigned long adr = (unsigned long)page_address(p); + void *adr = page_address(p); int i; for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size) - asm volatile("clflush (%0)" :: "r" (adr + i)); + clflush(adr+i); } static void flush_kernel_map(void *arg) diff --git a/arch/x86/mm/pageattr_64.c b/arch/x86/mm/pageattr_64.c index 10b9809ce821..8a4f65bf956e 100644 --- a/arch/x86/mm/pageattr_64.c +++ b/arch/x86/mm/pageattr_64.c @@ -65,7 +65,7 @@ static void cache_flush_page(void *adr) { int i; for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size) - asm volatile("clflush (%0)" :: "r" (adr + i)); + clflush(adr+i); } static void flush_kernel_map(void *arg) @@ -148,6 +148,7 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot, split = split_large_page(address, prot, 
ref_prot2); if (!split) return -ENOMEM; + pgprot_val(ref_prot2) &= ~_PAGE_NX; set_pte(kpte, mk_pte(split, ref_prot2)); kpte_page = split; } diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index 01437c46baae..be61a1d845a4 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -6,6 +6,7 @@ #include <linux/kernel.h> #include <linux/errno.h> #include <linux/mm.h> +#include <linux/nmi.h> #include <linux/swap.h> #include <linux/smp.h> #include <linux/highmem.h> @@ -39,6 +40,8 @@ void show_mem(void) for_each_online_pgdat(pgdat) { pgdat_resize_lock(pgdat, &flags); for (i = 0; i < pgdat->node_spanned_pages; ++i) { + if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) + touch_nmi_watchdog(); page = pgdat_page_nr(pgdat, i); total++; if (PageHighMem(page)) @@ -97,8 +100,7 @@ static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags) } pte = pte_offset_kernel(pmd, vaddr); if (pgprot_val(flags)) - /* <pfn,flags> stored as-is, to permit clearing entries */ - set_pte(pte, pfn_pte(pfn, flags)); + set_pte_present(&init_mm, vaddr, pte, pfn_pte(pfn, flags)); else pte_clear(&init_mm, vaddr, pte); @@ -193,7 +195,7 @@ struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) return pte; } -void pmd_ctor(void *pmd, struct kmem_cache *cache, unsigned long flags) +void pmd_ctor(struct kmem_cache *cache, void *pmd) { memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t)); } diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index acdf03e19146..56089ccc3949 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -431,9 +431,9 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) setup_node_bootmem(i, nodes[i].start, nodes[i].end); for (i = 0; i < NR_CPUS; i++) { - if (cpu_to_node[i] == NUMA_NO_NODE) + if (cpu_to_node(i) == NUMA_NO_NODE) continue; - if (!node_isset(cpu_to_node[i], node_possible_map)) + if (!node_isset(cpu_to_node(i), node_possible_map)) numa_set_node(i, NUMA_NO_NODE); } numa_init_array(); diff 
--git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 11b7a51566a8..2d0eeac7251f 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -269,7 +269,6 @@ static void nmi_cpu_shutdown(void * dummy) apic_write(APIC_LVTPC, saved_lvtpc[cpu]); apic_write(APIC_LVTERR, v); nmi_restore_registers(msrs); - model->shutdown(msrs); } @@ -278,6 +277,7 @@ static void nmi_shutdown(void) nmi_enabled = 0; on_each_cpu(nmi_cpu_shutdown, NULL, 0, 1); unregister_die_notifier(&profile_exceptions_nb); + model->shutdown(cpu_msrs); free_msrs(); } diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c index 47925927b12f..56b4757a1f47 100644 --- a/arch/x86/oprofile/op_model_p4.c +++ b/arch/x86/oprofile/op_model_p4.c @@ -379,7 +379,7 @@ static unsigned int get_stagger(void) { #ifdef CONFIG_SMP int cpu = smp_processor_id(); - return (cpu != first_cpu(cpu_sibling_map[cpu])); + return (cpu != first_cpu(per_cpu(cpu_sibling_map, cpu))); #endif return 0; } diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 2d71bbc411d2..f4386990b150 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -289,6 +289,22 @@ static struct dmi_system_id __devinitdata pciprobe_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant BL685c G1"), }, }, + { + .callback = set_bf_sort, + .ident = "HP ProLiant DL385 G2", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "HP"), + DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant DL385 G2"), + }, + }, + { + .callback = set_bf_sort, + .ident = "HP ProLiant DL585 G2", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "HP"), + DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant DL585 G2"), + }, + }, #ifdef __i386__ { .callback = assign_all_busses, diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile index 8d03de029d9b..7a2ba4583939 100644 --- a/arch/x86/vdso/Makefile +++ b/arch/x86/vdso/Makefile @@ -13,7 +13,7 @@ vobjs := $(foreach F,$(vobjs-y),$(obj)/$F) $(obj)/vdso.o: $(obj)/vdso.so -targets += vdso.so vdso.lds $(vobjs-y) 
vdso-syms.o +targets += vdso.so vdso.so.dbg vdso.lds $(vobjs-y) vdso-syms.o # The DSO images are built using a special linker script. quiet_cmd_syscall = SYSCALL $@ @@ -26,16 +26,23 @@ vdso-flags = -fPIC -shared -Wl,-soname=linux-vdso.so.1 \ $(call ld-option, -Wl$(comma)--hash-style=sysv) \ -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096 SYSCFLAGS_vdso.so = $(vdso-flags) +SYSCFLAGS_vdso.so.dbg = $(vdso-flags) $(obj)/vdso.o: $(src)/vdso.S $(obj)/vdso.so $(obj)/vdso.so: $(src)/vdso.lds $(vobjs) FORCE + +$(obj)/vdso.so.dbg: $(src)/vdso.lds $(vobjs) FORCE $(call if_changed,syscall) +$(obj)/%.so: OBJCOPYFLAGS := -S +$(obj)/%.so: $(obj)/%.so.dbg FORCE + $(call if_changed,objcopy) + CFL := $(PROFILING) -mcmodel=small -fPIC -g0 -O2 -fasynchronous-unwind-tables -m64 -$(obj)/vclock_gettime.o: CFLAGS = $(CFL) -$(obj)/vgetcpu.o: CFLAGS = $(CFL) +$(obj)/vclock_gettime.o: KBUILD_CFLAGS = $(CFL) +$(obj)/vgetcpu.o: KBUILD_CFLAGS = $(CFL) # We also create a special relocatable object that should mirror the symbol # table and layout of the linked DSO. With ld -R we can then refer to @@ -47,3 +54,11 @@ $(obj)/built-in.o: ld_flags += -R $(obj)/vdso-syms.o SYSCFLAGS_vdso-syms.o = -r -d $(obj)/vdso-syms.o: $(src)/vdso.lds $(vobjs) FORCE $(call if_changed,syscall) + +quiet_cmd_vdso_install = INSTALL $@ + cmd_vdso_install = cp $(obj)/$@.dbg $(MODLIB)/vdso/$@ +vdso.so: + @mkdir -p $(MODLIB)/vdso + $(call cmd,vdso_install) + +vdso_install: vdso.so diff --git a/arch/x86/vdso/vdso.lds.S b/arch/x86/vdso/vdso.lds.S index b9a60e665d08..667d3245d972 100644 --- a/arch/x86/vdso/vdso.lds.S +++ b/arch/x86/vdso/vdso.lds.S @@ -26,13 +26,16 @@ SECTIONS is insufficient, ld -shared will barf. Just increase it here. */ . = VDSO_PRELINK + VDSO_TEXT_OFFSET; - .text : { *(.text) } :text - .text.ptr : { *(.text.ptr) } :text - . 
= VDSO_PRELINK + 0x900; - .data : { *(.data) } :text - .bss : { *(.bss) } :text + .text : { *(.text*) } :text + .rodata : { *(.rodata*) } :text + .data : { + *(.data*) + *(.sdata*) + *(.bss*) + *(.dynbss*) + } :text - .altinstructions : { *(.altinstructions) } :text + .altinstructions : { *(.altinstructions) } :text .altinstr_replacement : { *(.altinstr_replacement) } :text .note : { *(.note.*) } :text :note @@ -42,7 +45,6 @@ SECTIONS .useless : { *(.got.plt) *(.got) *(.gnu.linkonce.d.*) - *(.dynbss) *(.gnu.linkonce.b.*) } :text } diff --git a/arch/x86/vdso/vvar.c b/arch/x86/vdso/vvar.c index 6fc22219a472..1b7e703684f9 100644 --- a/arch/x86/vdso/vvar.c +++ b/arch/x86/vdso/vvar.c @@ -8,5 +8,5 @@ #include <asm/timex.h> #include <asm/vgtod.h> -#define VEXTERN(x) typeof (__ ## x) *vdso_ ## x = (void *)VMAGIC; +#define VEXTERN(x) typeof (__ ## x) *const vdso_ ## x = (void *)VMAGIC; #include "vextern.h" diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index f01bfcd4bdee..94c39aaf695f 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -25,7 +25,6 @@ #include <linux/mm.h> #include <linux/page-flags.h> #include <linux/highmem.h> -#include <linux/smp.h> #include <xen/interface/xen.h> #include <xen/interface/physdev.h> @@ -52,11 +51,25 @@ EXPORT_SYMBOL_GPL(hypercall_page); -DEFINE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode); - DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu); DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); -DEFINE_PER_CPU(unsigned long, xen_cr3); + +/* + * Note about cr3 (pagetable base) values: + * + * xen_cr3 contains the current logical cr3 value; it contains the + * last set cr3. This may not be the current effective cr3, because + * its update may be being lazily deferred. However, a vcpu looking + * at its own cr3 can use this value knowing that it everything will + * be self-consistent. 
+ * + * xen_current_cr3 contains the actual vcpu cr3; it is set once the + * hypercall to set the vcpu cr3 is complete (so it may be a little + * out of date, but it will never be set early). If one vcpu is + * looking at another vcpu's cr3 value, it should use this variable. + */ +DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */ +DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ struct start_info *xen_start_info; EXPORT_SYMBOL_GPL(xen_start_info); @@ -100,7 +113,7 @@ static void __init xen_vcpu_setup(int cpu) info.mfn = virt_to_mfn(vcpup); info.offset = offset_in_page(vcpup); - printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %x, offset %d\n", + printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n", cpu, vcpup, info.mfn, info.offset); /* Check to see if the hypervisor will put the vcpu_info @@ -124,7 +137,7 @@ static void __init xen_vcpu_setup(int cpu) static void __init xen_banner(void) { printk(KERN_INFO "Booting paravirtualized kernel on %s\n", - paravirt_ops.name); + pv_info.name); printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic); } @@ -249,29 +262,10 @@ static void xen_halt(void) xen_safe_halt(); } -static void xen_set_lazy_mode(enum paravirt_lazy_mode mode) +static void xen_leave_lazy(void) { - BUG_ON(preemptible()); - - switch (mode) { - case PARAVIRT_LAZY_NONE: - BUG_ON(x86_read_percpu(xen_lazy_mode) == PARAVIRT_LAZY_NONE); - break; - - case PARAVIRT_LAZY_MMU: - case PARAVIRT_LAZY_CPU: - BUG_ON(x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE); - break; - - case PARAVIRT_LAZY_FLUSH: - /* flush if necessary, but don't change state */ - if (x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE) - xen_mc_flush(); - return; - } - + paravirt_leave_lazy(paravirt_get_lazy_mode()); xen_mc_flush(); - x86_write_percpu(xen_lazy_mode, mode); } static unsigned long xen_store_tr(void) @@ -358,7 +352,7 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu) * 
loaded properly. This will go away as soon as Xen has been * modified to not save/restore %gs for normal hypercalls. */ - if (xen_get_lazy_mode() == PARAVIRT_LAZY_CPU) + if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) loadsegment(gs, 0); } @@ -632,32 +626,36 @@ static unsigned long xen_read_cr3(void) return x86_read_percpu(xen_cr3); } +static void set_current_cr3(void *v) +{ + x86_write_percpu(xen_current_cr3, (unsigned long)v); +} + static void xen_write_cr3(unsigned long cr3) { + struct mmuext_op *op; + struct multicall_space mcs; + unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3)); + BUG_ON(preemptible()); - if (cr3 == x86_read_percpu(xen_cr3)) { - /* just a simple tlb flush */ - xen_flush_tlb(); - return; - } + mcs = xen_mc_entry(sizeof(*op)); /* disables interrupts */ + /* Update while interrupts are disabled, so its atomic with + respect to ipis */ x86_write_percpu(xen_cr3, cr3); + op = mcs.args; + op->cmd = MMUEXT_NEW_BASEPTR; + op->arg1.mfn = mfn; - { - struct mmuext_op *op; - struct multicall_space mcs = xen_mc_entry(sizeof(*op)); - unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3)); - - op = mcs.args; - op->cmd = MMUEXT_NEW_BASEPTR; - op->arg1.mfn = mfn; + MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); - MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + /* Update xen_update_cr3 once the batch has actually + been submitted. 
*/ + xen_mc_callback(set_current_cr3, (void *)cr3); - xen_mc_issue(PARAVIRT_LAZY_CPU); - } + xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ } /* Early in boot, while setting up the initial pagetable, assume @@ -668,6 +666,15 @@ static __init void xen_alloc_pt_init(struct mm_struct *mm, u32 pfn) make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); } +static void pin_pagetable_pfn(unsigned level, unsigned long pfn) +{ + struct mmuext_op op; + op.cmd = level; + op.arg1.mfn = pfn_to_mfn(pfn); + if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) + BUG(); +} + /* This needs to make sure the new pte page is pinned iff its being attached to a pinned pagetable. */ static void xen_alloc_pt(struct mm_struct *mm, u32 pfn) @@ -677,9 +684,10 @@ static void xen_alloc_pt(struct mm_struct *mm, u32 pfn) if (PagePinned(virt_to_page(mm->pgd))) { SetPagePinned(page); - if (!PageHighMem(page)) + if (!PageHighMem(page)) { make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); - else + pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); + } else /* make sure there are no stray mappings of this page */ kmap_flush_unused(); @@ -692,8 +700,10 @@ static void xen_release_pt(u32 pfn) struct page *page = pfn_to_page(pfn); if (PagePinned(page)) { - if (!PageHighMem(page)) + if (!PageHighMem(page)) { + pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); + } } } @@ -738,7 +748,7 @@ static __init void xen_pagetable_setup_start(pgd_t *base) pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base; /* special set_pte for pagetable initialization */ - paravirt_ops.set_pte = xen_set_pte_init; + pv_mmu_ops.set_pte = xen_set_pte_init; init_mm.pgd = base; /* @@ -785,8 +795,8 @@ static __init void xen_pagetable_setup_done(pgd_t *base) { /* This will work as long as patching hasn't happened yet (which it hasn't) */ - paravirt_ops.alloc_pt = xen_alloc_pt; - paravirt_ops.set_pte = xen_set_pte; + pv_mmu_ops.alloc_pt = xen_alloc_pt; + pv_mmu_ops.set_pte = xen_set_pte; if 
(!xen_feature(XENFEAT_auto_translated_physmap)) { /* @@ -808,15 +818,15 @@ static __init void xen_pagetable_setup_done(pgd_t *base) /* Actually pin the pagetable down, but we can't set PG_pinned yet because the page structures don't exist yet. */ { - struct mmuext_op op; + unsigned level; + #ifdef CONFIG_X86_PAE - op.cmd = MMUEXT_PIN_L3_TABLE; + level = MMUEXT_PIN_L3_TABLE; #else - op.cmd = MMUEXT_PIN_L3_TABLE; + level = MMUEXT_PIN_L2_TABLE; #endif - op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(base))); - if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) - BUG(); + + pin_pagetable_pfn(level, PFN_DOWN(__pa(base))); } } @@ -833,12 +843,12 @@ void __init xen_setup_vcpu_info_placement(void) if (have_vcpu_info_placement) { printk(KERN_INFO "Xen: using vcpu_info placement\n"); - paravirt_ops.save_fl = xen_save_fl_direct; - paravirt_ops.restore_fl = xen_restore_fl_direct; - paravirt_ops.irq_disable = xen_irq_disable_direct; - paravirt_ops.irq_enable = xen_irq_enable_direct; - paravirt_ops.read_cr2 = xen_read_cr2_direct; - paravirt_ops.iret = xen_iret_direct; + pv_irq_ops.save_fl = xen_save_fl_direct; + pv_irq_ops.restore_fl = xen_restore_fl_direct; + pv_irq_ops.irq_disable = xen_irq_disable_direct; + pv_irq_ops.irq_enable = xen_irq_enable_direct; + pv_mmu_ops.read_cr2 = xen_read_cr2_direct; + pv_cpu_ops.iret = xen_iret_direct; } } @@ -850,8 +860,8 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, start = end = reloc = NULL; -#define SITE(x) \ - case PARAVIRT_PATCH(x): \ +#define SITE(op, x) \ + case PARAVIRT_PATCH(op.x): \ if (have_vcpu_info_placement) { \ start = (char *)xen_##x##_direct; \ end = xen_##x##_direct_end; \ @@ -860,10 +870,10 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, goto patch_site switch (type) { - SITE(irq_enable); - SITE(irq_disable); - SITE(save_fl); - SITE(restore_fl); + SITE(pv_irq_ops, irq_enable); + SITE(pv_irq_ops, irq_disable); + SITE(pv_irq_ops, save_fl); + SITE(pv_irq_ops, restore_fl); #undef SITE 
patch_site: @@ -895,26 +905,32 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, return ret; } -static const struct paravirt_ops xen_paravirt_ops __initdata = { +static const struct pv_info xen_info __initdata = { .paravirt_enabled = 1, .shared_kernel_pmd = 0, .name = "Xen", - .banner = xen_banner, +}; +static const struct pv_init_ops xen_init_ops __initdata = { .patch = xen_patch, + .banner = xen_banner, .memory_setup = xen_memory_setup, .arch_setup = xen_arch_setup, - .init_IRQ = xen_init_IRQ, .post_allocator_init = xen_mark_init_mm_pinned, +}; +static const struct pv_time_ops xen_time_ops __initdata = { .time_init = xen_time_init, + .set_wallclock = xen_set_wallclock, .get_wallclock = xen_get_wallclock, .get_cpu_khz = xen_cpu_khz, .sched_clock = xen_sched_clock, +}; +static const struct pv_cpu_ops xen_cpu_ops __initdata = { .cpuid = xen_cpuid, .set_debugreg = xen_set_debugreg, @@ -925,22 +941,10 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = { .read_cr0 = native_read_cr0, .write_cr0 = native_write_cr0, - .read_cr2 = xen_read_cr2, - .write_cr2 = xen_write_cr2, - - .read_cr3 = xen_read_cr3, - .write_cr3 = xen_write_cr3, - .read_cr4 = native_read_cr4, .read_cr4_safe = native_read_cr4_safe, .write_cr4 = xen_write_cr4, - .save_fl = xen_save_fl, - .restore_fl = xen_restore_fl, - .irq_disable = xen_irq_disable, - .irq_enable = xen_irq_enable, - .safe_halt = xen_safe_halt, - .halt = xen_halt, .wbinvd = native_wbinvd, .read_msr = native_read_msr_safe, @@ -969,6 +973,23 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = { .set_iopl_mask = xen_set_iopl_mask, .io_delay = xen_io_delay, + .lazy_mode = { + .enter = paravirt_enter_lazy_cpu, + .leave = xen_leave_lazy, + }, +}; + +static const struct pv_irq_ops xen_irq_ops __initdata = { + .init_IRQ = xen_init_IRQ, + .save_fl = xen_save_fl, + .restore_fl = xen_restore_fl, + .irq_disable = xen_irq_disable, + .irq_enable = xen_irq_enable, + .safe_halt = xen_safe_halt, + .halt = 
xen_halt, +}; + +static const struct pv_apic_ops xen_apic_ops __initdata = { #ifdef CONFIG_X86_LOCAL_APIC .apic_write = xen_apic_write, .apic_write_atomic = xen_apic_write, @@ -977,6 +998,17 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = { .setup_secondary_clock = paravirt_nop, .startup_ipi_hook = paravirt_nop, #endif +}; + +static const struct pv_mmu_ops xen_mmu_ops __initdata = { + .pagetable_setup_start = xen_pagetable_setup_start, + .pagetable_setup_done = xen_pagetable_setup_done, + + .read_cr2 = xen_read_cr2, + .write_cr2 = xen_write_cr2, + + .read_cr3 = xen_read_cr3, + .write_cr3 = xen_write_cr3, .flush_tlb_user = xen_flush_tlb, .flush_tlb_kernel = xen_flush_tlb, @@ -986,9 +1018,6 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = { .pte_update = paravirt_nop, .pte_update_defer = paravirt_nop, - .pagetable_setup_start = xen_pagetable_setup_start, - .pagetable_setup_done = xen_pagetable_setup_done, - .alloc_pt = xen_alloc_pt_init, .release_pt = xen_release_pt, .alloc_pd = paravirt_nop, @@ -1024,7 +1053,10 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = { .dup_mmap = xen_dup_mmap, .exit_mmap = xen_exit_mmap, - .set_lazy_mode = xen_set_lazy_mode, + .lazy_mode = { + .enter = paravirt_enter_lazy_mmu, + .leave = xen_leave_lazy, + }, }; #ifdef CONFIG_SMP @@ -1080,6 +1112,17 @@ static const struct machine_ops __initdata xen_machine_ops = { }; +static void __init xen_reserve_top(void) +{ + unsigned long top = HYPERVISOR_VIRT_START; + struct xen_platform_parameters pp; + + if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0) + top = pp.virt_start; + + reserve_top_address(-top + 2 * PAGE_SIZE); +} + /* First C function to be called on Xen boot */ asmlinkage void __init xen_start_kernel(void) { @@ -1091,7 +1134,14 @@ asmlinkage void __init xen_start_kernel(void) BUG_ON(memcmp(xen_start_info->magic, "xen-3.0", 7) != 0); /* Install Xen paravirt ops */ - paravirt_ops = xen_paravirt_ops; + pv_info = xen_info; 
+ pv_init_ops = xen_init_ops; + pv_time_ops = xen_time_ops; + pv_cpu_ops = xen_cpu_ops; + pv_irq_ops = xen_irq_ops; + pv_apic_ops = xen_apic_ops; + pv_mmu_ops = xen_mmu_ops; + machine_ops = xen_machine_ops; #ifdef CONFIG_SMP @@ -1113,6 +1163,7 @@ asmlinkage void __init xen_start_kernel(void) /* keep using Xen gdt for now; no urgent need to change it */ x86_write_percpu(xen_cr3, __pa(pgd)); + x86_write_percpu(xen_current_cr3, __pa(pgd)); #ifdef CONFIG_SMP /* Don't do the full vcpu_info placement stuff until we have a @@ -1124,12 +1175,12 @@ asmlinkage void __init xen_start_kernel(void) xen_setup_vcpu_info_placement(); #endif - paravirt_ops.kernel_rpl = 1; + pv_info.kernel_rpl = 1; if (xen_feature(XENFEAT_supervisor_mode_kernel)) - paravirt_ops.kernel_rpl = 0; + pv_info.kernel_rpl = 0; /* set the limit of our address space */ - reserve_top_address(-HYPERVISOR_VIRT_START + 2 * PAGE_SIZE); + xen_reserve_top(); /* set up basic CPUID stuff */ cpu_detect(&new_cpu_data); @@ -1137,9 +1188,10 @@ asmlinkage void __init xen_start_kernel(void) new_cpu_data.x86_capability[0] = cpuid_edx(1); /* Poke various useful things into boot_params */ - LOADER_TYPE = (9 << 4) | 0; - INITRD_START = xen_start_info->mod_start ? __pa(xen_start_info->mod_start) : 0; - INITRD_SIZE = xen_start_info->mod_len; + boot_params.hdr.type_of_loader = (9 << 4) | 0; + boot_params.hdr.ramdisk_image = xen_start_info->mod_start + ? 
__pa(xen_start_info->mod_start) : 0; + boot_params.hdr.ramdisk_size = xen_start_info->mod_len; /* Start the world */ start_kernel(); diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 874db0cd1d2a..b2e32f9d0071 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -41,7 +41,6 @@ #include <linux/sched.h> #include <linux/highmem.h> #include <linux/bug.h> -#include <linux/sched.h> #include <asm/pgtable.h> #include <asm/tlbflush.h> @@ -155,7 +154,7 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval) { if (mm == current->mm || mm == &init_mm) { - if (xen_get_lazy_mode() == PARAVIRT_LAZY_MMU) { + if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { struct multicall_space mcs; mcs = xen_mc_entry(0); @@ -304,7 +303,12 @@ pgd_t xen_make_pgd(unsigned long pgd) } #endif /* CONFIG_X86_PAE */ - +enum pt_level { + PT_PGD, + PT_PUD, + PT_PMD, + PT_PTE +}; /* (Yet another) pagetable walker. This one is intended for pinning a @@ -316,7 +320,7 @@ pgd_t xen_make_pgd(unsigned long pgd) FIXADDR_TOP. But the important bit is that we don't pin beyond there, because then we start getting into Xen's ptes. 
*/ -static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned), +static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, enum pt_level), unsigned long limit) { pgd_t *pgd = pgd_base; @@ -341,7 +345,7 @@ static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned), pud = pud_offset(pgd, 0); if (PTRS_PER_PUD > 1) /* not folded */ - flush |= (*func)(virt_to_page(pud), 0); + flush |= (*func)(virt_to_page(pud), PT_PUD); for (; addr != pud_limit; pud++, addr = pud_next) { pmd_t *pmd; @@ -360,7 +364,7 @@ static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned), pmd = pmd_offset(pud, 0); if (PTRS_PER_PMD > 1) /* not folded */ - flush |= (*func)(virt_to_page(pmd), 0); + flush |= (*func)(virt_to_page(pmd), PT_PMD); for (; addr != pmd_limit; pmd++) { addr += (PAGE_SIZE * PTRS_PER_PTE); @@ -372,17 +376,47 @@ static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned), if (pmd_none(*pmd)) continue; - flush |= (*func)(pmd_page(*pmd), 0); + flush |= (*func)(pmd_page(*pmd), PT_PTE); } } } - flush |= (*func)(virt_to_page(pgd_base), UVMF_TLB_FLUSH); + flush |= (*func)(virt_to_page(pgd_base), PT_PGD); return flush; } -static int pin_page(struct page *page, unsigned flags) +static spinlock_t *lock_pte(struct page *page) +{ + spinlock_t *ptl = NULL; + +#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS + ptl = __pte_lockptr(page); + spin_lock(ptl); +#endif + + return ptl; +} + +static void do_unlock(void *v) +{ + spinlock_t *ptl = v; + spin_unlock(ptl); +} + +static void xen_do_pin(unsigned level, unsigned long pfn) +{ + struct mmuext_op *op; + struct multicall_space mcs; + + mcs = __xen_mc_entry(sizeof(*op)); + op = mcs.args; + op->cmd = level; + op->arg1.mfn = pfn_to_mfn(pfn); + MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); +} + +static int pin_page(struct page *page, enum pt_level level) { unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags); int flush; @@ -397,12 +431,26 @@ static int pin_page(struct page *page, 
unsigned flags) void *pt = lowmem_page_address(page); unsigned long pfn = page_to_pfn(page); struct multicall_space mcs = __xen_mc_entry(0); + spinlock_t *ptl; flush = 0; + ptl = NULL; + if (level == PT_PTE) + ptl = lock_pte(page); + MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, pfn_pte(pfn, PAGE_KERNEL_RO), - flags); + level == PT_PGD ? UVMF_TLB_FLUSH : 0); + + if (level == PT_PTE) + xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn); + + if (ptl) { + /* Queue a deferred unlock for when this batch + is completed. */ + xen_mc_callback(do_unlock, ptl); + } } return flush; @@ -413,8 +461,7 @@ static int pin_page(struct page *page, unsigned flags) read-only, and can be pinned. */ void xen_pgd_pin(pgd_t *pgd) { - struct multicall_space mcs; - struct mmuext_op *op; + unsigned level; xen_mc_batch(); @@ -425,16 +472,13 @@ void xen_pgd_pin(pgd_t *pgd) xen_mc_batch(); } - mcs = __xen_mc_entry(sizeof(*op)); - op = mcs.args; - #ifdef CONFIG_X86_PAE - op->cmd = MMUEXT_PIN_L3_TABLE; + level = MMUEXT_PIN_L3_TABLE; #else - op->cmd = MMUEXT_PIN_L2_TABLE; + level = MMUEXT_PIN_L2_TABLE; #endif - op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd))); - MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + + xen_do_pin(level, PFN_DOWN(__pa(pgd))); xen_mc_issue(0); } @@ -442,7 +486,7 @@ void xen_pgd_pin(pgd_t *pgd) /* The init_mm pagetable is really pinned as soon as its created, but that's before we have page structures to store the bits. So do all the book-keeping now. 
*/ -static __init int mark_pinned(struct page *page, unsigned flags) +static __init int mark_pinned(struct page *page, enum pt_level level) { SetPagePinned(page); return 0; @@ -453,18 +497,32 @@ void __init xen_mark_init_mm_pinned(void) pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP); } -static int unpin_page(struct page *page, unsigned flags) +static int unpin_page(struct page *page, enum pt_level level) { unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags); if (pgfl && !PageHighMem(page)) { void *pt = lowmem_page_address(page); unsigned long pfn = page_to_pfn(page); - struct multicall_space mcs = __xen_mc_entry(0); + spinlock_t *ptl = NULL; + struct multicall_space mcs; + + if (level == PT_PTE) { + ptl = lock_pte(page); + + xen_do_pin(MMUEXT_UNPIN_TABLE, pfn); + } + + mcs = __xen_mc_entry(0); MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, pfn_pte(pfn, PAGE_KERNEL), - flags); + level == PT_PGD ? UVMF_TLB_FLUSH : 0); + + if (ptl) { + /* unlock when batch completed */ + xen_mc_callback(do_unlock, ptl); + } } return 0; /* never need to flush on unpin */ @@ -473,18 +531,9 @@ static int unpin_page(struct page *page, unsigned flags) /* Release a pagetables pages back as normal RW */ static void xen_pgd_unpin(pgd_t *pgd) { - struct mmuext_op *op; - struct multicall_space mcs; - xen_mc_batch(); - mcs = __xen_mc_entry(sizeof(*op)); - - op = mcs.args; - op->cmd = MMUEXT_UNPIN_TABLE; - op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd))); - - MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); pgd_walk(pgd, unpin_page, TASK_SIZE); @@ -515,20 +564,43 @@ static void drop_other_mm_ref(void *info) if (__get_cpu_var(cpu_tlbstate).active_mm == mm) leave_mm(smp_processor_id()); + + /* If this cpu still has a stale cr3 reference, then make sure + it has been flushed. 
*/ + if (x86_read_percpu(xen_current_cr3) == __pa(mm->pgd)) { + load_cr3(swapper_pg_dir); + arch_flush_lazy_cpu_mode(); + } } static void drop_mm_ref(struct mm_struct *mm) { + cpumask_t mask; + unsigned cpu; + if (current->active_mm == mm) { if (current->mm == mm) load_cr3(swapper_pg_dir); else leave_mm(smp_processor_id()); + arch_flush_lazy_cpu_mode(); } - if (!cpus_empty(mm->cpu_vm_mask)) - xen_smp_call_function_mask(mm->cpu_vm_mask, drop_other_mm_ref, - mm, 1); + /* Get the "official" set of cpus referring to our pagetable. */ + mask = mm->cpu_vm_mask; + + /* It's possible that a vcpu may have a stale reference to our + cr3, because its in lazy mode, and it hasn't yet flushed + its set of pending hypercalls yet. In this case, we can + look at its actual current cr3 value, and force it to flush + if needed. */ + for_each_online_cpu(cpu) { + if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd)) + cpu_set(cpu, mask); + } + + if (!cpus_empty(mask)) + xen_smp_call_function_mask(mask, drop_other_mm_ref, mm, 1); } #else static void drop_mm_ref(struct mm_struct *mm) @@ -563,5 +635,6 @@ void xen_exit_mmap(struct mm_struct *mm) /* pgd may not be pinned in the error exit path of execve */ if (PagePinned(virt_to_page(mm->pgd))) xen_pgd_unpin(mm->pgd); + spin_unlock(&mm->page_table_lock); } diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c index c837e8e463db..5e6f36f6d876 100644 --- a/arch/x86/xen/multicalls.c +++ b/arch/x86/xen/multicalls.c @@ -26,13 +26,22 @@ #include "multicalls.h" +#define MC_DEBUG 1 + #define MC_BATCH 32 #define MC_ARGS (MC_BATCH * 16 / sizeof(u64)) struct mc_buffer { struct multicall_entry entries[MC_BATCH]; +#if MC_DEBUG + struct multicall_entry debug[MC_BATCH]; +#endif u64 args[MC_ARGS]; - unsigned mcidx, argidx; + struct callback { + void (*fn)(void *); + void *data; + } callbacks[MC_BATCH]; + unsigned mcidx, argidx, cbidx; }; static DEFINE_PER_CPU(struct mc_buffer, mc_buffer); @@ -43,6 +52,7 @@ void xen_mc_flush(void) struct 
mc_buffer *b = &__get_cpu_var(mc_buffer); int ret = 0; unsigned long flags; + int i; BUG_ON(preemptible()); @@ -51,13 +61,31 @@ void xen_mc_flush(void) local_irq_save(flags); if (b->mcidx) { - int i; +#if MC_DEBUG + memcpy(b->debug, b->entries, + b->mcidx * sizeof(struct multicall_entry)); +#endif if (HYPERVISOR_multicall(b->entries, b->mcidx) != 0) BUG(); for (i = 0; i < b->mcidx; i++) if (b->entries[i].result < 0) ret++; + +#if MC_DEBUG + if (ret) { + printk(KERN_ERR "%d multicall(s) failed: cpu %d\n", + ret, smp_processor_id()); + for(i = 0; i < b->mcidx; i++) { + printk(" call %2d/%d: op=%lu arg=[%lx] result=%ld\n", + i+1, b->mcidx, + b->debug[i].op, + b->debug[i].args[0], + b->entries[i].result); + } + } +#endif + b->mcidx = 0; b->argidx = 0; } else @@ -65,6 +93,13 @@ void xen_mc_flush(void) local_irq_restore(flags); + for(i = 0; i < b->cbidx; i++) { + struct callback *cb = &b->callbacks[i]; + + (*cb->fn)(cb->data); + } + b->cbidx = 0; + BUG_ON(ret); } @@ -88,3 +123,16 @@ struct multicall_space __xen_mc_entry(size_t args) return ret; } + +void xen_mc_callback(void (*fn)(void *), void *data) +{ + struct mc_buffer *b = &__get_cpu_var(mc_buffer); + struct callback *cb; + + if (b->cbidx == MC_BATCH) + xen_mc_flush(); + + cb = &b->callbacks[b->cbidx++]; + cb->fn = fn; + cb->data = data; +} diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h index e6f7530b156c..8bae996d99a3 100644 --- a/arch/x86/xen/multicalls.h +++ b/arch/x86/xen/multicalls.h @@ -35,11 +35,14 @@ void xen_mc_flush(void); /* Issue a multicall if we're not in a lazy mode */ static inline void xen_mc_issue(unsigned mode) { - if ((xen_get_lazy_mode() & mode) == 0) + if ((paravirt_get_lazy_mode() & mode) == 0) xen_mc_flush(); /* restore flags saved in xen_mc_batch */ local_irq_restore(x86_read_percpu(xen_mc_irq_flags)); } +/* Set up a callback to be called when the current batch is flushed */ +void xen_mc_callback(void (*fn)(void *), void *data); + #endif /* _XEN_MULTICALLS_H */ diff --git 
a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 557b8e24706a..c1b131bcdcbe 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -147,8 +147,13 @@ void __init xen_smp_prepare_boot_cpu(void) make_lowmem_page_readwrite(&per_cpu__gdt_page); for (cpu = 0; cpu < NR_CPUS; cpu++) { - cpus_clear(cpu_sibling_map[cpu]); - cpus_clear(cpu_core_map[cpu]); + cpus_clear(per_cpu(cpu_sibling_map, cpu)); + /* + * cpu_core_map lives in a per cpu area that is cleared + * when the per cpu array is allocated. + * + * cpus_clear(per_cpu(cpu_core_map, cpu)); + */ } xen_setup_vcpu_info_placement(); @@ -159,8 +164,13 @@ void __init xen_smp_prepare_cpus(unsigned int max_cpus) unsigned cpu; for (cpu = 0; cpu < NR_CPUS; cpu++) { - cpus_clear(cpu_sibling_map[cpu]); - cpus_clear(cpu_core_map[cpu]); + cpus_clear(per_cpu(cpu_sibling_map, cpu)); + /* + * cpu_core_ map will be zeroed when the per + * cpu area is allocated. + * + * cpus_clear(per_cpu(cpu_core_map, cpu)); + */ } smp_store_cpu_info(0); @@ -346,6 +356,7 @@ static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id) */ irq_enter(); (*func)(info); + __get_cpu_var(irq_stat).irq_call_count++; irq_exit(); if (wait) { @@ -360,7 +371,8 @@ int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info, int wait) { struct call_data_struct data; - int cpus; + int cpus, cpu; + bool yield; /* Holding any lock stops cpus from going down. */ spin_lock(&call_lock); @@ -389,9 +401,14 @@ int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *), /* Send a message to other CPUs and wait for them to respond */ xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR); - /* Make sure other vcpus get a chance to run. - XXX too severe? Maybe we should check the other CPU's states? */ - HYPERVISOR_sched_op(SCHEDOP_yield, 0); + /* Make sure other vcpus get a chance to run if they need to. 
*/ + yield = false; + for_each_cpu_mask(cpu, mask) + if (xen_vcpu_stolen(cpu)) + yield = true; + + if (yield) + HYPERVISOR_sched_op(SCHEDOP_yield, 0); /* Wait for response */ while (atomic_read(&data.started) != cpus || diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index dfd6db69ead5..d083ff5ef088 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -105,6 +105,12 @@ static void get_runstate_snapshot(struct vcpu_runstate_info *res) } while (get64(&state->state_entry_time) != state_time); } +/* return true when a vcpu could run but has no real cpu to run on */ +bool xen_vcpu_stolen(int vcpu) +{ + return per_cpu(runstate, vcpu).state == RUNSTATE_runnable; +} + static void setup_runstate_info(int cpu) { struct vcpu_register_runstate_memory_area area; diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index b9aaea45f07f..b02a909bfd4c 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -11,6 +11,7 @@ void xen_copy_trap_info(struct trap_info *traps); DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu); DECLARE_PER_CPU(unsigned long, xen_cr3); +DECLARE_PER_CPU(unsigned long, xen_current_cr3); extern struct start_info *xen_start_info; extern struct shared_info *HYPERVISOR_shared_info; @@ -27,14 +28,9 @@ unsigned long xen_get_wallclock(void); int xen_set_wallclock(unsigned long time); unsigned long long xen_sched_clock(void); -void xen_mark_init_mm_pinned(void); - -DECLARE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode); +bool xen_vcpu_stolen(int vcpu); -static inline unsigned xen_get_lazy_mode(void) -{ - return x86_read_percpu(xen_lazy_mode); -} +void xen_mark_init_mm_pinned(void); void __init xen_fill_possible_map(void); |