Diffstat (limited to 'arch/x86_64')
38 files changed, 716 insertions, 266 deletions
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig index 45efe0ca88f8..6527a368fb9c 100644 --- a/arch/x86_64/Kconfig +++ b/arch/x86_64/Kconfig @@ -136,6 +136,11 @@ config X86_L1_CACHE_SHIFT default "7" if GENERIC_CPU || MPSC default "6" if MK8 +config X86_INTERNODE_CACHE_BYTES + int + default "4096" if X86_VSMP + default X86_L1_CACHE_BYTES if !X86_VSMP + config X86_TSC bool default y @@ -250,6 +255,15 @@ config SCHED_SMT cost of slightly increased overhead in some places. If unsure say N here. +config SCHED_MC + bool "Multi-core scheduler support" + depends on SMP + default y + help + Multi-core scheduler support improves the CPU scheduler's decision + making when dealing with multi-core CPU chips at a cost of slightly + increased overhead in some places. If unsure say N here. + source "kernel/Kconfig.preempt" config NUMA @@ -274,12 +288,18 @@ config K8_NUMA Northbridge of Opteron. It is recommended to use X86_64_ACPI_NUMA instead, which also takes priority if both are compiled in. +config NODES_SHIFT + int + default "6" + depends on NEED_MULTIPLE_NODES + # Dummy CONFIG option to select ACPI_NUMA from drivers/acpi/Kconfig. config X86_64_ACPI_NUMA bool "ACPI NUMA detection" depends on NUMA select ACPI + select PCI select ACPI_NUMA default y help @@ -325,6 +345,10 @@ config HAVE_ARCH_EARLY_PFN_TO_NID def_bool y depends on NUMA +config OUT_OF_LINE_PFN_TO_PAGE + def_bool y + depends on DISCONTIGMEM + config NR_CPUS int "Maximum number of CPUs (2-256)" range 2 255 diff --git a/arch/x86_64/Makefile b/arch/x86_64/Makefile index 585fd4a559c8..e573e2ab5510 100644 --- a/arch/x86_64/Makefile +++ b/arch/x86_64/Makefile @@ -24,37 +24,37 @@ LDFLAGS := -m elf_x86_64 OBJCOPYFLAGS := -O binary -R .note -R .comment -S LDFLAGS_vmlinux := - CHECKFLAGS += -D__x86_64__ -m64 +cflags-y := cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8) cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona) cflags-$(CONFIG_GENERIC_CPU) += $(call cc-option,-mtune=generic) -CFLAGS += $(cflags-y) -CFLAGS += -m64 -CFLAGS += -mno-red-zone -CFLAGS += -mcmodel=kernel -CFLAGS += -pipe +cflags-y += -m64 +cflags-y += -mno-red-zone +cflags-y += -mcmodel=kernel +cflags-y += -pipe cflags-$(CONFIG_REORDER) += -ffunction-sections # this makes reading assembly source easier, but produces worse code # actually it makes the kernel smaller too. -CFLAGS += -fno-reorder-blocks -CFLAGS += -Wno-sign-compare +cflags-y += -fno-reorder-blocks +cflags-y += -Wno-sign-compare ifneq ($(CONFIG_UNWIND_INFO),y) -CFLAGS += -fno-asynchronous-unwind-tables +cflags-y += -fno-asynchronous-unwind-tables endif ifneq ($(CONFIG_DEBUG_INFO),y) # -fweb shrinks the kernel a bit, but the difference is very small # it also messes up debugging, so don't use it for now. -#CFLAGS += $(call cc-option,-fweb) +#cflags-y += $(call cc-option,-fweb) endif # -funit-at-a-time shrinks the kernel .text considerably # unfortunately it makes reading oopses harder. 
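The new X86_INTERNODE_CACHE_BYTES option above (4096 bytes on X86_VSMP, the L1 line size otherwise) exists so data can be padded to the coherence unit between nodes rather than the per-CPU cache line. A minimal out-of-tree sketch of the idea, with the value hard-coded in place of the Kconfig symbol:

/* Hard-coded stand-in for CONFIG_X86_INTERNODE_CACHE_BYTES on vSMP. */
#define INTERNODE_CACHE_BYTES 4096

/* Padding a frequently-read object to the internode coherence unit
 * keeps writes to neighbouring data on one node from invalidating
 * this object's line on every other node (false sharing). */
struct read_mostly_flags {
	unsigned long flags;
} __attribute__((__aligned__(INTERNODE_CACHE_BYTES)));

The vmlinux.lds.S hunk later in this patch applies the same constant to align the .data.read_mostly section.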
-CFLAGS += $(call cc-option,-funit-at-a-time) +cflags-y += $(call cc-option,-funit-at-a-time) # prevent gcc from generating any FP code by mistake -CFLAGS += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,) +cflags-y += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,) +CFLAGS += $(cflags-y) AFLAGS += -m64 head-y := arch/x86_64/kernel/head.o arch/x86_64/kernel/head64.o arch/x86_64/kernel/init_task.o diff --git a/arch/x86_64/boot/video.S b/arch/x86_64/boot/video.S index 0587477c99f2..32327bb37aff 100644 --- a/arch/x86_64/boot/video.S +++ b/arch/x86_64/boot/video.S @@ -97,6 +97,7 @@ #define PARAM_VESAPM_OFF 0x30 #define PARAM_LFB_PAGES 0x32 #define PARAM_VESA_ATTRIB 0x34 +#define PARAM_CAPABILITIES 0x36 /* Define DO_STORE according to CONFIG_VIDEO_RETAIN */ #ifdef CONFIG_VIDEO_RETAIN @@ -233,6 +234,10 @@ mopar_gr: movw 18(%di), %ax movl %eax, %fs:(PARAM_LFB_SIZE) +# store mode capabilities + movl 10(%di), %eax + movl %eax, %fs:(PARAM_CAPABILITIES) + # switching the DAC to 8-bit is for <= 8 bpp only movw %fs:(PARAM_LFB_DEPTH), %ax cmpw $8, %ax diff --git a/arch/x86_64/defconfig b/arch/x86_64/defconfig index 566ecc97ee5a..69db0c0721d1 100644 --- a/arch/x86_64/defconfig +++ b/arch/x86_64/defconfig @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.16-git9 -# Sat Mar 25 15:18:40 2006 +# Linux kernel version: 2.6.17-rc1-git11 +# Sun Apr 16 07:22:36 2006 # CONFIG_X86_64=y CONFIG_64BIT=y @@ -9,6 +9,7 @@ CONFIG_X86=y CONFIG_SEMAPHORE_SLEEPERS=y CONFIG_MMU=y CONFIG_RWSEM_GENERIC_SPINLOCK=y +CONFIG_GENERIC_HWEIGHT=y CONFIG_GENERIC_CALIBRATE_DELAY=y CONFIG_X86_CMPXCHG=y CONFIG_EARLY_PRINTK=y @@ -55,11 +56,8 @@ CONFIG_BASE_FULL=y CONFIG_FUTEX=y CONFIG_EPOLL=y CONFIG_SHMEM=y -CONFIG_CC_ALIGN_FUNCTIONS=0 -CONFIG_CC_ALIGN_LABELS=0 -CONFIG_CC_ALIGN_LOOPS=0 -CONFIG_CC_ALIGN_JUMPS=0 CONFIG_SLAB=y +CONFIG_DOUBLEFAULT=y # CONFIG_TINY_SHMEM is not set CONFIG_BASE_SMALL=0 # CONFIG_SLOB is not set @@ -70,7 +68,6 @@ CONFIG_BASE_SMALL=0 CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y -CONFIG_OBSOLETE_MODPARM=y # CONFIG_MODVERSIONS is not set # CONFIG_MODULE_SRCVERSION_ALL is not set # CONFIG_KMOD is not set @@ -81,6 +78,7 @@ CONFIG_STOP_MACHINE=y # CONFIG_LBD=y # CONFIG_BLK_DEV_IO_TRACE is not set +# CONFIG_LSF is not set # # IO Schedulers @@ -105,6 +103,7 @@ CONFIG_X86_PC=y CONFIG_GENERIC_CPU=y CONFIG_X86_L1_CACHE_BYTES=128 CONFIG_X86_L1_CACHE_SHIFT=7 +CONFIG_X86_INTERNODE_CACHE_BYTES=128 CONFIG_X86_TSC=y CONFIG_X86_GOOD_APIC=y # CONFIG_MICROCODE is not set @@ -116,12 +115,14 @@ CONFIG_X86_LOCAL_APIC=y CONFIG_MTRR=y CONFIG_SMP=y CONFIG_SCHED_SMT=y +CONFIG_SCHED_MC=y # CONFIG_PREEMPT_NONE is not set CONFIG_PREEMPT_VOLUNTARY=y # CONFIG_PREEMPT is not set CONFIG_PREEMPT_BKL=y CONFIG_NUMA=y CONFIG_K8_NUMA=y +CONFIG_NODES_SHIFT=6 CONFIG_X86_64_ACPI_NUMA=y CONFIG_NUMA_EMU=y CONFIG_ARCH_DISCONTIGMEM_ENABLE=y @@ -138,6 +139,7 @@ CONFIG_NEED_MULTIPLE_NODES=y CONFIG_SPLIT_PTLOCK_CPUS=4 CONFIG_MIGRATION=y CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID=y +CONFIG_OUT_OF_LINE_PFN_TO_PAGE=y CONFIG_NR_CPUS=32 CONFIG_HOTPLUG_CPU=y CONFIG_HPET_TIMER=y @@ -289,6 +291,7 @@ CONFIG_IP_PNP_DHCP=y # CONFIG_INET_AH is not set # CONFIG_INET_ESP is not set # CONFIG_INET_IPCOMP is not set +# CONFIG_INET_XFRM_TUNNEL is not set # CONFIG_INET_TUNNEL is not set CONFIG_INET_DIAG=y CONFIG_INET_TCP_DIAG=y @@ -300,6 +303,7 @@ CONFIG_IPV6=y # CONFIG_INET6_AH is not set # CONFIG_INET6_ESP is not set # CONFIG_INET6_IPCOMP is not set +# CONFIG_INET6_XFRM_TUNNEL is not set # 
CONFIG_INET6_TUNNEL is not set # CONFIG_IPV6_TUNNEL is not set # CONFIG_NETFILTER is not set @@ -542,7 +546,6 @@ CONFIG_SCSI_SATA_INTEL_COMBINED=y # CONFIG_SCSI_INIA100 is not set # CONFIG_SCSI_SYM53C8XX_2 is not set # CONFIG_SCSI_IPR is not set -# CONFIG_SCSI_QLOGIC_FC is not set # CONFIG_SCSI_QLOGIC_1280 is not set # CONFIG_SCSI_QLA_FC is not set # CONFIG_SCSI_LPFC is not set @@ -704,7 +707,6 @@ CONFIG_S2IO=m # Wireless LAN (non-hamradio) # # CONFIG_NET_RADIO is not set -# CONFIG_NET_WIRELESS_RTNETLINK is not set # # Wan interfaces @@ -791,7 +793,7 @@ CONFIG_HW_CONSOLE=y # CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y -# CONFIG_SERIAL_8250_ACPI is not set +CONFIG_SERIAL_8250_PCI=y CONFIG_SERIAL_8250_NR_UARTS=4 CONFIG_SERIAL_8250_RUNTIME_UARTS=4 # CONFIG_SERIAL_8250_EXTENDED is not set @@ -921,6 +923,7 @@ CONFIG_HWMON=y # Digital Video Broadcasting Devices # # CONFIG_DVB is not set +# CONFIG_USB_DABUSB is not set # # Graphics support @@ -932,6 +935,8 @@ CONFIG_VIDEO_SELECT=y # Console display driver support # CONFIG_VGA_CONSOLE=y +CONFIG_VGACON_SOFT_SCROLLBACK=y +CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=256 CONFIG_DUMMY_CONSOLE=y # @@ -1041,9 +1046,7 @@ CONFIG_USB_HIDINPUT=y # CONFIG_USB_ACECAD is not set # CONFIG_USB_KBTAB is not set # CONFIG_USB_POWERMATE is not set -# CONFIG_USB_MTOUCH is not set -# CONFIG_USB_ITMTOUCH is not set -# CONFIG_USB_EGALAX is not set +# CONFIG_USB_TOUCHSCREEN is not set # CONFIG_USB_YEALINK is not set # CONFIG_USB_XPAD is not set # CONFIG_USB_ATI_REMOTE is not set @@ -1058,15 +1061,6 @@ CONFIG_USB_HIDINPUT=y # CONFIG_USB_MICROTEK is not set # -# USB Multimedia devices -# -# CONFIG_USB_DABUSB is not set - -# -# Video4Linux support is needed for USB Multimedia device support -# - -# # USB Network Adapters # # CONFIG_USB_CATC is not set @@ -1118,9 +1112,23 @@ CONFIG_USB_MON=y # CONFIG_MMC is not set # +# LED devices +# +# CONFIG_NEW_LEDS is not set + +# +# LED drivers +# + +# +# LED Triggers +# + +# # InfiniBand support # # CONFIG_INFINIBAND is not set +# CONFIG_IPATH_CORE is not set # # EDAC - error detection and reporting (RAS) (EXPERIMENTAL) @@ -1128,6 +1136,11 @@ CONFIG_USB_MON=y # CONFIG_EDAC is not set # +# Real Time Clock +# +# CONFIG_RTC_CLASS is not set + +# # Firmware Drivers # # CONFIG_EDD is not set diff --git a/arch/x86_64/ia32/Makefile b/arch/x86_64/ia32/Makefile index 929e6b0771f8..e9263b4975e0 100644 --- a/arch/x86_64/ia32/Makefile +++ b/arch/x86_64/ia32/Makefile @@ -27,5 +27,5 @@ $(obj)/vsyscall-sysenter.so $(obj)/vsyscall-syscall.so: \ $(obj)/vsyscall-%.so: $(src)/vsyscall.lds $(obj)/vsyscall-%.o FORCE $(call if_changed,syscall) -AFLAGS_vsyscall-sysenter.o = -m32 -AFLAGS_vsyscall-syscall.o = -m32 +AFLAGS_vsyscall-sysenter.o = -m32 -Wa,-32 +AFLAGS_vsyscall-syscall.o = -m32 -Wa,-32 diff --git a/arch/x86_64/ia32/ia32_binfmt.c b/arch/x86_64/ia32/ia32_binfmt.c index e776139afb20..926c4743d13b 100644 --- a/arch/x86_64/ia32/ia32_binfmt.c +++ b/arch/x86_64/ia32/ia32_binfmt.c @@ -339,7 +339,7 @@ int ia32_setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_top, struct mm_struct *mm = current->mm; int i, ret; - stack_base = IA32_STACK_TOP - MAX_ARG_PAGES * PAGE_SIZE; + stack_base = stack_top - MAX_ARG_PAGES * PAGE_SIZE; mm->arg_start = bprm->p + stack_base; bprm->p += stack_base; @@ -357,7 +357,7 @@ int ia32_setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_top, { mpnt->vm_mm = mm; mpnt->vm_start = PAGE_MASK & (unsigned long) bprm->p; - mpnt->vm_end = IA32_STACK_TOP; + mpnt->vm_end = stack_top; if (executable_stack == 
EXSTACK_ENABLE_X) mpnt->vm_flags = VM_STACK_FLAGS | VM_EXEC; else if (executable_stack == EXSTACK_DISABLE_X) diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S index 7549a4389fbf..5a92fed2d1d5 100644 --- a/arch/x86_64/ia32/ia32entry.S +++ b/arch/x86_64/ia32/ia32entry.S @@ -15,6 +15,8 @@ #include <asm/vsyscall32.h> #include <linux/linkage.h> +#define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8) + .macro IA32_ARG_FIXUP noebp=0 movl %edi,%r8d .if \noebp @@ -109,8 +111,8 @@ ENTRY(ia32_sysenter_target) CFI_REMEMBER_STATE jnz sysenter_tracesys sysenter_do_call: - cmpl $(IA32_NR_syscalls),%eax - jae ia32_badsys + cmpl $(IA32_NR_syscalls-1),%eax + ja ia32_badsys IA32_ARG_FIXUP 1 call *ia32_sys_call_table(,%rax,8) movq %rax,RAX-ARGOFFSET(%rsp) @@ -210,8 +212,8 @@ ENTRY(ia32_cstar_target) CFI_REMEMBER_STATE jnz cstar_tracesys cstar_do_call: - cmpl $IA32_NR_syscalls,%eax - jae ia32_badsys + cmpl $IA32_NR_syscalls-1,%eax + ja ia32_badsys IA32_ARG_FIXUP 1 call *ia32_sys_call_table(,%rax,8) movq %rax,RAX-ARGOFFSET(%rsp) @@ -296,8 +298,8 @@ ENTRY(ia32_syscall) testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10) jnz ia32_tracesys ia32_do_syscall: - cmpl $(IA32_NR_syscalls),%eax - jae ia32_badsys + cmpl $(IA32_NR_syscalls-1),%eax + ja ia32_badsys IA32_ARG_FIXUP call *ia32_sys_call_table(,%rax,8) # xxx: rip relative ia32_sysret: @@ -685,10 +687,13 @@ ia32_sys_call_table: .quad sys_readlinkat /* 305 */ .quad sys_fchmodat .quad sys_faccessat - .quad sys_ni_syscall /* pselect6 for now */ - .quad sys_ni_syscall /* ppoll for now */ + .quad quiet_ni_syscall /* pselect6 for now */ + .quad quiet_ni_syscall /* ppoll for now */ .quad sys_unshare /* 310 */ + .quad compat_sys_set_robust_list + .quad compat_sys_get_robust_list + .quad sys_splice + .quad sys_sync_file_range + .quad sys_tee + .quad compat_sys_vmsplice ia32_syscall_end: - .rept IA32_NR_syscalls-(ia32_syscall_end-ia32_sys_call_table)/8 - .quad ni_syscall - .endr diff --git a/arch/x86_64/ia32/vsyscall-sigreturn.S b/arch/x86_64/ia32/vsyscall-sigreturn.S index d90321fe9bba..1384367cdbe1 100644 --- a/arch/x86_64/ia32/vsyscall-sigreturn.S +++ b/arch/x86_64/ia32/vsyscall-sigreturn.S @@ -32,9 +32,28 @@ __kernel_rt_sigreturn: .size __kernel_rt_sigreturn,.-.LSTART_rt_sigreturn .section .eh_frame,"a",@progbits +.LSTARTFRAMES: + .long .LENDCIES-.LSTARTCIES +.LSTARTCIES: + .long 0 /* CIE ID */ + .byte 1 /* Version number */ + .string "zRS" /* NUL-terminated augmentation string */ + .uleb128 1 /* Code alignment factor */ + .sleb128 -4 /* Data alignment factor */ + .byte 8 /* Return address register column */ + .uleb128 1 /* Augmentation value length */ + .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ + .byte 0x0c /* DW_CFA_def_cfa */ + .uleb128 4 + .uleb128 4 + .byte 0x88 /* DW_CFA_offset, column 0x8 */ + .uleb128 1 + .align 4 +.LENDCIES: + .long .LENDFDE2-.LSTARTFDE2 /* Length FDE */ .LSTARTFDE2: - .long .LSTARTFDE2-.LSTARTFRAME /* CIE pointer */ + .long .LSTARTFDE2-.LSTARTFRAMES /* CIE pointer */ /* HACK: The dwarf2 unwind routines will subtract 1 from the return address to get an address in the middle of the presumed call instruction. Since we didn't get here via @@ -97,7 +116,7 @@ __kernel_rt_sigreturn: .long .LENDFDE3-.LSTARTFDE3 /* Length FDE */ .LSTARTFDE3: - .long .LSTARTFDE3-.LSTARTFRAME /* CIE pointer */ + .long .LSTARTFDE3-.LSTARTFRAMES /* CIE pointer */ /* HACK: See above wrt unwind library assumptions. */ .long .LSTART_rt_sigreturn-1-. 
/* PC-relative start address */ .long .LEND_rt_sigreturn-.LSTART_rt_sigreturn+1 diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile index a098a11e7755..059c88313f4e 100644 --- a/arch/x86_64/kernel/Makefile +++ b/arch/x86_64/kernel/Makefile @@ -8,7 +8,7 @@ obj-y := process.o signal.o entry.o traps.o irq.o \ ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_x86_64.o \ x8664_ksyms.o i387.o syscall.o vsyscall.o \ setup64.o bootflag.o e820.o reboot.o quirks.o i8237.o \ - dmi_scan.o pci-dma.o pci-nommu.o + pci-dma.o pci-nommu.o obj-$(CONFIG_X86_MCE) += mce.o obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o @@ -49,5 +49,3 @@ intel_cacheinfo-y += ../../i386/kernel/cpu/intel_cacheinfo.o quirks-y += ../../i386/kernel/quirks.o i8237-y += ../../i386/kernel/i8237.o msr-$(subst m,y,$(CONFIG_X86_MSR)) += ../../i386/kernel/msr.o -dmi_scan-y += ../../i386/kernel/dmi_scan.o - diff --git a/arch/x86_64/kernel/aperture.c b/arch/x86_64/kernel/aperture.c index fffd6b0a2fab..70b9d21ed675 100644 --- a/arch/x86_64/kernel/aperture.c +++ b/arch/x86_64/kernel/aperture.c @@ -80,7 +80,7 @@ static int __init aperture_valid(char *name, u64 aper_base, u32 aper_size) printk("Aperture from %s beyond 4GB. Ignoring.\n",name); return 0; } - if (e820_mapped(aper_base, aper_base + aper_size, E820_RAM)) { + if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) { printk("Aperture from %s pointing to e820 RAM. Ignoring.\n",name); return 0; } diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index d54620147e8e..100a30c40044 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -615,7 +615,7 @@ static int __init apic_set_verbosity(char *str) printk(KERN_WARNING "APIC Verbosity level %s not recognised" " use apic=verbose or apic=debug", str); - return 0; + return 1; } __setup("apic=", apic_set_verbosity); @@ -1137,35 +1137,35 @@ int __init APIC_init_uniprocessor (void) static __init int setup_disableapic(char *str) { disable_apic = 1; - return 0; + return 1; } static __init int setup_nolapic(char *str) { disable_apic = 1; - return 0; + return 1; } static __init int setup_noapictimer(char *str) { if (str[0] != ' ' && str[0] != 0) - return -1; + return 0; disable_apic_timer = 1; - return 0; + return 1; } static __init int setup_apicmaintimer(char *str) { apic_runs_main_timer = 1; nohpet = 1; - return 0; + return 1; } __setup("apicmaintimer", setup_apicmaintimer); static __init int setup_noapicmaintimer(char *str) { apic_runs_main_timer = -1; - return 0; + return 1; } __setup("noapicmaintimer", setup_noapicmaintimer); diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c index 293cd71a266a..1ef6028f721e 100644 --- a/arch/x86_64/kernel/e820.c +++ b/arch/x86_64/kernel/e820.c @@ -76,11 +76,22 @@ static inline int bad_addr(unsigned long *addrp, unsigned long size) *addrp = __pa_symbol(&_end); return 1; } + + if (last >= ebda_addr && addr < ebda_addr + ebda_size) { + *addrp = ebda_addr + ebda_size; + return 1; + } + /* XXX ramdisk image here? */ return 0; } -int __init e820_mapped(unsigned long start, unsigned long end, unsigned type) +/* + * This function checks if any part of the range <start,end> is mapped + * with type. + */ +int __meminit +e820_any_mapped(unsigned long start, unsigned long end, unsigned type) { int i; for (i = 0; i < e820.nr_map; i++) { @@ -94,6 +105,35 @@ int __init e820_mapped(unsigned long start, unsigned long end, unsigned type) return 0; } +/* + * This function checks if the entire range <start,end> is mapped with type. 
+ * + * Note: this function only works correct if the e820 table is sorted and + * not-overlapping, which is the case + */ +int __init e820_all_mapped(unsigned long start, unsigned long end, unsigned type) +{ + int i; + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + if (type && ei->type != type) + continue; + /* is the region (part) in overlap with the current region ?*/ + if (ei->addr >= end || ei->addr + ei->size <= start) + continue; + + /* if the region is at the beginning of <start,end> we move + * start to the end of the region since it's ok until there + */ + if (ei->addr <= start) + start = ei->addr + ei->size; + /* if start is now at or beyond end, we're done, full coverage */ + if (start >= end) + return 1; /* we're done */ + } + return 0; +} + /* * Find a free area in a specific range. */ @@ -109,7 +149,7 @@ unsigned long __init find_e820_area(unsigned long start, unsigned long end, unsi addr = start; if (addr > ei->addr + ei->size) continue; - while (bad_addr(&addr, size) && addr+size < ei->addr + ei->size) + while (bad_addr(&addr, size) && addr+size <= ei->addr+ei->size) ; last = addr + size; if (last > ei->addr + ei->size) diff --git a/arch/x86_64/kernel/early_printk.c b/arch/x86_64/kernel/early_printk.c index 13af920b6594..b93ef5b51980 100644 --- a/arch/x86_64/kernel/early_printk.c +++ b/arch/x86_64/kernel/early_printk.c @@ -221,7 +221,7 @@ int __init setup_early_printk(char *opt) char buf[256]; if (early_console_initialized) - return -1; + return 1; strlcpy(buf,opt,sizeof(buf)); space = strchr(buf, ' '); diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index 8538bfea30e6..586b34c00c48 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -180,6 +180,10 @@ rff_trace: * * XXX if we had a free scratch register we could save the RSP into the stack frame * and report it properly in ps. Unfortunately we haven't. + * + * When user can change the frames always force IRET. That is because + * it deals with uncanonical addresses better. SYSRET has trouble + * with them due to bugs in both AMD and Intel CPUs. */ ENTRY(system_call) @@ -254,7 +258,10 @@ sysret_signal: xorl %esi,%esi # oldset -> arg2 call ptregscall_common 1: movl $_TIF_NEED_RESCHED,%edi - jmp sysret_check + /* Use IRET because user could have changed frame. This + works because ptregscall_common has called FIXUP_TOP_OF_STACK. 
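The e820_all_mapped() walk added above relies on the table being sorted and non-overlapping: it slides start forward through every region that covers it and declares full coverage once start reaches end. A standalone sketch of the same scan (struct range and its layout are invented for illustration):

struct range { unsigned long start, end; };	/* [start, end) */

static int all_mapped(const struct range *map, int n,
		      unsigned long start, unsigned long end)
{
	for (int i = 0; i < n; i++) {
		if (map[i].end <= start || map[i].start >= end)
			continue;		/* no overlap with <start,end> */
		if (map[i].start <= start)
			start = map[i].end;	/* covered up to here */
		if (start >= end)
			return 1;		/* full coverage */
	}
	return 0;
}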
*/ + cli + jmp int_with_check badsys: movq $-ENOSYS,RAX-ARGOFFSET(%rsp) @@ -274,13 +281,9 @@ tracesys: ja 1f movq %r10,%rcx /* fixup for C */ call *sys_call_table(,%rax,8) - movq %rax,RAX-ARGOFFSET(%rsp) -1: SAVE_REST - movq %rsp,%rdi - call syscall_trace_leave - RESTORE_TOP_OF_STACK %rbx - RESTORE_REST - jmp ret_from_sys_call +1: movq %rax,RAX-ARGOFFSET(%rsp) + /* Use IRET because user could have changed frame */ + jmp int_ret_from_sys_call CFI_ENDPROC /* @@ -408,25 +411,9 @@ ENTRY(stub_execve) CFI_ADJUST_CFA_OFFSET -8 CFI_REGISTER rip, r11 SAVE_REST - movq %r11, %r15 - CFI_REGISTER rip, r15 FIXUP_TOP_OF_STACK %r11 call sys_execve - GET_THREAD_INFO(%rcx) - bt $TIF_IA32,threadinfo_flags(%rcx) - CFI_REMEMBER_STATE - jc exec_32bit RESTORE_TOP_OF_STACK %r11 - movq %r15, %r11 - CFI_REGISTER rip, r11 - RESTORE_REST - pushq %r11 - CFI_ADJUST_CFA_OFFSET 8 - CFI_REL_OFFSET rip, 0 - ret - -exec_32bit: - CFI_RESTORE_STATE movq %rax,RAX(%rsp) RESTORE_REST jmp int_ret_from_sys_call diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index 77b4c608cca0..9cc7031b7151 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -271,6 +271,18 @@ __setup("enable_8254_timer", setup_enable_8254_timer); #include <linux/pci_ids.h> #include <linux/pci.h> + +#ifdef CONFIG_ACPI + +static int nvidia_hpet_detected __initdata; + +static int __init nvidia_hpet_check(unsigned long phys, unsigned long size) +{ + nvidia_hpet_detected = 1; + return 0; +} +#endif + /* Temporary Hack. Nvidia and VIA boards currently only work with IO-APIC off. Check for an Nvidia or VIA PCI bridge and turn it off. Use pci direct infrastructure because this runs before the PCI subsystem. @@ -317,11 +329,19 @@ void __init check_ioapic(void) return; case PCI_VENDOR_ID_NVIDIA: #ifdef CONFIG_ACPI - /* All timer overrides on Nvidia - seem to be wrong. Skip them. */ - acpi_skip_timer_override = 1; - printk(KERN_INFO - "Nvidia board detected. Ignoring ACPI timer override.\n"); + /* + * All timer overrides on Nvidia are + * wrong unless HPET is enabled. + */ + nvidia_hpet_detected = 0; + acpi_table_parse(ACPI_HPET, + nvidia_hpet_check); + if (nvidia_hpet_detected == 0) { + acpi_skip_timer_override = 1; + printk(KERN_INFO "Nvidia board " + "detected. Ignoring ACPI " + "timer override.\n"); + } #endif /* RED-PEN skip them on mptables too? */ return; @@ -1777,6 +1797,8 @@ static inline void unlock_ExtINT_logic(void) spin_unlock_irqrestore(&ioapic_lock, flags); } +int timer_uses_ioapic_pin_0; + /* * This code may look a bit paranoid, but it's supposed to cooperate with * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ @@ -1814,6 +1836,9 @@ static inline void check_timer(void) pin2 = ioapic_i8259.pin; apic2 = ioapic_i8259.apic; + if (pin1 == 0) + timer_uses_ioapic_pin_0 = 1; + apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n", vector, apic1, pin1, apic2, pin2); diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c index accbff3fec49..fa1d19ca700a 100644 --- a/arch/x86_64/kernel/kprobes.c +++ b/arch/x86_64/kernel/kprobes.c @@ -53,7 +53,7 @@ DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); /* * returns non-zero if opcode modifies the interrupt flag. 
*/ -static inline int is_IF_modifier(kprobe_opcode_t *insn) +static __always_inline int is_IF_modifier(kprobe_opcode_t *insn) { switch (*insn) { case 0xfa: /* cli */ @@ -84,7 +84,7 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p) * If it does, return the address of the 32-bit displacement word. * If not, return null. */ -static inline s32 *is_riprel(u8 *insn) +static s32 __kprobes *is_riprel(u8 *insn) { #define W(row,b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,ba,bb,bc,bd,be,bf) \ (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ @@ -229,7 +229,7 @@ void __kprobes arch_remove_kprobe(struct kprobe *p) mutex_unlock(&kprobe_mutex); } -static inline void save_previous_kprobe(struct kprobe_ctlblk *kcb) +static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb) { kcb->prev_kprobe.kp = kprobe_running(); kcb->prev_kprobe.status = kcb->kprobe_status; @@ -237,7 +237,7 @@ static inline void save_previous_kprobe(struct kprobe_ctlblk *kcb) kcb->prev_kprobe.saved_rflags = kcb->kprobe_saved_rflags; } -static inline void restore_previous_kprobe(struct kprobe_ctlblk *kcb) +static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb) { __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp; kcb->kprobe_status = kcb->prev_kprobe.status; @@ -245,7 +245,7 @@ static inline void restore_previous_kprobe(struct kprobe_ctlblk *kcb) kcb->kprobe_saved_rflags = kcb->prev_kprobe.saved_rflags; } -static inline void set_current_kprobe(struct kprobe *p, struct pt_regs *regs, +static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb) { __get_cpu_var(current_kprobe) = p; @@ -514,13 +514,13 @@ static void __kprobes resume_execution(struct kprobe *p, *tos = orig_rip + (*tos - copy_rip); break; case 0xff: - if ((*insn & 0x30) == 0x10) { + if ((insn[1] & 0x30) == 0x10) { /* call absolute, indirect */ /* Fix return addr; rip is correct. */ next_rip = regs->rip; *tos = orig_rip + (*tos - copy_rip); - } else if (((*insn & 0x31) == 0x20) || /* jmp near, absolute indirect */ - ((*insn & 0x31) == 0x21)) { /* jmp far, absolute indirect */ + } else if (((insn[1] & 0x31) == 0x20) || /* jmp near, absolute indirect */ + ((insn[1] & 0x31) == 0x21)) { /* jmp far, absolute indirect */ /* rip is correct. */ next_rip = regs->rip; } diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c index 04282ef9fbd4..c69fc43cee7b 100644 --- a/arch/x86_64/kernel/mce.c +++ b/arch/x86_64/kernel/mce.c @@ -29,6 +29,8 @@ #define MISC_MCELOG_MINOR 227 #define NR_BANKS 6 +atomic_t mce_entry; + static int mce_dont_init; /* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic, @@ -172,10 +174,12 @@ void do_machine_check(struct pt_regs * regs, long error_code) int i; int panicm_found = 0; + atomic_inc(&mce_entry); + if (regs) notify_die(DIE_NMI, "machine check", regs, error_code, 18, SIGKILL); if (!banks) - return; + goto out2; memset(&m, 0, sizeof(struct mce)); m.cpu = safe_smp_processor_id(); @@ -266,6 +270,8 @@ void do_machine_check(struct pt_regs * regs, long error_code) out: /* Last thing done in the machine check exception to clear state. */ wrmsrl(MSR_IA32_MCG_STATUS, 0); + out2: + atomic_dec(&mce_entry); } /* @@ -501,7 +507,7 @@ static struct miscdevice mce_log_device = { static int __init mcheck_disable(char *str) { mce_dont_init = 1; - return 0; + return 1; } /* mce=off disables machine check. 
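The new mce_entry counter above is held for the whole lifetime of do_machine_check() so that other NMI-context code (the nmi.c hunk further down) can tell a machine check is in flight. The pattern in miniature, using C11 atomics in place of the kernel's atomic_t:

#include <stdatomic.h>

static atomic_int mce_entry;

void machine_check(void)
{
	atomic_fetch_add(&mce_entry, 1);
	/* ... bank scanning, logging, possible panic ... */
	atomic_fetch_sub(&mce_entry, 1);
}

/* Watchdog side: a live handler counts as CPU activity, so the
 * NMI watchdog must not declare this CPU stuck. */
int mce_in_progress(void)
{
	return atomic_load(&mce_entry) > 0;
}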
Note you can reenable it later @@ -521,7 +527,7 @@ static int __init mcheck_enable(char *str) get_option(&str, &tolerant); else printk("mce= argument %s ignored. Please use /sys", str); - return 0; + return 1; } __setup("nomce", mcheck_disable); @@ -623,7 +629,7 @@ static __cpuinit void mce_remove_device(unsigned int cpu) #endif /* Get notified when a cpu comes on/off. Be hotplug friendly. */ -static __cpuinit int +static int mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; diff --git a/arch/x86_64/kernel/mce_amd.c b/arch/x86_64/kernel/mce_amd.c index d3ad7d81266d..d13b241ad094 100644 --- a/arch/x86_64/kernel/mce_amd.c +++ b/arch/x86_64/kernel/mce_amd.c @@ -482,7 +482,7 @@ static void threshold_remove_device(unsigned int cpu) #endif /* get notified when a cpu comes on/off */ -static __cpuinit int threshold_cpu_callback(struct notifier_block *nfb, +static int threshold_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { /* cpu was unsigned int to begin with */ diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c index b17cf3eba359..083da7e606b1 100644 --- a/arch/x86_64/kernel/mpparse.c +++ b/arch/x86_64/kernel/mpparse.c @@ -968,7 +968,17 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity) */ int irq = gsi; if (gsi < MAX_GSI_NUM) { - if (gsi > 15) + /* + * Retain the VIA chipset work-around (gsi > 15), but + * avoid a problem where the 8254 timer (IRQ0) is setup + * via an override (so it's not on pin 0 of the ioapic), + * and at the same time, the pin 0 interrupt is a PCI + * type. The gsi > 15 test could cause these two pins + * to be shared as IRQ0, and they are not shareable. + * So test for this condition, and if necessary, avoid + * the pin collision. + */ + if (gsi > 15 || (gsi == 0 && !timer_uses_ioapic_pin_0)) gsi = pci_irq++; /* * Don't assign IRQ used by ACPI SCI diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index d9e4067faf05..4e6357fe0ec3 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -34,6 +34,7 @@ #include <asm/proto.h> #include <asm/kdebug.h> #include <asm/local.h> +#include <asm/mce.h> /* * lapic_nmi_owner tracks the ownership of the lapic NMI hardware: @@ -480,6 +481,12 @@ void __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) __get_cpu_var(nmi_touch) = 0; touched = 1; } +#ifdef CONFIG_X86_MCE + /* Could check oops_in_progress here too, but it's safer + not too */ + if (atomic_read(&mce_entry) > 0) + touched = 1; +#endif if (!touched && __get_cpu_var(last_irq_sum) == sum) { /* * Ayiee, looks like this CPU is stuck ... diff --git a/arch/x86_64/kernel/pci-dma.c b/arch/x86_64/kernel/pci-dma.c index 03c9eeedb0f3..a9275c9557cf 100644 --- a/arch/x86_64/kernel/pci-dma.c +++ b/arch/x86_64/kernel/pci-dma.c @@ -48,10 +48,16 @@ dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order) { struct page *page; int node; +#ifdef CONFIG_PCI if (dev->bus == &pci_bus_type) node = pcibus_to_node(to_pci_dev(dev)->bus); else +#endif node = numa_node_id(); + + if (node < first_node(node_online_map)) + node = first_node(node_online_map); + page = alloc_pages_node(node, gfp, order); return page ? 
page_address(page) : NULL; } diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c index a6c01e121266..82a7c9bfdfa0 100644 --- a/arch/x86_64/kernel/pci-gart.c +++ b/arch/x86_64/kernel/pci-gart.c @@ -112,10 +112,6 @@ static unsigned long alloc_iommu(int size) static void free_iommu(unsigned long offset, int size) { unsigned long flags; - if (size == 1) { - clear_bit(offset, iommu_gart_bitmap); - return; - } spin_lock_irqsave(&iommu_bitmap_lock, flags); __clear_bit_string(iommu_gart_bitmap, offset, size); spin_unlock_irqrestore(&iommu_bitmap_lock, flags); @@ -635,14 +631,20 @@ static int __init pci_iommu_init(void) printk(KERN_INFO "PCI-DMA: Disabling IOMMU.\n"); if (end_pfn > MAX_DMA32_PFN) { printk(KERN_ERR "WARNING more than 4GB of memory " - "but IOMMU not compiled in.\n" - KERN_ERR "WARNING 32bit PCI may malfunction.\n" - KERN_ERR "You might want to enable " - "CONFIG_GART_IOMMU\n"); + "but IOMMU not available.\n" + KERN_ERR "WARNING 32bit PCI may malfunction.\n"); } return -1; } + i = 0; + for_all_nb(dev) + i++; + if (i > MAX_NB) { + printk(KERN_ERR "PCI-GART: Too many northbridges (%ld). Disabled\n", i); + return -1; + } + printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n"); aper_size = info.aper_size * 1024 * 1024; iommu_size = check_iommu_size(info.aper_base, aper_size); diff --git a/arch/x86_64/kernel/pci-nommu.c b/arch/x86_64/kernel/pci-nommu.c index 44adcc2d5e5b..1f6ecc62061d 100644 --- a/arch/x86_64/kernel/pci-nommu.c +++ b/arch/x86_64/kernel/pci-nommu.c @@ -12,9 +12,10 @@ static int check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size) { if (hwdev && bus + size > *hwdev->dma_mask) { - printk(KERN_ERR - "nommu_%s: overflow %Lx+%lu of device mask %Lx\n", - name, (long long)bus, size, (long long)*hwdev->dma_mask); + if (*hwdev->dma_mask >= 0xffffffffULL) + printk(KERN_ERR + "nommu_%s: overflow %Lx+%lu of device mask %Lx\n", + name, (long long)bus, size, (long long)*hwdev->dma_mask); return 0; } return 1; diff --git a/arch/x86_64/kernel/pmtimer.c b/arch/x86_64/kernel/pmtimer.c index ee5ee4891f3d..bf421ed26808 100644 --- a/arch/x86_64/kernel/pmtimer.c +++ b/arch/x86_64/kernel/pmtimer.c @@ -68,7 +68,7 @@ int pmtimer_mark_offset(void) offset_delay = delta % (USEC_PER_SEC / HZ); rdtscll(tsc); - vxtime.last_tsc = tsc - offset_delay * cpu_khz; + vxtime.last_tsc = tsc - offset_delay * (u64)cpu_khz / 1000; /* don't calculate delay for first run, or if we've got less then a tick */ @@ -121,7 +121,7 @@ unsigned int do_gettimeoffset_pm(void) static int __init nopmtimer_setup(char *s) { pmtmr_ioport = 0; - return 0; + return 1; } __setup("nopmtimer", nopmtimer_setup); diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index 0370720515f1..fb903e65e079 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -66,24 +66,17 @@ EXPORT_SYMBOL(boot_option_idle_override); void (*pm_idle)(void); static DEFINE_PER_CPU(unsigned int, cpu_idle_state); -static struct notifier_block *idle_notifier; -static DEFINE_SPINLOCK(idle_notifier_lock); +static ATOMIC_NOTIFIER_HEAD(idle_notifier); void idle_notifier_register(struct notifier_block *n) { - unsigned long flags; - spin_lock_irqsave(&idle_notifier_lock, flags); - notifier_chain_register(&idle_notifier, n); - spin_unlock_irqrestore(&idle_notifier_lock, flags); + atomic_notifier_chain_register(&idle_notifier, n); } EXPORT_SYMBOL_GPL(idle_notifier_register); void idle_notifier_unregister(struct notifier_block *n) { - unsigned long flags; - spin_lock_irqsave(&idle_notifier_lock, 
flags); - notifier_chain_unregister(&idle_notifier, n); - spin_unlock_irqrestore(&idle_notifier_lock, flags); + atomic_notifier_chain_unregister(&idle_notifier, n); } EXPORT_SYMBOL(idle_notifier_unregister); @@ -93,13 +86,13 @@ static DEFINE_PER_CPU(enum idle_state, idle_state) = CPU_NOT_IDLE; void enter_idle(void) { __get_cpu_var(idle_state) = CPU_IDLE; - notifier_call_chain(&idle_notifier, IDLE_START, NULL); + atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL); } static void __exit_idle(void) { __get_cpu_var(idle_state) = CPU_NOT_IDLE; - notifier_call_chain(&idle_notifier, IDLE_END, NULL); + atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL); } /* Called from interrupts to signify idle end */ @@ -582,8 +575,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) prev->userrsp = read_pda(oldrsp); write_pda(oldrsp, next->userrsp); write_pda(pcurrent, next_p); + /* This must be here to ensure both math_state_restore() and - kernel_fpu_begin() work consistently. */ + kernel_fpu_begin() work consistently. + And the AMD workaround requires it to be after DS reload. */ unlazy_fpu(prev_p); write_pda(kernelstack, task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET); @@ -788,10 +783,16 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) } case ARCH_GET_GS: { unsigned long base; + unsigned gsindex; if (task->thread.gsindex == GS_TLS_SEL) base = read_32bit_tls(task, GS_TLS); - else if (doit) - rdmsrl(MSR_KERNEL_GS_BASE, base); + else if (doit) { + asm("movl %%gs,%0" : "=r" (gsindex)); + if (gsindex) + rdmsrl(MSR_KERNEL_GS_BASE, base); + else + base = task->thread.gs; + } else base = task->thread.gs; ret = put_user(base, (unsigned long __user *)addr); diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c index d44b2c1e63a6..2d50024c9f30 100644 --- a/arch/x86_64/kernel/ptrace.c +++ b/arch/x86_64/kernel/ptrace.c @@ -274,11 +274,6 @@ static int putreg(struct task_struct *child, return -EIO; value &= 0xffff; break; - case offsetof(struct user_regs_struct, rip): - /* Check if the new RIP address is canonical */ - if (value >= TASK_SIZE_OF(child)) - return -EIO; - break; } put_stack_long(child, regno - sizeof(struct pt_regs), value); return 0; @@ -605,12 +600,12 @@ asmlinkage void syscall_trace_enter(struct pt_regs *regs) if (unlikely(current->audit_context)) { if (test_thread_flag(TIF_IA32)) { - audit_syscall_entry(current, AUDIT_ARCH_I386, + audit_syscall_entry(AUDIT_ARCH_I386, regs->orig_rax, regs->rbx, regs->rcx, regs->rdx, regs->rsi); } else { - audit_syscall_entry(current, AUDIT_ARCH_X86_64, + audit_syscall_entry(AUDIT_ARCH_X86_64, regs->orig_rax, regs->rdi, regs->rsi, regs->rdx, regs->r10); @@ -621,7 +616,7 @@ asmlinkage void syscall_trace_enter(struct pt_regs *regs) asmlinkage void syscall_trace_leave(struct pt_regs *regs) { if (unlikely(current->audit_context)) - audit_syscall_exit(current, AUDITSC_RESULT(regs->rax), regs->rax); + audit_syscall_exit(AUDITSC_RESULT(regs->rax), regs->rax); if ((test_thread_flag(TIF_SYSCALL_TRACE) || test_thread_flag(TIF_SINGLESTEP)) diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index a57eec8311a7..655b9192eeb3 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -353,8 +353,10 @@ static __init void parse_cmdline_early (char ** cmdline_p) if (fullarg(from, "enable_timer_pin_1")) disable_timer_pin_1 = -1; - if (fullarg(from, "nolapic") || fullarg(from, "disableapic")) + if (fullarg(from, "nolapic") || fullarg(from, "disableapic")) { + 
clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); disable_apic = 1; + } if (fullarg(from, "noapic")) skip_ioapic_setup = 1; @@ -540,7 +542,7 @@ void __init alternative_instructions(void) static int __init noreplacement_setup(char *s) { no_replacement = 1; - return 0; + return 1; } __setup("noreplacement", noreplacement_setup); @@ -569,17 +571,28 @@ static inline void copy_edd(void) #endif #define EBDA_ADDR_POINTER 0x40E -static void __init reserve_ebda_region(void) + +unsigned __initdata ebda_addr; +unsigned __initdata ebda_size; + +static void discover_ebda(void) { - unsigned int addr; - /** + /* * there is a real-mode segmented pointer pointing to the * 4K EBDA area at 0x40E */ - addr = *(unsigned short *)phys_to_virt(EBDA_ADDR_POINTER); - addr <<= 4; - if (addr) - reserve_bootmem_generic(addr, PAGE_SIZE); + ebda_addr = *(unsigned short *)EBDA_ADDR_POINTER; + ebda_addr <<= 4; + + ebda_size = *(unsigned short *)(unsigned long)ebda_addr; + + /* Round EBDA up to pages */ + if (ebda_size == 0) + ebda_size = 1; + ebda_size <<= 10; + ebda_size = round_up(ebda_size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE); + if (ebda_size > 64*1024) + ebda_size = 64*1024; } void __init setup_arch(char **cmdline_p) @@ -625,6 +638,8 @@ void __init setup_arch(char **cmdline_p) check_efer(); + discover_ebda(); + init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT)); dmi_scan_machine(); @@ -667,7 +682,8 @@ void __init setup_arch(char **cmdline_p) reserve_bootmem_generic(0, PAGE_SIZE); /* reserve ebda region */ - reserve_ebda_region(); + if (ebda_addr) + reserve_bootmem_generic(ebda_addr, ebda_size); #ifdef CONFIG_SMP /* @@ -928,6 +944,10 @@ static int __init init_amd(struct cpuinfo_x86 *c) if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)) set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability); + /* Enable workaround for FXSAVE leak */ + if (c->x86 >= 6) + set_bit(X86_FEATURE_FXSAVE_LEAK, &c->x86_capability); + r = get_model_name(c); if (!r) { switch (c->x86) { @@ -962,7 +982,6 @@ static void __cpuinit detect_ht(struct cpuinfo_x86 *c) cpuid(1, &eax, &ebx, &ecx, &edx); - c->apicid = phys_pkg_id(0); if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY)) return; @@ -1032,7 +1051,7 @@ static void srat_detect_node(void) for now. */ node = apicid_to_node[hard_smp_processor_id()]; if (node == NUMA_NO_NODE) - node = 0; + node = first_node(node_online_map); numa_set_node(cpu, node); if (acpi_numa > 0) @@ -1171,6 +1190,8 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c) c->x86_capability[2] = cpuid_edx(0x80860001); } + c->apicid = phys_pkg_id(0); + /* * Vendor-specific initialization. 
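discover_ebda() above turns the BIOS's two real-mode words (segment at 0x40E, size in KiB in the EBDA's first word) into a page-rounded reservation. The arithmetic in isolation; PAGE_SIZE and round_up() are redefined here only so the sketch compiles on its own:

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))
#define round_up(x, y) ((((x) + (y) - 1) / (y)) * (y))

unsigned long ebda_reserve_bytes(unsigned long ebda_addr,
				 unsigned long size_in_kb)
{
	unsigned long size = size_in_kb ? size_in_kb : 1; /* 0 => assume 1K */
	size <<= 10;					  /* KiB -> bytes */
	/* include the partial page before ebda_addr, round to pages */
	size = round_up(size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE);
	if (size > 64 * 1024)		/* the EBDA is at most 64K */
		size = 64 * 1024;
	return size;
}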
In this section we * canonicalize the feature flags, meaning if there are @@ -1419,3 +1440,22 @@ struct seq_operations cpuinfo_op = { .show = show_cpuinfo, }; +#ifdef CONFIG_INPUT_PCSPKR +#include <linux/platform_device.h> +static __init int add_pcspkr(void) +{ + struct platform_device *pd; + int ret; + + pd = platform_device_alloc("pcspkr", -1); + if (!pd) + return -ENOMEM; + + ret = platform_device_add(pd); + if (ret) + platform_device_put(pd); + + return ret; +} +device_initcall(add_pcspkr); +#endif diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c index eabdb63fec31..8a691fa6d393 100644 --- a/arch/x86_64/kernel/setup64.c +++ b/arch/x86_64/kernel/setup64.c @@ -55,7 +55,7 @@ int __init nonx_setup(char *str) do_not_nx = 1; __supported_pte_mask &= ~_PAGE_NX; } - return 0; + return 1; } __setup("noexec=", nonx_setup); /* parsed early actually */ @@ -74,7 +74,7 @@ static int __init nonx32_setup(char *str) force_personality32 &= ~READ_IMPLIES_EXEC; else if (!strcmp(str, "off")) force_personality32 |= READ_IMPLIES_EXEC; - return 0; + return 1; } __setup("noexec32=", nonx32_setup); diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index 66e98659d077..71a7222cf9ce 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -68,6 +68,9 @@ u8 phys_proc_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID }; /* core ID of each logical CPU */ u8 cpu_core_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID }; +/* Last level cache ID of each logical CPU */ +u8 cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... NR_CPUS-1] = BAD_APICID}; + /* Bitmask of currently online CPUs */ cpumask_t cpu_online_map __read_mostly; @@ -350,7 +353,7 @@ static void __cpuinit tsc_sync_wait(void) static __init int notscsync_setup(char *s) { notscsync = 1; - return 0; + return 1; } __setup("notscsync", notscsync_setup); @@ -445,6 +448,18 @@ void __cpuinit smp_callin(void) cpu_set(cpuid, cpu_callin_map); } +/* maps the cpu to the sched domain representing multi-core */ +cpumask_t cpu_coregroup_map(int cpu) +{ + struct cpuinfo_x86 *c = cpu_data + cpu; + /* + * For perf, we return last level cache shared map. 
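cpu_coregroup_map() above and the llc_shared_map updates below it group CPUs by last-level-cache ID for the new SCHED_MC scheduling domain. A toy version of the grouping; NR_CPUS and the sample IDs are made up for illustration:

#define NR_CPUS 4

static unsigned char cpu_llc_id[NR_CPUS] = { 0, 0, 1, 1 };
static unsigned long llc_shared_map[NR_CPUS]; /* bit i: shares LLC with i */

void set_llc_sibling_map(int cpu)
{
	for (int i = 0; i < NR_CPUS; i++)
		if (cpu_llc_id[cpu] == cpu_llc_id[i]) {
			llc_shared_map[cpu] |= 1UL << i;
			llc_shared_map[i]   |= 1UL << cpu;
		}
}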
+ * TBD: when power saving sched policy is added, we will return + * cpu_core_map when power saving policy is enabled + */ + return c->llc_shared_map; +} + /* representing cpus for which sibling maps can be computed */ static cpumask_t cpu_sibling_setup_map; @@ -463,12 +478,16 @@ static inline void set_cpu_sibling_map(int cpu) cpu_set(cpu, cpu_sibling_map[i]); cpu_set(i, cpu_core_map[cpu]); cpu_set(cpu, cpu_core_map[i]); + cpu_set(i, c[cpu].llc_shared_map); + cpu_set(cpu, c[i].llc_shared_map); } } } else { cpu_set(cpu, cpu_sibling_map[cpu]); } + cpu_set(cpu, c[cpu].llc_shared_map); + if (current_cpu_data.x86_max_cores == 1) { cpu_core_map[cpu] = cpu_sibling_map[cpu]; c[cpu].booted_cores = 1; @@ -476,6 +495,11 @@ static inline void set_cpu_sibling_map(int cpu) } for_each_cpu_mask(i, cpu_sibling_setup_map) { + if (cpu_llc_id[cpu] != BAD_APICID && + cpu_llc_id[cpu] == cpu_llc_id[i]) { + cpu_set(i, c[cpu].llc_shared_map); + cpu_set(cpu, c[i].llc_shared_map); + } if (phys_proc_id[cpu] == phys_proc_id[i]) { cpu_set(i, cpu_core_map[cpu]); cpu_set(cpu, cpu_core_map[i]); diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index 7f58fa682491..7392570f975d 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -504,42 +504,25 @@ unsigned long long sched_clock(void) static unsigned long get_cmos_time(void) { - unsigned int timeout = 1000000, year, mon, day, hour, min, sec; - unsigned char uip = 0, this = 0; + unsigned int year, mon, day, hour, min, sec; unsigned long flags; unsigned extyear = 0; -/* - * The Linux interpretation of the CMOS clock register contents: When the - * Update-In-Progress (UIP) flag goes from 1 to 0, the RTC registers show the - * second which has precisely just started. Waiting for this can take up to 1 - * second, we timeout approximately after 2.4 seconds on a machine with - * standard 8.3 MHz ISA bus. - */ - spin_lock_irqsave(&rtc_lock, flags); - while (timeout && (!uip || this)) { - uip |= this; - this = CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP; - timeout--; - } - - /* - * Here we are safe to assume the registers won't change for a whole - * second, so we just go ahead and read them. 
- */ - sec = CMOS_READ(RTC_SECONDS); - min = CMOS_READ(RTC_MINUTES); - hour = CMOS_READ(RTC_HOURS); - day = CMOS_READ(RTC_DAY_OF_MONTH); - mon = CMOS_READ(RTC_MONTH); - year = CMOS_READ(RTC_YEAR); - + do { + sec = CMOS_READ(RTC_SECONDS); + min = CMOS_READ(RTC_MINUTES); + hour = CMOS_READ(RTC_HOURS); + day = CMOS_READ(RTC_DAY_OF_MONTH); + mon = CMOS_READ(RTC_MONTH); + year = CMOS_READ(RTC_YEAR); #ifdef CONFIG_ACPI - if (acpi_fadt.revision >= FADT2_REVISION_ID && acpi_fadt.century) - extyear = CMOS_READ(acpi_fadt.century); + if (acpi_fadt.revision >= FADT2_REVISION_ID && + acpi_fadt.century) + extyear = CMOS_READ(acpi_fadt.century); #endif + } while (sec != CMOS_READ(RTC_SECONDS)); spin_unlock_irqrestore(&rtc_lock, flags); @@ -743,7 +726,7 @@ static __init int late_hpet_init(void) unsigned int ntimer; if (!vxtime.hpet_address) - return -1; + return 0; memset(&hd, 0, sizeof (hd)); @@ -934,6 +917,8 @@ void __init time_init(void) vxtime.hpet_address = 0; if (hpet_use_timer) { + /* set tick_nsec to use the proper rate for HPET */ + tick_nsec = TICK_NSEC_HPET; cpu_khz = hpet_calibrate_tsc(); timename = "HPET"; #ifdef CONFIG_X86_PM_TIMER @@ -1323,7 +1308,7 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs) static int __init nohpet_setup(char *s) { nohpet = 1; - return 0; + return 1; } __setup("nohpet", nohpet_setup); @@ -1331,7 +1316,7 @@ __setup("nohpet", nohpet_setup); int __init notsc_setup(char *s) { notsc = 1; - return 0; + return 1; } __setup("notsc", notsc_setup); diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 7b148309c529..cea335e8746c 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -30,6 +30,7 @@ #include <linux/moduleparam.h> #include <linux/nmi.h> #include <linux/kprobes.h> +#include <linux/kexec.h> #include <asm/system.h> #include <asm/uaccess.h> @@ -69,20 +70,20 @@ asmlinkage void alignment_check(void); asmlinkage void machine_check(void); asmlinkage void spurious_interrupt_bug(void); -struct notifier_block *die_chain; -static DEFINE_SPINLOCK(die_notifier_lock); +ATOMIC_NOTIFIER_HEAD(die_chain); int register_die_notifier(struct notifier_block *nb) { - int err = 0; - unsigned long flags; - vmalloc_sync_all(); - spin_lock_irqsave(&die_notifier_lock, flags); - err = notifier_chain_register(&die_chain, nb); - spin_unlock_irqrestore(&die_notifier_lock, flags); - return err; + return atomic_notifier_chain_register(&die_chain, nb); +} +EXPORT_SYMBOL(register_die_notifier); + +int unregister_die_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_unregister(&die_chain, nb); } +EXPORT_SYMBOL(unregister_die_notifier); static inline void conditional_sti(struct pt_regs *regs) { @@ -101,6 +102,8 @@ static inline void preempt_conditional_cli(struct pt_regs *regs) { if (regs->eflags & X86_EFLAGS_IF) local_irq_disable(); + /* Make sure to not schedule here because we could be running + on an exception stack. 
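The rewritten get_cmos_time() above drops the UIP polling in favour of a retry loop: read every register, then re-read the seconds; if it changed, an update raced the reads and the whole set is fetched again. The shape of that loop, with read_rtc() as a hypothetical stand-in for CMOS_READ():

enum { RTC_SECONDS, RTC_MINUTES, RTC_HOURS };

extern unsigned int read_rtc(int reg);	/* hypothetical CMOS_READ() */

void read_rtc_consistent(unsigned int *hour, unsigned int *min,
			 unsigned int *sec)
{
	do {
		*sec  = read_rtc(RTC_SECONDS);
		*min  = read_rtc(RTC_MINUTES);
		*hour = read_rtc(RTC_HOURS);
		/* a rollover between the reads changes RTC_SECONDS, so a
		 * stable re-read means all values came from one second */
	} while (*sec != read_rtc(RTC_SECONDS));
}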
*/ preempt_enable_no_resched(); } @@ -384,6 +387,7 @@ void out_of_line_bug(void) static DEFINE_SPINLOCK(die_lock); static int die_owner = -1; +static unsigned int die_nest_count; unsigned __kprobes long oops_begin(void) { @@ -398,6 +402,7 @@ unsigned __kprobes long oops_begin(void) else spin_lock(&die_lock); } + die_nest_count++; die_owner = cpu; console_verbose(); bust_spinlocks(1); @@ -408,7 +413,13 @@ void __kprobes oops_end(unsigned long flags) { die_owner = -1; bust_spinlocks(0); - spin_unlock_irqrestore(&die_lock, flags); + die_nest_count--; + if (die_nest_count) + /* We still own the lock */ + local_irq_restore(flags); + else + /* Nest count reaches zero, release the lock. */ + spin_unlock_irqrestore(&die_lock, flags); if (panic_on_oops) panic("Oops"); } @@ -433,6 +444,8 @@ void __kprobes __die(const char * str, struct pt_regs * regs, long err) printk(KERN_ALERT "RIP "); printk_address(regs->rip); printk(" RSP <%016lx>\n", regs->rsp); + if (kexec_should_crash(current)) + crash_kexec(regs); } void die(const char * str, struct pt_regs * regs, long err) @@ -455,10 +468,14 @@ void __kprobes die_nmi(char *str, struct pt_regs *regs) */ printk(str, safe_smp_processor_id()); show_registers(regs); + if (kexec_should_crash(current)) + crash_kexec(regs); if (panic_on_timeout || panic_on_oops) panic("nmi watchdog"); printk("console shuts up ...\n"); oops_end(flags); + nmi_exit(); + local_irq_enable(); do_exit(SIGSEGV); } @@ -468,8 +485,6 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, { struct task_struct *tsk = current; - conditional_sti(regs); - tsk->thread.error_code = error_code; tsk->thread.trap_no = trapnr; @@ -506,6 +521,7 @@ asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ == NOTIFY_STOP) \ return; \ + conditional_sti(regs); \ do_trap(trapnr, signr, str, regs, error_code, NULL); \ } @@ -520,6 +536,7 @@ asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ == NOTIFY_STOP) \ return; \ + conditional_sti(regs); \ do_trap(trapnr, signr, str, regs, error_code, &info); \ } @@ -533,7 +550,17 @@ DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) DO_ERROR(18, SIGSEGV, "reserved", reserved) -DO_ERROR(12, SIGBUS, "stack segment", stack_segment) + +/* Runs on IST stack */ +asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code) +{ + if (notify_die(DIE_TRAP, "stack segment", regs, error_code, + 12, SIGBUS) == NOTIFY_STOP) + return; + preempt_conditional_sti(regs); + do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL); + preempt_conditional_cli(regs); +} asmlinkage void do_double_fault(struct pt_regs * regs, long error_code) { @@ -667,8 +694,9 @@ asmlinkage void __kprobes do_int3(struct pt_regs * regs, long error_code) if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) { return; } + preempt_conditional_sti(regs); do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); - return; + preempt_conditional_cli(regs); } /* Help handler running on IST stack to switch back to user stack @@ -973,14 +1001,14 @@ void __init trap_init(void) static int __init oops_dummy(char *s) { panic_on_oops = 1; - return -1; + return 1; } __setup("oops=", oops_dummy); static int __init kstack_setup(char *s) { kstack_depth_to_print = simple_strtoul(s,NULL,0); - 
return 0; + return 1; } __setup("kstack=", kstack_setup); diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S index 39ff0708f803..b81f473c4a19 100644 --- a/arch/x86_64/kernel/vmlinux.lds.S +++ b/arch/x86_64/kernel/vmlinux.lds.S @@ -65,7 +65,7 @@ SECTIONS .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { *(.data.cacheline_aligned) } - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); + . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES); .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { *(.data.read_mostly) } diff --git a/arch/x86_64/kernel/x8664_ksyms.c b/arch/x86_64/kernel/x8664_ksyms.c index d96a9348e5a2..1def21c9f7cd 100644 --- a/arch/x86_64/kernel/x8664_ksyms.c +++ b/arch/x86_64/kernel/x8664_ksyms.c @@ -102,8 +102,6 @@ EXPORT_SYMBOL(cpu_callout_map); EXPORT_SYMBOL(screen_info); #endif -EXPORT_SYMBOL(get_wchan); - EXPORT_SYMBOL(rtc_lock); EXPORT_SYMBOL_GPL(set_nmi_callback); @@ -114,7 +112,6 @@ EXPORT_SYMBOL_GPL(unset_nmi_callback); #undef memcpy #undef memset #undef memmove -#undef strlen extern void * memset(void *,int,__kernel_size_t); extern size_t strlen(const char *); @@ -123,8 +120,6 @@ extern void * memcpy(void *,const void *,__kernel_size_t); extern void * __memcpy(void *,const void *,__kernel_size_t); EXPORT_SYMBOL(memset); -EXPORT_SYMBOL(strlen); -EXPORT_SYMBOL(strpbrk); EXPORT_SYMBOL(memmove); EXPORT_SYMBOL(memcpy); EXPORT_SYMBOL(__memcpy); diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c index 316c53de47bd..55250593d8c9 100644 --- a/arch/x86_64/mm/fault.c +++ b/arch/x86_64/mm/fault.c @@ -623,6 +623,6 @@ void vmalloc_sync_all(void) static int __init enable_pagefaulttrace(char *str) { page_fault_trace = 1; - return 0; + return 1; } __setup("pagefaulttrace", enable_pagefaulttrace); diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c index b04415625442..4ba34e95d835 100644 --- a/arch/x86_64/mm/init.c +++ b/arch/x86_64/mm/init.c @@ -72,7 +72,7 @@ void show_mem(void) show_free_areas(); printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); - for_each_pgdat(pgdat) { + for_each_online_pgdat(pgdat) { for (i = 0; i < pgdat->node_spanned_pages; ++i) { page = pfn_to_page(pgdat->node_start_pfn + i); total++; @@ -305,7 +305,7 @@ static void __meminit phys_pud_init(pud_t *pud, unsigned long address, unsigned if (paddr >= end) break; - if (!after_bootmem && !e820_mapped(paddr, paddr+PUD_SIZE, 0)) { + if (!after_bootmem && !e820_any_mapped(paddr, paddr+PUD_SIZE, 0)) { set_pud(pud, __pud(0)); continue; } @@ -507,9 +507,8 @@ void __init clear_kernel_mapping(unsigned long address, unsigned long size) /* * Memory hotplug specific functions - * These are only for non-NUMA machines right now. */ -#ifdef CONFIG_MEMORY_HOTPLUG +#if defined(CONFIG_ACPI_HOTPLUG_MEMORY) || defined(CONFIG_ACPI_HOTPLUG_MEMORY_MODULE) void online_page(struct page *page) { @@ -520,6 +519,38 @@ void online_page(struct page *page) num_physpages++; } +#ifndef CONFIG_MEMORY_HOTPLUG +/* + * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance, + * just online the pages. 
+ */ +int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages) +{ + int err = -EIO; + unsigned long pfn; + unsigned long total = 0, mem = 0; + for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) { + if (pfn_valid(pfn)) { + online_page(pfn_to_page(pfn)); + err = 0; + mem++; + } + total++; + } + if (!err) { + z->spanned_pages += total; + z->present_pages += mem; + z->zone_pgdat->node_spanned_pages += total; + z->zone_pgdat->node_present_pages += mem; + } + return err; +} +#endif + +/* + * Memory is added always to NORMAL zone. This means you will never get + * additional DMA/DMA32 memory. + */ int add_memory(u64 start, u64 size) { struct pglist_data *pgdat = NODE_DATA(0); diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c index 63c72641b737..b2fac14baac0 100644 --- a/arch/x86_64/mm/numa.c +++ b/arch/x86_64/mm/numa.c @@ -100,11 +100,30 @@ int early_pfn_to_nid(unsigned long pfn) } #endif +static void * __init +early_node_mem(int nodeid, unsigned long start, unsigned long end, + unsigned long size) +{ + unsigned long mem = find_e820_area(start, end, size); + void *ptr; + if (mem != -1L) + return __va(mem); + ptr = __alloc_bootmem_nopanic(size, + SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)); + if (ptr == 0) { + printk(KERN_ERR "Cannot find %lu bytes in node %d\n", + size, nodeid); + return NULL; + } + return ptr; +} + /* Initialize bootmem allocator for a node */ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) { unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start; unsigned long nodedata_phys; + void *bootmap; const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE); start = round_up(start, ZONE_ALIGN); @@ -114,13 +133,11 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en start_pfn = start >> PAGE_SHIFT; end_pfn = end >> PAGE_SHIFT; - nodedata_phys = find_e820_area(start, end, pgdat_size); - if (nodedata_phys == -1L) - panic("Cannot find memory pgdat in node %d\n", nodeid); - - Dprintk("nodedata_phys %lx\n", nodedata_phys); + node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size); + if (node_data[nodeid] == NULL) + return; + nodedata_phys = __pa(node_data[nodeid]); - node_data[nodeid] = phys_to_virt(nodedata_phys); memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid]; NODE_DATA(nodeid)->node_start_pfn = start_pfn; @@ -129,9 +146,15 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en /* Find a place for the bootmem map */ bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE); - bootmap_start = find_e820_area(bootmap_start, end, bootmap_pages<<PAGE_SHIFT); - if (bootmap_start == -1L) - panic("Not enough continuous space for bootmap on node %d", nodeid); + bootmap = early_node_mem(nodeid, bootmap_start, end, + bootmap_pages<<PAGE_SHIFT); + if (bootmap == NULL) { + if (nodedata_phys < start || nodedata_phys >= end) + free_bootmem((unsigned long)node_data[nodeid],pgdat_size); + node_data[nodeid] = NULL; + return; + } + bootmap_start = __pa(bootmap); Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages); bootmap_size = init_bootmem_node(NODE_DATA(nodeid), @@ -142,6 +165,9 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size); reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, 
bootmap_pages<<PAGE_SHIFT); +#ifdef CONFIG_ACPI_NUMA + srat_reserve_add_area(nodeid); +#endif node_set_online(nodeid); } @@ -162,11 +188,13 @@ void __init setup_node_zones(int nodeid) memory. */ memmapsize = sizeof(struct page) * (end_pfn-start_pfn); limit = end_pfn << PAGE_SHIFT; +#ifdef CONFIG_FLAT_NODE_MEM_MAP NODE_DATA(nodeid)->node_mem_map = __alloc_bootmem_core(NODE_DATA(nodeid)->bdata, memmapsize, SMP_CACHE_BYTES, round_down(limit - memmapsize, PAGE_SIZE), limit); +#endif size_zones(zones, holes, start_pfn, end_pfn); free_area_init_node(nodeid, NODE_DATA(nodeid), zones, @@ -335,6 +363,8 @@ __init int numa_setup(char *opt) #ifdef CONFIG_ACPI_NUMA if (!strncmp(opt,"noacpi",6)) acpi_numa = -1; + if (!strncmp(opt,"hotadd=", 7)) + hotadd_percent = simple_strtoul(opt+7, NULL, 10); #endif return 1; } @@ -377,21 +407,6 @@ EXPORT_SYMBOL(node_data); * Should do that. */ -/* Requires pfn_valid(pfn) to be true */ -struct page *pfn_to_page(unsigned long pfn) -{ - int nid = phys_to_nid(((unsigned long)(pfn)) << PAGE_SHIFT); - return (pfn - node_start_pfn(nid)) + NODE_DATA(nid)->node_mem_map; -} -EXPORT_SYMBOL(pfn_to_page); - -unsigned long page_to_pfn(struct page *page) -{ - return (long)(((page) - page_zone(page)->zone_mem_map) + - page_zone(page)->zone_start_pfn); -} -EXPORT_SYMBOL(page_to_pfn); - int pfn_valid(unsigned long pfn) { unsigned nid; diff --git a/arch/x86_64/mm/srat.c b/arch/x86_64/mm/srat.c index 2eb879590dc4..474df22c6ed2 100644 --- a/arch/x86_64/mm/srat.c +++ b/arch/x86_64/mm/srat.c @@ -15,15 +15,29 @@ #include <linux/bitmap.h> #include <linux/module.h> #include <linux/topology.h> +#include <linux/bootmem.h> +#include <linux/mm.h> #include <asm/proto.h> #include <asm/numa.h> #include <asm/e820.h> +#if (defined(CONFIG_ACPI_HOTPLUG_MEMORY) || \ + defined(CONFIG_ACPI_HOTPLUG_MEMORY_MODULE)) \ + && !defined(CONFIG_MEMORY_HOTPLUG) +#define RESERVE_HOTADD 1 +#endif + static struct acpi_table_slit *acpi_slit; static nodemask_t nodes_parsed __initdata; static nodemask_t nodes_found __initdata; static struct bootnode nodes[MAX_NUMNODES] __initdata; +static struct bootnode nodes_add[MAX_NUMNODES] __initdata; +static int found_add_area __initdata; +int hotadd_percent __initdata = 0; +#ifndef RESERVE_HOTADD +#define hotadd_percent 0 /* Ignore all settings */ +#endif static u8 pxm2node[256] = { [0 ... 255] = 0xff }; /* Too small nodes confuse the VM badly. 
 
 /* Too small nodes confuse the VM badly. Usually they result
@@ -71,6 +85,10 @@ static __init int conflicting_nodes(unsigned long start, unsigned long end)
 static __init void cutoff_node(int i, unsigned long start, unsigned long end)
 {
 	struct bootnode *nd = &nodes[i];
+
+	if (found_add_area)
+		return;
+
 	if (nd->start < start) {
 		nd->start = start;
 		if (nd->end < nd->start)
@@ -88,8 +106,11 @@ static __init void bad_srat(void)
 	int i;
 	printk(KERN_ERR "SRAT: SRAT not used.\n");
 	acpi_numa = -1;
+	found_add_area = 0;
 	for (i = 0; i < MAX_LOCAL_APIC; i++)
 		apicid_to_node[i] = NUMA_NO_NODE;
+	for (i = 0; i < MAX_NUMNODES; i++)
+		nodes_add[i].start = nodes_add[i].end = 0;
 }
 
 static __init inline int srat_disabled(void)
@@ -137,7 +158,8 @@ acpi_numa_processor_affinity_init(struct acpi_table_processor_affinity *pa)
 	int pxm, node;
 	if (srat_disabled())
 		return;
-	if (pa->header.length != sizeof(struct acpi_table_processor_affinity)) {	bad_srat();
+	if (pa->header.length != sizeof(struct acpi_table_processor_affinity)) {
+		bad_srat();
 		return;
 	}
 	if (pa->flags.enabled == 0)
@@ -155,11 +177,116 @@ acpi_numa_processor_affinity_init(struct acpi_table_processor_affinity *pa)
 	       pxm, pa->apic_id, node);
 }
 
+#ifdef RESERVE_HOTADD
+/*
+ * Protect against too large hotadd areas that would fill up memory.
+ */
+static int hotadd_enough_memory(struct bootnode *nd)
+{
+	static unsigned long allocated;
+	static unsigned long last_area_end;
+	unsigned long pages = (nd->end - nd->start) >> PAGE_SHIFT;
+	long mem = pages * sizeof(struct page);
+	unsigned long addr;
+	unsigned long allowed;
+	unsigned long oldpages = pages;
+
+	if (mem < 0)
+		return 0;
+	allowed = (end_pfn - e820_hole_size(0, end_pfn)) * PAGE_SIZE;
+	allowed = (allowed / 100) * hotadd_percent;
+	if (allocated + mem > allowed) {
+		unsigned long range;
+		/* Give them at least part of their hotadd memory, up to
+		   hotadd_percent. It would be better to spread the limit out
+		   over multiple hotplug areas, but that is too complicated
+		   right now. */
+		if (allocated >= allowed)
+			return 0;
+		range = allowed - allocated;
+		pages = (range / PAGE_SIZE);
+		mem = pages * sizeof(struct page);
+		nd->end = nd->start + range;
+	}
+	/* Not completely foolproof, but a good sanity check */
+	addr = find_e820_area(last_area_end, end_pfn<<PAGE_SHIFT, mem);
+	if (addr == -1UL)
+		return 0;
+	if (pages != oldpages)
+		printk(KERN_NOTICE "SRAT: Hotadd area limited to %lu bytes\n",
+			pages << PAGE_SHIFT);
+	last_area_end = addr + mem;
+	allocated += mem;
+	return 1;
+}
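hotadd_enough_memory() caps the struct page arrays pre-allocated for hot-add areas at hotadd_percent of usable RAM. A stand-alone rerun of that arithmetic (PAGE_SHIFT, a 56-byte struct page, and the 4 GB/64 GB figures are assumed values for the example; e820 holes are taken as zero):

    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE (1UL << PAGE_SHIFT)
    #define STRUCT_PAGE_SIZE 56UL   /* assumed sizeof(struct page) */

    int main(void)
    {
            unsigned long end_pfn = 4UL << (30 - PAGE_SHIFT); /* 4 GB of RAM */
            int hotadd_percent = 10;

            /* Same shape as hotadd_enough_memory(): cap the memory spent
               on struct page arrays at hotadd_percent of usable RAM
               (holes assumed zero here). */
            unsigned long allowed = (end_pfn * PAGE_SIZE / 100) * hotadd_percent;

            /* mem_map cost of a 64 GB hot-add area */
            unsigned long pages = 64UL << (30 - PAGE_SHIFT);
            unsigned long mem = pages * STRUCT_PAGE_SIZE;

            printf("budget: %lu MB, requested mem_map: %lu MB -> %s\n",
                   allowed >> 20, mem >> 20,
                   mem > allowed ? "clamp area" : "accept area");
            return 0;
    }

With these numbers the budget is about 409 MB while the requested mem_map costs about 896 MB, so the kernel would shrink the hot-add range, exactly the "Hotadd area limited" path above.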
+
+/*
+ * It is fine to add this area to the node's data; it will be used later.
+ * This code supports one contiguous hot-add area per node.
+ */
+static int reserve_hotadd(int node, unsigned long start, unsigned long end)
+{
+	unsigned long s_pfn = start >> PAGE_SHIFT;
+	unsigned long e_pfn = end >> PAGE_SHIFT;
+	int changed = 0;
+	struct bootnode *nd = &nodes_add[node];
+
+	/* I had some trouble with strange memory hotadd regions breaking
+	   the boot. Be very strict here and reject anything unexpected.
+	   If you want working memory hotadd, write correct SRATs.
+
+	   The node size check is a basic sanity check to guard against
+	   mistakes. */
+	if ((signed long)(end - start) < NODE_MIN_SIZE) {
+		printk(KERN_ERR "SRAT: Hotplug area too small\n");
+		return -1;
+	}
+
+	/* This check might be a bit too strict, but I'm keeping it for now. */
+	if (e820_hole_size(s_pfn, e_pfn) != e_pfn - s_pfn) {
+		printk(KERN_ERR "SRAT: Hotplug area has existing memory\n");
+		return -1;
+	}
+
+	if (!hotadd_enough_memory(&nodes_add[node])) {
+		printk(KERN_ERR "SRAT: Hotplug area too large\n");
+		return -1;
+	}
+
+	/* Looks good */
+
+	found_add_area = 1;
+	if (nd->start == nd->end) {
+		nd->start = start;
+		nd->end = end;
+		changed = 1;
+	} else {
+		if (nd->start == end) {
+			nd->start = start;
+			changed = 1;
+		}
+		if (nd->end == start) {
+			nd->end = end;
+			changed = 1;
+		}
+		if (!changed)
+			printk(KERN_ERR "SRAT: Hotplug zone not contiguous. Partly ignored\n");
+	}
+
+	if ((nd->end >> PAGE_SHIFT) > end_pfn)
+		end_pfn = nd->end >> PAGE_SHIFT;
+
+	if (changed)
+		printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end);
+	return 0;
+}
+#endif
+
 /* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
 void __init
 acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
 {
-	struct bootnode *nd;
+	struct bootnode *nd, oldnode;
 	unsigned long start, end;
 	int node, pxm;
 	int i;
@@ -172,6 +299,8 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
 	}
 	if (ma->flags.enabled == 0)
 		return;
+	if (ma->flags.hot_pluggable && hotadd_percent == 0)
+		return;
 	start = ma->base_addr_lo | ((u64)ma->base_addr_hi << 32);
 	end = start + (ma->length_lo | ((u64)ma->length_hi << 32));
 	pxm = ma->proximity_domain;
@@ -181,10 +310,6 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
 		bad_srat();
 		return;
 	}
-	/* It is fine to add this area to the nodes data it will be used later*/
-	if (ma->flags.hot_pluggable == 1)
-		printk(KERN_INFO "SRAT: hot plug zone found %lx - %lx \n",
-				start, end);
 	i = conflicting_nodes(start, end);
 	if (i == node) {
 		printk(KERN_WARNING
@@ -199,6 +324,7 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
 		return;
 	}
 	nd = &nodes[node];
+	oldnode = *nd;
 	if (!node_test_and_set(node, nodes_parsed)) {
 		nd->start = start;
 		nd->end = end;
@@ -208,8 +334,19 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
 		if (nd->end < end)
 			nd->end = end;
 	}
+
 	printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm,
 	       nd->start, nd->end);
+
+#ifdef RESERVE_HOTADD
+	if (ma->flags.hot_pluggable && reserve_hotadd(node, start, end) < 0) {
+		/* Ignore hotadd region. Undo damage */
+		printk(KERN_NOTICE "SRAT: Hotplug region ignored\n");
+		*nd = oldnode;
+		if ((nd->start | nd->end) == 0)
+			node_clear(node, nodes_parsed);
+	}
+#endif
 }
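reserve_hotadd() keeps at most one contiguous hot-add range per node and only grows it when a new SRAT entry is exactly adjacent to the existing one. A user-space model of that merge rule (struct range and merge_range are hypothetical names):

    #include <stdio.h>

    struct range { unsigned long start, end; };

    /* Model of the nodes_add[] merging above: accept a new [start,end)
       only into an empty slot or when it touches the current range;
       returns 1 on success, 0 when it would leave a hole. */
    static int merge_range(struct range *nd, unsigned long start,
                           unsigned long end)
    {
            if (nd->start == nd->end) {     /* empty slot: take it */
                    nd->start = start;
                    nd->end = end;
                    return 1;
            }
            if (nd->start == end) {         /* new range ends where we begin */
                    nd->start = start;
                    return 1;
            }
            if (nd->end == start) {         /* new range begins where we end */
                    nd->end = end;
                    return 1;
            }
            return 0;                       /* disjoint: rejected */
    }

    int main(void)
    {
            struct range nd = { 0, 0 };
            merge_range(&nd, 0x1000, 0x2000);
            merge_range(&nd, 0x2000, 0x3000);       /* adjacent: extends end */
            int ok = merge_range(&nd, 0x9000, 0xa000); /* disjoint: 0 */
            printf("range %#lx-%#lx, disjoint merge ok? %d\n",
                   nd.start, nd.end, ok);
            return 0;
    }

This is also why the caller snapshots oldnode before parsing: a rejected hot-add entry must not leave a half-grown node behind.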
 
 /* Sanity check to catch more bad SRATs (they are amazingly common).
@@ -225,6 +362,9 @@ static int nodes_cover_memory(void)
 		unsigned long e = nodes[i].end >> PAGE_SHIFT;
 		pxmram += e - s;
 		pxmram -= e820_hole_size(s, e);
+		pxmram -= nodes_add[i].end - nodes_add[i].start;
+		if ((long)pxmram < 0)
+			pxmram = 0;
 	}
 
 	e820ram = end_pfn - e820_hole_size(0, end_pfn);
@@ -258,9 +398,11 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
 
 	/* First clean up the node list */
 	for (i = 0; i < MAX_NUMNODES; i++) {
-		cutoff_node(i, start, end);
-		if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE)
+		cutoff_node(i, start, end);
+		if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) {
 			unparse_node(i);
+			node_set_offline(i);
+		}
 	}
 
 	if (acpi_numa <= 0)
@@ -282,6 +424,12 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
 	/* Finally register nodes */
 	for_each_node_mask(i, nodes_parsed)
 		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
+	/* Try again in case setup_node_bootmem missed one due
+	   to missing bootmem */
+	for_each_node_mask(i, nodes_parsed)
+		if (!node_online(i))
+			setup_node_bootmem(i, nodes[i].start, nodes[i].end);
+
 	for (i = 0; i < NR_CPUS; i++) {
 		if (cpu_to_node[i] == NUMA_NO_NODE)
 			continue;
@@ -303,6 +451,25 @@ static int node_to_pxm(int n)
 	return 0;
 }
 
+void __init srat_reserve_add_area(int nodeid)
+{
+	if (found_add_area && nodes_add[nodeid].end) {
+		u64 total_mb;
+
+		printk(KERN_INFO "SRAT: Reserving hot-add memory space "
+				"for node %d at %Lx-%Lx\n",
+			nodeid, nodes_add[nodeid].start, nodes_add[nodeid].end);
+		total_mb = (nodes_add[nodeid].end - nodes_add[nodeid].start)
+					>> PAGE_SHIFT;
+		total_mb *= sizeof(struct page);
+		total_mb >>= 20;
+		printk(KERN_INFO "SRAT: This will cost you %Lu MB of "
+				"pre-allocated memory.\n", (unsigned long long)total_mb);
+		reserve_bootmem_node(NODE_DATA(nodeid), nodes_add[nodeid].start,
+			       nodes_add[nodeid].end - nodes_add[nodeid].start);
+	}
+}
+
 int __node_distance(int a, int b)
 {
 	int index;
diff --git a/arch/x86_64/pci/mmconfig.c b/arch/x86_64/pci/mmconfig.c
index e616500207e4..a2060e4d5de6 100644
--- a/arch/x86_64/pci/mmconfig.c
+++ b/arch/x86_64/pci/mmconfig.c
@@ -9,11 +9,16 @@
 #include <linux/init.h>
 #include <linux/acpi.h>
 #include <linux/bitmap.h>
+#include <asm/e820.h>
+
 #include "pci.h"
 
 #define MMCONFIG_APER_SIZE (256*1024*1024)
+/* Verify the first 16 buses. We assume that systems with more buses
+   get MCFG right. */
+#define MAX_CHECK_BUS 16
 
-static DECLARE_BITMAP(fallback_slots, 32);
+static DECLARE_BITMAP(fallback_slots, 32*MAX_CHECK_BUS);
 
 /* Static virtual mapping of the MMCONFIG aperture */
 struct mmcfg_virt {
@@ -55,7 +60,8 @@ static char __iomem *get_virt(unsigned int seg, unsigned bus)
 static char __iomem *pci_dev_base(unsigned int seg, unsigned int bus, unsigned int devfn)
 {
 	char __iomem *addr;
-	if (seg == 0 && bus == 0 && test_bit(PCI_SLOT(devfn), fallback_slots))
+	if (seg == 0 && bus < MAX_CHECK_BUS &&
+		test_bit(32*bus + PCI_SLOT(devfn), fallback_slots))
 		return NULL;
 	addr = get_virt(seg, bus);
 	if (!addr)
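The widened fallback_slots bitmap above indexes one bit per (bus, slot) pair as 32*bus + slot, covering the first MAX_CHECK_BUS buses instead of only bus 0. A stand-alone model of that indexing (set_fallback/is_fallback are hypothetical stand-ins for the kernel's set_bit/test_bit usage):

    #include <stdio.h>

    #define MAX_CHECK_BUS 16
    #define SLOTS_PER_BUS 32
    #define BITS_PER_LONG (8 * sizeof(unsigned long))

    /* One bit per (bus, slot) pair for the first MAX_CHECK_BUS buses. */
    static unsigned long fallback_slots[MAX_CHECK_BUS * SLOTS_PER_BUS /
                                        (8 * sizeof(unsigned long))];

    static void set_fallback(int bus, int slot)
    {
            int bit = SLOTS_PER_BUS * bus + slot;
            fallback_slots[bit / BITS_PER_LONG] |= 1UL << (bit % BITS_PER_LONG);
    }

    static int is_fallback(int bus, int slot)
    {
            int bit = SLOTS_PER_BUS * bus + slot;
            return !!(fallback_slots[bit / BITS_PER_LONG] &
                      (1UL << (bit % BITS_PER_LONG)));
    }

    int main(void)
    {
            set_fallback(2, 5);     /* device 02:05.0 must use conf1 cycles */
            printf("bus 2 slot 5 -> %d, bus 0 slot 5 -> %d\n",
                   is_fallback(2, 5), is_fallback(0, 5));   /* 1, 0 */
            return 0;
    }

pci_dev_base() consults this bitmap first, so a marked device transparently falls back to legacy type-1 configuration cycles.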
@@ -69,8 +75,10 @@ static int pci_mmcfg_read(unsigned int seg, unsigned int bus,
 	char __iomem *addr;
 
 	/* Why do we have this when nobody checks it. How about a BUG()!? -AK */
-	if (unlikely(!value || (bus > 255) || (devfn > 255) || (reg > 4095)))
+	if (unlikely((bus > 255) || (devfn > 255) || (reg > 4095))) {
+		*value = -1;
 		return -EINVAL;
+	}
 
 	addr = pci_dev_base(seg, bus, devfn);
 	if (!addr)
@@ -129,21 +137,26 @@ static struct pci_raw_ops pci_mmcfg = {
    Normally this can be expressed in the MCFG by not listing them
    and assigning suitable _SEGs, but this isn't implemented in some BIOS.
    Instead try to discover all devices on bus 0 that are unreachable using MM
-   and fallback for them.
-   We only do this for bus 0/seg 0 */
+   and fall back for them. */
 static __init void unreachable_devices(void)
 {
-	int i;
-	for (i = 0; i < 32; i++) {
-		u32 val1;
-		char __iomem *addr;
-
-		pci_conf1_read(0, 0, PCI_DEVFN(i,0), 0, 4, &val1);
-		if (val1 == 0xffffffff)
-			continue;
-		addr = pci_dev_base(0, 0, PCI_DEVFN(i, 0));
-		if (addr == NULL|| readl(addr) != val1) {
-			set_bit(i, fallback_slots);
+	int i, k;
+	/* Use the max bus number from ACPI here? */
+	for (k = 0; k < MAX_CHECK_BUS; k++) {
+		for (i = 0; i < 32; i++) {
+			u32 val1;
+			char __iomem *addr;
+
+			pci_conf1_read(0, k, PCI_DEVFN(i,0), 0, 4, &val1);
+			if (val1 == 0xffffffff)
+				continue;
+			addr = pci_dev_base(0, k, PCI_DEVFN(i, 0));
+			if (addr == NULL || readl(addr) != val1) {
+				set_bit(i + 32*k, fallback_slots);
+				printk(KERN_NOTICE
+					"PCI: No mmconfig possible on device %x:%x\n",
+					k, i);
+			}
 		}
 	}
 }
@@ -161,6 +174,14 @@ void __init pci_mmcfg_init(void)
 	    (pci_mmcfg_config[0].base_address == 0))
 		return;
 
+	if (!e820_all_mapped(pci_mmcfg_config[0].base_address,
+			pci_mmcfg_config[0].base_address + MMCONFIG_APER_SIZE,
+			E820_RESERVED)) {
+		printk(KERN_ERR "PCI: BIOS Bug: MCFG area is not E820-reserved\n");
+		printk(KERN_ERR "PCI: Not using MMCONFIG.\n");
+		return;
+	}
+
 	/* RED-PEN i386 doesn't do _nocache right now */
 	pci_mmcfg_virt = kmalloc(sizeof(*pci_mmcfg_virt) * pci_mmcfg_config_num, GFP_KERNEL);
 	if (pci_mmcfg_virt == NULL) {