From 3f4a0b917ce72ef47e438d354c433eb645218e87 Mon Sep 17 00:00:00 2001 From: john stultz Date: Tue, 17 Oct 2006 00:09:32 -0700 Subject: [PATCH] i386 Time: Avoid PIT SMP lockups Avoid possible PIT livelock issues seen on SMP systems (and reported by Andi), by not allowing it as a clocksource on SMP boxes. However, since the PIT may no longer be present, we have to properly handle the cases where SMP systems have TSC skew and fall back from the TSC. Since the PIT isn't there, it would "fall back" to the TSC again. So this changes the jiffies rating to 1, and the TSC-bad rating value to 0. Thus you will get the following behavior priority on i386 systems: tsc [if present & stable] hpet [if present] cyclone [if present] acpi_pm [if present] pit [if UP] jiffies Rather then the current more complicated: tsc [if present & stable] hpet [if present] cyclone [if present] acpi_pm [if present] pit [if cpus < 4] tsc [if present & unstable] jiffies Signed-off-by: John Stultz Cc: Andi Kleen Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/i8253.c | 2 +- arch/i386/kernel/tsc.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/i8253.c b/arch/i386/kernel/i8253.c index 477b24daff53..9a0060b92e32 100644 --- a/arch/i386/kernel/i8253.c +++ b/arch/i386/kernel/i8253.c @@ -109,7 +109,7 @@ static struct clocksource clocksource_pit = { static int __init init_pit_clocksource(void) { - if (num_possible_cpus() > 4) /* PIT does not scale! */ + if (num_possible_cpus() > 1) /* PIT does not scale! */ return 0; clocksource_pit.mult = clocksource_hz2mult(CLOCK_TICK_RATE, 20); diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c index b8fa0a8b2e47..fbc95828cd74 100644 --- a/arch/i386/kernel/tsc.c +++ b/arch/i386/kernel/tsc.c @@ -349,8 +349,8 @@ static int tsc_update_callback(void) int change = 0; /* check to see if we should switch to the safe clocksource: */ - if (clocksource_tsc.rating != 50 && check_tsc_unstable()) { - clocksource_tsc.rating = 50; + if (clocksource_tsc.rating != 0 && check_tsc_unstable()) { + clocksource_tsc.rating = 0; clocksource_reselect(); change = 1; } @@ -461,7 +461,7 @@ static int __init init_tsc_clocksource(void) clocksource_tsc.shift); /* lower the rating if we already know its unstable: */ if (check_tsc_unstable()) - clocksource_tsc.rating = 50; + clocksource_tsc.rating = 0; init_timer(&verify_tsc_freq_timer); verify_tsc_freq_timer.function = verify_tsc_freq; -- cgit v1.2.1 From a460e745e8f9c75a0525ff94154a0629f9d3e05d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 17 Oct 2006 00:10:03 -0700 Subject: [PATCH] genirq: clean up irq-flow-type naming Introduce desc->name and eliminate the handle_irq_name() hack. Add set_irq_chip_and_handler_name() to set the flow type and name at once. Signed-off-by: Ingo Molnar Acked-by: Thomas Gleixner Cc: "Eric W. Biederman" Cc: Matthew Wilcox Cc: Kyle McMartin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/i8259.c | 7 ++++--- arch/i386/kernel/io_apic.c | 17 ++++++++++------- arch/i386/kernel/irq.c | 2 +- 3 files changed, 15 insertions(+), 11 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/i8259.c b/arch/i386/kernel/i8259.c index d53eafb6daa7..62996cd17084 100644 --- a/arch/i386/kernel/i8259.c +++ b/arch/i386/kernel/i8259.c @@ -113,7 +113,8 @@ void make_8259A_irq(unsigned int irq) { disable_irq_nosync(irq); io_apic_irqs &= ~(1<name); - seq_printf(p, "-%s", handle_irq_name(irq_desc[i].handle_irq)); + seq_printf(p, "-%-8s", irq_desc[i].name); seq_printf(p, " %s", action->name); for (action=action->next; action; action = action->next) -- cgit v1.2.1 From 3864c4894a7f4c03d69a90082a5bb0ab10e437ab Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 17 Oct 2006 00:10:26 -0700 Subject: [PATCH] lockdep: annotate i386 apm Lockdep doesn't like to enable interrupts when they are enabled already. BUG: warning at kernel/lockdep.c:1814/trace_hardirqs_on() (Not tainted) [] show_trace_log_lvl+0x58/0x16a [] show_trace+0xd/0x10 [] dump_stack+0x19/0x1b [] trace_hardirqs_on+0xa2/0x11e [] apm_bios_call_simple+0xcd/0xfd [] apm+0x92/0x5b1 [] kernel_thread_helper+0x5/0xb DWARF2 unwinder stuck at kernel_thread_helper+0x5/0xb Leftover inexact backtrace: [] show_trace+0xd/0x10 [] dump_stack+0x19/0x1b [] trace_hardirqs_on+0xa2/0x11e [] apm_bios_call_simple+0xcd/0xfd [] apm+0x92/0x5b1 [] kernel_thread_helper+0x5/0xb Signed-off-by: Peter Zijlstra Cc: Stephen Rothwell Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/apm.c | 37 +++++++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 10 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c index b42f2d914af3..2af65858d322 100644 --- a/arch/i386/kernel/apm.c +++ b/arch/i386/kernel/apm.c @@ -540,11 +540,30 @@ static inline void apm_restore_cpus(cpumask_t mask) * Also, we KNOW that for the non error case of apm_bios_call, there * is no useful data returned in the low order 8 bits of eax. */ -#define APM_DO_CLI \ - if (apm_info.allow_ints) \ - local_irq_enable(); \ - else \ + +static inline unsigned long __apm_irq_save(void) +{ + unsigned long flags; + local_save_flags(flags); + if (apm_info.allow_ints) { + if (irqs_disabled_flags(flags)) + local_irq_enable(); + } else + local_irq_disable(); + + return flags; +} + +#define apm_irq_save(flags) \ + do { flags = __apm_irq_save(); } while (0) + +static inline void apm_irq_restore(unsigned long flags) +{ + if (irqs_disabled_flags(flags)) local_irq_disable(); + else if (irqs_disabled()) + local_irq_enable(); +} #ifdef APM_ZERO_SEGS # define APM_DECL_SEGS \ @@ -596,12 +615,11 @@ static u8 apm_bios_call(u32 func, u32 ebx_in, u32 ecx_in, save_desc_40 = gdt[0x40 / 8]; gdt[0x40 / 8] = bad_bios_desc; - local_save_flags(flags); - APM_DO_CLI; + apm_irq_save(flags); APM_DO_SAVE_SEGS; apm_bios_call_asm(func, ebx_in, ecx_in, eax, ebx, ecx, edx, esi); APM_DO_RESTORE_SEGS; - local_irq_restore(flags); + apm_irq_restore(flags); gdt[0x40 / 8] = save_desc_40; put_cpu(); apm_restore_cpus(cpus); @@ -640,12 +658,11 @@ static u8 apm_bios_call_simple(u32 func, u32 ebx_in, u32 ecx_in, u32 *eax) save_desc_40 = gdt[0x40 / 8]; gdt[0x40 / 8] = bad_bios_desc; - local_save_flags(flags); - APM_DO_CLI; + apm_irq_save(flags); APM_DO_SAVE_SEGS; error = apm_bios_call_simple_asm(func, ebx_in, ecx_in, eax); APM_DO_RESTORE_SEGS; - local_irq_restore(flags); + apm_irq_restore(flags); gdt[0x40 / 8] = save_desc_40; put_cpu(); apm_restore_cpus(cpus); -- cgit v1.2.1 From b5e4efe7e061ff52ac97b9fa45acca529d8daeea Mon Sep 17 00:00:00 2001 From: "eiichiro.oiwa.nm@hitachi.com" Date: Thu, 28 Sep 2006 13:55:47 +0900 Subject: PCI: Turn pci_fixup_video into generic for embedded VGA pci_fixup_video turns into generic code because there are many platforms need this fixup for embedded VGA as well as x86. The Video BIOS integrates into System BIOS on a machine has embedded VGA although embedded VGA generally don't have PCI ROM. As a result, embedded VGA need the way that the sysfs rom points to the Video BIOS of System RAM (0xC0000). PCI-to-PCI Bridge Architecture specification describes the condition whether or not PCI ROM forwards VGA compatible memory address. fixup_video suits this specification. Although the Video ROM generally implements in x86 code regardless of platform, some application such as X Window System can run this code by dosemu86. Therefore, pci_fixup_video should turn into generic code. Signed-off-by: Eiichiro Oiwa Acked-by: Alan Cox Acked-by: Jesse Barnes Signed-off-by: Greg Kroah-Hartman --- arch/i386/pci/fixup.c | 45 --------------------------------------------- 1 file changed, 45 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/pci/fixup.c b/arch/i386/pci/fixup.c index b60d7e8689ed..908b410f4c93 100644 --- a/arch/i386/pci/fixup.c +++ b/arch/i386/pci/fixup.c @@ -342,51 +342,6 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PB1, pcie_r DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PC, pcie_rootport_aspm_quirk ); DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PC1, pcie_rootport_aspm_quirk ); -/* - * Fixup to mark boot BIOS video selected by BIOS before it changes - * - * From information provided by "Jon Smirl" - * - * The standard boot ROM sequence for an x86 machine uses the BIOS - * to select an initial video card for boot display. This boot video - * card will have it's BIOS copied to C0000 in system RAM. - * IORESOURCE_ROM_SHADOW is used to associate the boot video - * card with this copy. On laptops this copy has to be used since - * the main ROM may be compressed or combined with another image. - * See pci_map_rom() for use of this flag. IORESOURCE_ROM_SHADOW - * is marked here since the boot video device will be the only enabled - * video device at this point. - */ - -static void __devinit pci_fixup_video(struct pci_dev *pdev) -{ - struct pci_dev *bridge; - struct pci_bus *bus; - u16 config; - - if ((pdev->class >> 8) != PCI_CLASS_DISPLAY_VGA) - return; - - /* Is VGA routed to us? */ - bus = pdev->bus; - while (bus) { - bridge = bus->self; - if (bridge) { - pci_read_config_word(bridge, PCI_BRIDGE_CONTROL, - &config); - if (!(config & PCI_BRIDGE_CTL_VGA)) - return; - } - bus = bus->parent; - } - pci_read_config_word(pdev, PCI_COMMAND, &config); - if (config & (PCI_COMMAND_IO | PCI_COMMAND_MEMORY)) { - pdev->resource[PCI_ROM_RESOURCE].flags |= IORESOURCE_ROM_SHADOW; - printk(KERN_DEBUG "Boot video device is %s\n", pci_name(pdev)); - } -} -DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, pci_fixup_video); - /* * Some Toshiba laptops need extra code to enable their TI TSB43AB22/A. * -- cgit v1.2.1 From 6b4b78fed47e7380dfe9280b154e8b9bfcd4c86c Mon Sep 17 00:00:00 2001 From: Matt Domsch Date: Fri, 29 Sep 2006 15:23:23 -0500 Subject: PCI: optionally sort device lists breadth-first Problem: New Dell PowerEdge servers have 2 embedded ethernet ports, which are labeled NIC1 and NIC2 on the chassis, in the BIOS setup screens, and in the printed documentation. Assuming no other add-in ethernet ports in the system, Linux 2.4 kernels name these eth0 and eth1 respectively. Many people have come to expect this naming. Linux 2.6 kernels name these eth1 and eth0 respectively (backwards from expectations). I also have reports that various Sun and HP servers have similar behavior. Root cause: Linux 2.4 kernels walk the pci_devices list, which happens to be sorted in breadth-first order (or pcbios_find_device order on i386, which most often is breadth-first also). 2.6 kernels have both the pci_devices list and the pci_bus_type.klist_devices list, the latter is what is walked at driver load time to match the pci_id tables; this klist happens to be in depth-first order. On systems where, for physical routing reasons, NIC1 appears on a lower bus number than NIC2, but NIC2's bridge is discovered first in the depth-first ordering, NIC2 will be discovered before NIC1. If the list were sorted breadth-first, NIC1 would be discovered before NIC2. A PowerEdge 1955 system has the following topology which easily exhibits the difference between depth-first and breadth-first device lists. -[0000:00]-+-00.0 Intel Corporation 5000P Chipset Memory Controller Hub +-02.0-[0000:03-08]--+-00.0-[0000:04-07]--+-00.0-[0000:05-06]----00.0-[0000:06]----00.0 Broadcom Corporation NetXtreme II BCM5708S Gigabit Ethernet (labeled NIC2, 2.4 kernel name eth1, 2.6 kernel name eth0) +-1c.0-[0000:01-02]----00.0-[0000:02]----00.0 Broadcom Corporation NetXtreme II BCM5708S Gigabit Ethernet (labeled NIC1, 2.4 kernel name eth0, 2.6 kernel name eth1) Other factors, such as device driver load order and the presence of PCI slots at various points in the bus hierarchy further complicate this problem; I'm not trying to solve those here, just restore the device order, and thus basic behavior, that 2.4 kernels had. Solution: The solution can come in multiple steps. Suggested fix #1: kernel Patch below optionally sorts the two device lists into breadth-first ordering to maintain compatibility with 2.4 kernels. It adds two new command line options: pci=bfsort pci=nobfsort to force the sort order, or not, as you wish. It also adds DMI checks for the specific Dell systems which exhibit "backwards" ordering, to make them "right". Suggested fix #2: udev rules from userland Many people also have the expectation that embedded NICs are always discovered before add-in NICs (which this patch does not try to do). Using the PCI IRQ Routing Table provided by system BIOS, it's easy to determine which PCI devices are embedded, or if add-in, which PCI slot they're in. I'm working on a tool that would allow udev to name ethernet devices in ascending embedded, slot 1 .. slot N order, subsort by PCI bus/dev/fn breadth-first. It'll be possible to use it independent of udev as well for those distributions that don't use udev in their installers. Suggested fix #3: system board routing rules One can constrain the system board layout to put NIC1 ahead of NIC2 regardless of breadth-first or depth-first discovery order. This adds a significant level of complexity to board routing, and may not be possible in all instances (witness the above systems from several major manufacturers). I don't want to encourage this particular train of thought too far, at the expense of not doing #1 or #2 above. Feedback appreciated. Patch tested on a Dell PowerEdge 1955 blade with 2.6.18. You'll also note I took some liberty and temporarily break the klist abstraction to simplify and speed up the sort algorithm. I think that's both safe and appropriate in this instance. Signed-off-by: Matt Domsch Signed-off-by: Greg Kroah-Hartman --- arch/i386/pci/common.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++-- arch/i386/pci/pci.h | 7 ++++++ 2 files changed, 64 insertions(+), 2 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/pci/common.c b/arch/i386/pci/common.c index 68bce194e688..6d5ace845e44 100644 --- a/arch/i386/pci/common.c +++ b/arch/i386/pci/common.c @@ -20,6 +20,7 @@ unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2 | PCI_PROBE_MMCONF; +int pci_bf_sort; int pci_routeirq; int pcibios_last_bus = -1; unsigned long pirq_table_addr; @@ -117,6 +118,20 @@ void __devinit pcibios_fixup_bus(struct pci_bus *b) pci_read_bridge_bases(b); } +/* + * Only use DMI information to set this if nothing was passed + * on the kernel command line (which was parsed earlier). + */ + +static int __devinit set_bf_sort(struct dmi_system_id *d) +{ + if (pci_bf_sort == pci_bf_sort_default) { + pci_bf_sort = pci_dmi_bf; + printk(KERN_INFO "PCI: %s detected, enabling pci=bfsort.\n", d->ident); + } + return 0; +} + /* * Enable renumbering of PCI bus# ranges to reach all PCI busses (Cardbus) */ @@ -130,11 +145,11 @@ static int __devinit assign_all_busses(struct dmi_system_id *d) } #endif +static struct dmi_system_id __devinitdata pciprobe_dmi_table[] = { +#ifdef __i386__ /* * Laptops which need pci=assign-busses to see Cardbus cards */ -static struct dmi_system_id __devinitdata pciprobe_dmi_table[] = { -#ifdef __i386__ { .callback = assign_all_busses, .ident = "Samsung X20 Laptop", @@ -144,6 +159,38 @@ static struct dmi_system_id __devinitdata pciprobe_dmi_table[] = { }, }, #endif /* __i386__ */ + { + .callback = set_bf_sort, + .ident = "Dell PowerEdge 1950", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell"), + DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 1950"), + }, + }, + { + .callback = set_bf_sort, + .ident = "Dell PowerEdge 1955", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell"), + DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 1955"), + }, + }, + { + .callback = set_bf_sort, + .ident = "Dell PowerEdge 2900", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell"), + DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2900"), + }, + }, + { + .callback = set_bf_sort, + .ident = "Dell PowerEdge 2950", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell"), + DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2950"), + }, + }, {} }; @@ -189,6 +236,8 @@ static int __init pcibios_init(void) pcibios_resource_survey(); + if (pci_bf_sort >= pci_force_bf) + pci_sort_breadthfirst(); #ifdef CONFIG_PCI_BIOS if ((pci_probe & PCI_BIOS_SORT) && !(pci_probe & PCI_NO_SORT)) pcibios_sort(); @@ -203,6 +252,12 @@ char * __devinit pcibios_setup(char *str) if (!strcmp(str, "off")) { pci_probe = 0; return NULL; + } else if (!strcmp(str, "bfsort")) { + pci_bf_sort = pci_force_bf; + return NULL; + } else if (!strcmp(str, "nobfsort")) { + pci_bf_sort = pci_force_nobf; + return NULL; } #ifdef CONFIG_PCI_BIOS else if (!strcmp(str, "bios")) { diff --git a/arch/i386/pci/pci.h b/arch/i386/pci/pci.h index 1814f74569c6..ad065cebd7b9 100644 --- a/arch/i386/pci/pci.h +++ b/arch/i386/pci/pci.h @@ -30,6 +30,13 @@ extern unsigned int pci_probe; extern unsigned long pirq_table_addr; +enum pci_bf_sort_state { + pci_bf_sort_default, + pci_force_nobf, + pci_force_bf, + pci_dmi_bf, +}; + /* pci-i386.c */ extern unsigned int pcibios_max_latency; -- cgit v1.2.1 From 3fcfab16c5b86eaa3db3a9a31adba550c5b67141 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 19 Oct 2006 23:28:16 -0700 Subject: [PATCH] separate bdi congestion functions from queue congestion functions Separate out the concept of "queue congestion" from "backing-dev congestion". Congestion is a backing-dev concept, not a queue concept. The blk_* congestion functions are retained, as wrappers around the core backing-dev congestion functions. This proper layering is needed so that NFS can cleanly use the congestion functions, and so that CONFIG_BLOCK=n actually links. Cc: "Thomas Maier" Cc: "Jens Axboe" Cc: Trond Myklebust Cc: David Howells Cc: Peter Osterlund Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/lib/usercopy.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/i386') diff --git a/arch/i386/lib/usercopy.c b/arch/i386/lib/usercopy.c index 258df6b4d7d7..d22cfc9d656c 100644 --- a/arch/i386/lib/usercopy.c +++ b/arch/i386/lib/usercopy.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -741,7 +742,7 @@ survive: if (retval == -ENOMEM && is_init(current)) { up_read(¤t->mm->mmap_sem); - blk_congestion_wait(WRITE, HZ/50); + congestion_wait(WRITE, HZ/50); goto survive; } -- cgit v1.2.1 From e51959faa61278c762389802faf8ba1a40676628 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Thu, 19 Oct 2006 23:29:04 -0700 Subject: [PATCH] Fix potential interrupts during alternative patching Interrupts must be disabled during alternative instruction patching. On systems with high timer IRQ rates, or when running in an emulator, timing differences can result in random kernel panics because of running partially patched instructions. This doesn't yet fix NMIs, which requires extricating the patch code from the late bug checking and is logically separate (and also less likely to cause problems). Signed-off-by: Zachary Amsden Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/alternative.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/i386') diff --git a/arch/i386/kernel/alternative.c b/arch/i386/kernel/alternative.c index 28ab80649764..583c238e17fb 100644 --- a/arch/i386/kernel/alternative.c +++ b/arch/i386/kernel/alternative.c @@ -344,6 +344,7 @@ void alternatives_smp_switch(int smp) void __init alternative_instructions(void) { + unsigned long flags; if (no_replacement) { printk(KERN_INFO "(SMP-)alternatives turned off\n"); free_init_pages("SMP alternatives", @@ -351,6 +352,8 @@ void __init alternative_instructions(void) (unsigned long)__smp_alt_end); return; } + + local_irq_save(flags); apply_alternatives(__alt_instructions, __alt_instructions_end); /* switch to patch-once-at-boottime-only mode and free the @@ -386,4 +389,5 @@ void __init alternative_instructions(void) alternatives_smp_switch(0); } #endif + local_irq_restore(flags); } -- cgit v1.2.1 From 13892de19eb9007ea47430c701bdbf69df71d883 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 21 Oct 2006 18:37:01 +0200 Subject: [PATCH] i386: Update defconfig Signed-off-by: Andi Kleen --- arch/i386/defconfig | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/defconfig b/arch/i386/defconfig index 60c0c02574f0..97aacd6bd7d8 100644 --- a/arch/i386/defconfig +++ b/arch/i386/defconfig @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.19-rc1 -# Thu Oct 5 13:04:53 2006 +# Linux kernel version: 2.6.19-rc2-git4 +# Sat Oct 21 03:38:56 2006 # CONFIG_X86_32=y CONFIG_GENERIC_TIME=y @@ -380,8 +380,8 @@ CONFIG_INET6_XFRM_MODE_TRANSPORT=y CONFIG_INET6_XFRM_MODE_TUNNEL=y # CONFIG_INET6_XFRM_MODE_BEET is not set # CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION is not set +CONFIG_IPV6_SIT=y # CONFIG_IPV6_TUNNEL is not set -# CONFIG_IPV6_SUBTREES is not set # CONFIG_IPV6_MULTIPLE_TABLES is not set # CONFIG_NETWORK_SECMARK is not set # CONFIG_NETFILTER is not set @@ -482,6 +482,13 @@ CONFIG_BLK_DEV_INITRD=y # CONFIG_CDROM_PKTCDVD is not set # CONFIG_ATA_OVER_ETH is not set +# +# Misc devices +# +# CONFIG_IBM_ASM is not set +# CONFIG_SGI_IOC4 is not set +# CONFIG_TIFM_CORE is not set + # # ATA/ATAPI/MFM/RLL support # @@ -1024,6 +1031,7 @@ CONFIG_HANGCHECK_TIMER=y # # Dallas's 1-wire bus # +# CONFIG_W1 is not set # # Hardware Monitoring support @@ -1031,12 +1039,6 @@ CONFIG_HANGCHECK_TIMER=y # CONFIG_HWMON is not set # CONFIG_HWMON_VID is not set -# -# Misc devices -# -# CONFIG_IBM_ASM is not set -# CONFIG_TIFM_CORE is not set - # # Multimedia devices # @@ -1169,7 +1171,6 @@ CONFIG_USB_HIDINPUT=y # CONFIG_USB_ATI_REMOTE2 is not set # CONFIG_USB_KEYSPAN_REMOTE is not set # CONFIG_USB_APPLETOUCH is not set -# CONFIG_USB_TRANCEVIBRATOR is not set # # USB Imaging devices @@ -1215,6 +1216,7 @@ CONFIG_USB_MON=y # CONFIG_USB_APPLEDISPLAY is not set # CONFIG_USB_SISUSBVGA is not set # CONFIG_USB_LD is not set +# CONFIG_USB_TRANCEVIBRATOR is not set # CONFIG_USB_TEST is not set # @@ -1284,6 +1286,7 @@ CONFIG_EXT3_FS=y CONFIG_EXT3_FS_XATTR=y CONFIG_EXT3_FS_POSIX_ACL=y # CONFIG_EXT3_FS_SECURITY is not set +# CONFIG_EXT4DEV_FS is not set CONFIG_JBD=y # CONFIG_JBD_DEBUG is not set CONFIG_FS_MBCACHE=y @@ -1307,6 +1310,7 @@ CONFIG_DNOTIFY=y # CONFIG_AUTOFS_FS is not set CONFIG_AUTOFS4_FS=y # CONFIG_FUSE_FS is not set +CONFIG_GENERIC_ACL=y # # CD-ROM/DVD Filesystems @@ -1384,7 +1388,6 @@ CONFIG_SUNRPC=y # CONFIG_CODA_FS is not set # CONFIG_AFS_FS is not set # CONFIG_9P_FS is not set -CONFIG_GENERIC_ACL=y # # Partition Types @@ -1436,10 +1439,6 @@ CONFIG_NLS_ISO8859_15=y # CONFIG_NLS_KOI8_U is not set CONFIG_NLS_UTF8=y -# -# Distributed Lock Manager -# - # # Instrumentation Support # @@ -1480,6 +1479,7 @@ CONFIG_DEBUG_BUGVERBOSE=y CONFIG_UNWIND_INFO=y CONFIG_STACK_UNWIND=y # CONFIG_FORCED_INLINING is not set +# CONFIG_HEADERS_CHECK is not set # CONFIG_RCU_TORTURE_TEST is not set # CONFIG_LKDTM is not set CONFIG_EARLY_PRINTK=y -- cgit v1.2.1 From da8604cc2d6cd4cbde873c7ba308365e402ac7bf Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 21 Oct 2006 18:37:01 +0200 Subject: [PATCH] i386: fix .cfi_signal_frame copy-n-paste error This was copied, pasted but not edited. Cc: Andi Kleen Cc: Jan Beulich Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- arch/i386/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/Makefile b/arch/i386/Makefile index 7cc0b189b82b..2d9d756fdd74 100644 --- a/arch/i386/Makefile +++ b/arch/i386/Makefile @@ -51,8 +51,8 @@ cflags-y += $(call as-instr,.cfi_startproc\n.cfi_endproc,-DCONFIG_AS_CFI=1,) AFLAGS += $(call as-instr,.cfi_startproc\n.cfi_endproc,-DCONFIG_AS_CFI=1,) # is .cfi_signal_frame supported too? -cflags-y += $(call as-instr,.cfi_startproc\n.cfi_endproc,-DCONFIG_AS_CFI=1,) -AFLAGS += $(call as-instr,.cfi_startproc\n.cfi_endproc,-DCONFIG_AS_CFI=1,) +cflags-y += $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1,) +AFLAGS += $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1,) CFLAGS += $(cflags-y) -- cgit v1.2.1 From cdfce1f5714fec7b24715f569b2fee1607350a6d Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 21 Oct 2006 18:37:01 +0200 Subject: [PATCH] x86: Use -maccumulate-outgoing-args This avoids some problems with gcc 4.x and earlier generating invalid unwind information. In 4.1 the option is default when unwind information is enabled. And it seems to generate smaller code too, so it's probably a good thing on its own. With gcc 4.0: i386: 4683198 902112 480868 6066178 5c9002 vmlinux (before) 4449895 902112 480868 5832875 5900ab vmlinux (after) x86-64: 4939761 1449584 648216 7037561 6b6279 vmlinux (before) 4854193 1449584 648216 6951993 6a1439 vmlinux (after) On 4.1 it shouldn't make much difference because it is default when unwind is enabled anyways. Suggested by Michael Matz and Jan Beulich Cc: jbeulich@novell.com Signed-off-by: Andi Kleen --- arch/i386/Makefile | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/i386') diff --git a/arch/i386/Makefile b/arch/i386/Makefile index 2d9d756fdd74..0677908dfa06 100644 --- a/arch/i386/Makefile +++ b/arch/i386/Makefile @@ -42,6 +42,10 @@ cflags-$(CONFIG_REGPARM) += -mregparm=3 # temporary until string.h is fixed cflags-y += -ffreestanding +# this works around some issues with generating unwind tables in older gccs +# newer gccs do it by default +cflags-y += -maccumulate-outgoing-args + # Disable unit-at-a-time mode on pre-gcc-4.0 compilers, it makes gcc use # a lot more stack due to the lack of sharing of stacklots: CFLAGS += $(shell if [ $(call cc-version) -lt 0400 ] ; then echo $(call cc-option,-fno-unit-at-a-time); fi ;) -- cgit v1.2.1 From 26fd5e084e470dbe8edc6f726fc918e89b9f988c Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Sat, 21 Oct 2006 18:37:02 +0200 Subject: [PATCH] i386: Fix fake return address The fake return address was being set to __KERNEL_PDA, rather than 0. Push it earlier while %eax still equals 0. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Cc: Andi Kleen Cc: Andrew Morton --- arch/i386/kernel/head.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S index be9d883c62ce..ca31f18d277c 100644 --- a/arch/i386/kernel/head.S +++ b/arch/i386/kernel/head.S @@ -317,7 +317,7 @@ is386: movl $2,%ecx # set MP movl %eax,%gs lldt %ax cld # gcc2 wants the direction flag cleared at all times - pushl %eax # fake return address + pushl $0 # fake return address for unwinder #ifdef CONFIG_SMP movb ready, %cl movb $1, ready -- cgit v1.2.1 From a1bae67243512ca30ceda48e3e24e25b543f8ab7 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 21 Oct 2006 18:37:02 +0200 Subject: [PATCH] i386: Disable nmi watchdog on all ThinkPads Even newer Thinkpads have bugs in SMM code that causes hangs with NMI watchdog. Signed-off-by: Andi Kleen --- arch/i386/kernel/nmi.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c index 3e8e3adb0489..eaafe233a5da 100644 --- a/arch/i386/kernel/nmi.c +++ b/arch/i386/kernel/nmi.c @@ -219,11 +219,11 @@ static int __init check_nmi_watchdog(void) int cpu; /* Enable NMI watchdog for newer systems. - Actually it should be safe for most systems before 2004 too except - for some IBM systems that corrupt registers when NMI happens - during SMM. Unfortunately we don't have more exact information - on these and use this coarse check. */ - if (nmi_watchdog == NMI_DEFAULT && dmi_get_year(DMI_BIOS_DATE) >= 2004) + Probably safe on most older systems too, but let's be careful. + IBM ThinkPads use INT10 inside SMM and that allows early NMI inside SMM + which hangs the system. Disable watchdog for all thinkpads */ + if (nmi_watchdog == NMI_DEFAULT && dmi_get_year(DMI_BIOS_DATE) >= 2004 && + !dmi_name_in_vendors("ThinkPad")) nmi_watchdog = NMI_LOCAL_APIC; if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DEFAULT)) -- cgit v1.2.1 From 8cf2c51927bbeefafc25193d01b91f9ed3806e96 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 21 Oct 2006 18:37:02 +0200 Subject: [PATCH] x86: Revert new unwind kernel stack termination Jan convinced me that it was unnecessary because the assembly stubs do this already on the stack. Cc: jbeulich@novell.com Signed-off-by: Andi Kleen --- arch/i386/kernel/process.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index 57d375900afb..1e1fa3e391a3 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -336,7 +336,6 @@ extern void kernel_thread_helper(void); int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) { struct pt_regs regs; - int err; memset(®s, 0, sizeof(regs)); @@ -351,10 +350,7 @@ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2; /* Ok, create the new process.. */ - err = do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, ®s, 0, NULL, NULL); - if (err == 0) /* terminate kernel stack */ - task_pt_regs(current)->eip = 0; - return err; + return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, ®s, 0, NULL, NULL); } EXPORT_SYMBOL(kernel_thread); -- cgit v1.2.1 From 6b5c76b8e2ff204fa8d7201acce461188873bf2b Mon Sep 17 00:00:00 2001 From: Eiichiro Oiwa Date: Mon, 23 Oct 2006 15:14:07 +0900 Subject: PCI: fix pci_fixup_video as it blows up on sparc64 This reverts much of the original pci_fixup_video change and makes it work for all arches that need it. fixed, and tested on x86, x86_64 and IA64 dig. Signed-off-by: Eiichiro Oiwa Acked-by: David Miller Signed-off-by: Greg Kroah-Hartman --- arch/i386/pci/fixup.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) (limited to 'arch/i386') diff --git a/arch/i386/pci/fixup.c b/arch/i386/pci/fixup.c index 908b410f4c93..c1949ff38d61 100644 --- a/arch/i386/pci/fixup.c +++ b/arch/i386/pci/fixup.c @@ -342,6 +342,61 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PB1, pcie_r DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PC, pcie_rootport_aspm_quirk ); DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PC1, pcie_rootport_aspm_quirk ); +/* + * Fixup to mark boot BIOS video selected by BIOS before it changes + * + * From information provided by "Jon Smirl" + * + * The standard boot ROM sequence for an x86 machine uses the BIOS + * to select an initial video card for boot display. This boot video + * card will have it's BIOS copied to C0000 in system RAM. + * IORESOURCE_ROM_SHADOW is used to associate the boot video + * card with this copy. On laptops this copy has to be used since + * the main ROM may be compressed or combined with another image. + * See pci_map_rom() for use of this flag. IORESOURCE_ROM_SHADOW + * is marked here since the boot video device will be the only enabled + * video device at this point. + */ + +static void __devinit pci_fixup_video(struct pci_dev *pdev) +{ + struct pci_dev *bridge; + struct pci_bus *bus; + u16 config; + + if ((pdev->class >> 8) != PCI_CLASS_DISPLAY_VGA) + return; + + /* Is VGA routed to us? */ + bus = pdev->bus; + while (bus) { + bridge = bus->self; + + /* + * From information provided by + * "David Miller" + * The bridge control register is valid for PCI header + * type BRIDGE, or CARDBUS. Host to PCI controllers use + * PCI header type NORMAL. + */ + if (bridge + &&((bridge->hdr_type == PCI_HEADER_TYPE_BRIDGE) + ||(bridge->hdr_type == PCI_HEADER_TYPE_CARDBUS))) { + pci_read_config_word(bridge, PCI_BRIDGE_CONTROL, + &config); + if (!(config & PCI_BRIDGE_CTL_VGA)) + return; + } + bus = bus->parent; + } + pci_read_config_word(pdev, PCI_COMMAND, &config); + if (config & (PCI_COMMAND_IO | PCI_COMMAND_MEMORY)) { + pdev->resource[PCI_ROM_RESOURCE].flags |= IORESOURCE_ROM_SHADOW; + printk(KERN_DEBUG "Boot video device is %s\n", pci_name(pdev)); + } +} +DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, pci_fixup_video); + /* * Some Toshiba laptops need extra code to enable their TI TSB43AB22/A. * -- cgit v1.2.1 From 61ce1efe6e40233663d27ab8ac9ba9710eebcaad Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 27 Oct 2006 11:41:44 -0700 Subject: [PATCH] vmlinux.lds: consolidate initcall sections Add a vmlinux.lds.h helper macro for defining the eight-level initcall table, teach all the architectures to use it. This is a prerequisite for a patch which performs initcall synchronisation for multithreaded-probing. Cc: Greg KH Signed-off-by: Andrew Morton [ Added AVR32 as well ] Signed-off-by: Haavard Skinnemoen Signed-off-by: Linus Torvalds --- arch/i386/kernel/vmlinux.lds.S | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S index 1e7ac1c44ddc..adc1f232afee 100644 --- a/arch/i386/kernel/vmlinux.lds.S +++ b/arch/i386/kernel/vmlinux.lds.S @@ -126,13 +126,7 @@ SECTIONS __setup_end = .; __initcall_start = .; .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { - *(.initcall1.init) - *(.initcall2.init) - *(.initcall3.init) - *(.initcall4.init) - *(.initcall5.init) - *(.initcall6.init) - *(.initcall7.init) + INITCALLS } __initcall_end = .; __con_initcall_start = .; -- cgit v1.2.1 From ae74589cb381cc2838107f92d4e0e1f178c6825d Mon Sep 17 00:00:00 2001 From: "bibo,mao" Date: Sat, 28 Oct 2006 10:38:29 -0700 Subject: [PATCH] fix efi_memory_present_wrapper() efi_memory_present_wrapper() parameter start/end is physical address, but function memory_present parameter is PFN, this patch converts physical address to PFN. Signed-off-by: bibo, mao Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index 519e63c3c130..141041dde74d 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -846,7 +846,7 @@ efi_find_max_pfn(unsigned long start, unsigned long end, void *arg) static int __init efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg) { - memory_present(0, start, end); + memory_present(0, PFN_UP(start), PFN_DOWN(end)); return 0; } -- cgit v1.2.1 From 08d892f11aae7125fe078cf93ec5cf6af288c5e7 Mon Sep 17 00:00:00 2001 From: Andrey Panin Date: Sat, 28 Oct 2006 10:38:35 -0700 Subject: [PATCH] visws build fix Fix this: > Subject : CONFIG_X86_VISWS=3Dy, CONFIG_SMP=3Dn compile error > References : http://lkml.org/lkml/2006/10/7/51 > Submitter : Jesper Juhl > Caused-By : David Howells > commit 7d12e780e003f93433d49ce78cfedf4b4c52adc5 > Status : unknown Via undescribed means. Signed-off-by: Andrey Panin Cc: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/mach-visws/visws_apic.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/mach-visws/visws_apic.c b/arch/i386/mach-visws/visws_apic.c index 07097ed48890..38c2b13124d9 100644 --- a/arch/i386/mach-visws/visws_apic.c +++ b/arch/i386/mach-visws/visws_apic.c @@ -122,7 +122,7 @@ static void end_cobalt_irq(unsigned int irq) spin_unlock_irqrestore(&cobalt_lock, flags); } -static struct hw_interrupt_type cobalt_irq_type = { +static struct irq_chip cobalt_irq_type = { .typename = "Cobalt-APIC", .startup = startup_cobalt_irq, .shutdown = disable_cobalt_irq, @@ -159,7 +159,7 @@ static void end_piix4_master_irq(unsigned int irq) spin_unlock_irqrestore(&cobalt_lock, flags); } -static struct hw_interrupt_type piix4_master_irq_type = { +static struct irq_chip piix4_master_irq_type = { .typename = "PIIX4-master", .startup = startup_piix4_master_irq, .ack = ack_cobalt_irq, @@ -167,9 +167,8 @@ static struct hw_interrupt_type piix4_master_irq_type = { }; -static struct hw_interrupt_type piix4_virtual_irq_type = { +static struct irq_chip piix4_virtual_irq_type = { .typename = "PIIX4-virtual", - .startup = startup_8259A_irq, .shutdown = disable_8259A_irq, .enable = enable_8259A_irq, .disable = disable_8259A_irq, -- cgit v1.2.1 From 3f4b23e9833f0816b57d07401eac5e184b627fd7 Mon Sep 17 00:00:00 2001 From: Kristian Mueller Date: Sun, 29 Oct 2006 22:46:46 -0800 Subject: [PATCH] APM: URL of APM 1.2 specs has changed APM BIOS Interface Secification can now be found at http://www.microsoft.com/whdc/archive/amp_12.mspx Signed-off-by: Kristian Mueller Acked-by: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/apm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c index 2af65858d322..a60358fe9a49 100644 --- a/arch/i386/kernel/apm.c +++ b/arch/i386/kernel/apm.c @@ -198,7 +198,7 @@ * (APM) BIOS Interface Specification, Revision 1.2, February 1996. * * [This document is available from Microsoft at: - * http://www.microsoft.com/hwdev/busbios/amp_12.htm] + * http://www.microsoft.com/whdc/archive/amp_12.mspx] */ #include -- cgit v1.2.1 From 130fe05dbc0114609cfef9815c0c5580b42decfa Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 1 Nov 2006 09:11:00 -0800 Subject: i386: clean up io-apic accesses This is preparation for fixing the ordering of the accesses that got broken by the commit cf4c6a2f27f5db810b69dcb1da7f194489e8ff88 when factoring out the "common" io apic routing entry accesses. Move the accessor function (that were only used by io_apic.c) out of a header file, and use proper memory-mapped accesses rather than making up our own "volatile" pointers. Signed-off-by: Linus Torvalds --- arch/i386/kernel/io_apic.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'arch/i386') diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c index 350192d6ab98..eb10bd5da64e 100644 --- a/arch/i386/kernel/io_apic.c +++ b/arch/i386/kernel/io_apic.c @@ -91,6 +91,46 @@ static struct irq_pin_list { int apic, pin, next; } irq_2_pin[PIN_MAP_SIZE]; +struct io_apic { + unsigned int index; + unsigned int unused[3]; + unsigned int data; +}; + +static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) +{ + return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx) + + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK); +} + +static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) +{ + struct io_apic __iomem *io_apic = io_apic_base(apic); + writel(reg, &io_apic->index); + return readl(&io_apic->data); +} + +static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) +{ + struct io_apic __iomem *io_apic = io_apic_base(apic); + writel(reg, &io_apic->index); + writel(value, &io_apic->data); +} + +/* + * Re-write a value: to be used for read-modify-write + * cycles where the read already set up the index register. + * + * Older SiS APIC requires we rewrite the index register + */ +static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) +{ + volatile struct io_apic *io_apic = io_apic_base(apic); + if (sis_apic_bug) + writel(reg, &io_apic->index); + writel(value, &io_apic->data); +} + union entry_union { struct { u32 w1, w2; }; struct IO_APIC_route_entry entry; -- cgit v1.2.1 From f9dadfa71bc594df09044da61d1c72701121d802 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 1 Nov 2006 10:05:35 -0800 Subject: i386: write IO APIC irq routing entries in correct order Since the "mask" bit is in the low word, when we write a new entry, we need to write the high word first, before we potentially unmask it. The exception is when we actually want to mask the interrupt, in which case we want to write the low word first to make sure that the high word doesn't change while the interrupt routing is still active. Signed-off-by: Linus Torvalds --- arch/i386/kernel/io_apic.c | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c index eb10bd5da64e..507983c513c3 100644 --- a/arch/i386/kernel/io_apic.c +++ b/arch/i386/kernel/io_apic.c @@ -147,11 +147,33 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) return eu.entry; } +/* + * When we write a new IO APIC routing entry, we need to write the high + * word first! If the mask bit in the low word is clear, we will enable + * the interrupt, and we need to make sure the entry is fully populated + * before that happens. + */ static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) { unsigned long flags; union entry_union eu; eu.entry = e; + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(apic, 0x11 + 2*pin, eu.w2); + io_apic_write(apic, 0x10 + 2*pin, eu.w1); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +/* + * When we mask an IO APIC routing entry, we need to write the low + * word first, in order to set the mask bit before we change the + * high bits! + */ +static void ioapic_mask_entry(int apic, int pin) +{ + unsigned long flags; + union entry_union eu = { .entry.mask = 1 }; + spin_lock_irqsave(&ioapic_lock, flags); io_apic_write(apic, 0x10 + 2*pin, eu.w1); io_apic_write(apic, 0x11 + 2*pin, eu.w2); @@ -274,9 +296,7 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) /* * Disable it in the IO-APIC irq-routing table: */ - memset(&entry, 0, sizeof(entry)); - entry.mask = 1; - ioapic_write_entry(apic, pin, entry); + ioapic_mask_entry(apic, pin); } static void clear_IO_APIC (void) -- cgit v1.2.1 From 6851ecc6e2fa4a01449a0fec9f4abd9aec43afde Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 2 Nov 2006 23:02:24 -0800 Subject: PCI: Revert "PCI: i386/x86_84: disable PCI resource decode on device disable" This reverts commit 53e4d30dd666d7f83598957ee4a415eefb47c9a6. It was found that it caused unneeded problems (see http://bugzilla.kernel.org/show_bug.cgi?id=7082 for details of one such issue. Signed-off-by: Greg Kroah-Hartman --- arch/i386/pci/common.c | 1 - arch/i386/pci/i386.c | 9 --------- arch/i386/pci/pci.h | 1 - 3 files changed, 11 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/pci/common.c b/arch/i386/pci/common.c index 6d5ace845e44..cdfcf971098b 100644 --- a/arch/i386/pci/common.c +++ b/arch/i386/pci/common.c @@ -343,7 +343,6 @@ int pcibios_enable_device(struct pci_dev *dev, int mask) void pcibios_disable_device (struct pci_dev *dev) { - pcibios_disable_resources(dev); if (pcibios_disable_irq) pcibios_disable_irq(dev); } diff --git a/arch/i386/pci/i386.c b/arch/i386/pci/i386.c index 10154a2cac68..98580292f0d4 100644 --- a/arch/i386/pci/i386.c +++ b/arch/i386/pci/i386.c @@ -242,15 +242,6 @@ int pcibios_enable_resources(struct pci_dev *dev, int mask) return 0; } -void pcibios_disable_resources(struct pci_dev *dev) -{ - u16 cmd; - - pci_read_config_word(dev, PCI_COMMAND, &cmd); - cmd &= ~(PCI_COMMAND_IO | PCI_COMMAND_MEMORY); - pci_write_config_word(dev, PCI_COMMAND, cmd); -} - /* * If we set up a device for bus mastering, we need to check the latency * timer as certain crappy BIOSes forget to set it properly. diff --git a/arch/i386/pci/pci.h b/arch/i386/pci/pci.h index ad065cebd7b9..a0a25180b61a 100644 --- a/arch/i386/pci/pci.h +++ b/arch/i386/pci/pci.h @@ -43,7 +43,6 @@ extern unsigned int pcibios_max_latency; void pcibios_resource_survey(void); int pcibios_enable_resources(struct pci_dev *, int); -void pcibios_disable_resources(struct pci_dev *); /* pci-pc.c */ -- cgit v1.2.1 From 90d53909443b3986569b38ef145f09ea2359af75 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 2 Nov 2006 22:07:18 -0800 Subject: [PATCH] acpi_noirq section fix WARNING: vmlinux - Section mismatch: reference to .init.data:acpi_noirq from .text between 'pcibios_penalize_isa_irq' (at offset 0xc026ffa1) and 'pirq_serverworks_get' Acked-by: "Brown, Len" Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/acpi/boot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/acpi/boot.c b/arch/i386/kernel/acpi/boot.c index ab974ff97073..22e4c466e5a3 100644 --- a/arch/i386/kernel/acpi/boot.c +++ b/arch/i386/kernel/acpi/boot.c @@ -70,7 +70,7 @@ static inline int acpi_madt_oem_check(char *oem_id, char *oem_table_id) { return #define PREFIX "ACPI: " -int acpi_noirq __initdata; /* skip ACPI IRQ initialization */ +int acpi_noirq; /* skip ACPI IRQ initialization */ int acpi_pci_disabled __initdata; /* skip ACPI PCI scan and IRQ initialization */ int acpi_ht __initdata = 1; /* enable HT */ -- cgit v1.2.1 From de8e7c12430a73654ae3cedbc45428d56c6b777b Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 8 Nov 2006 10:09:28 -0800 Subject: Revert "[PATCH] i386: Add MMCFG resources to i386 too" This reverts commit de09bddb9d6f96785be470c832b881e6d72d589f. It tried to reserve the MMCONFIG mmio memory ranges, but since the MMCONFIG information is broken and often bogus (which is why we don't dare use it most of the time _anyway_), it does more harm than good. Cc: Jeff Chua Cc: Adrian Bunk Cc: Andi Kleen Signed-off-by: Linus Torvalds --- arch/i386/pci/mmconfig.c | 35 ----------------------------------- 1 file changed, 35 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/pci/mmconfig.c b/arch/i386/pci/mmconfig.c index d0c3da3aa2aa..c6b6d9bbc453 100644 --- a/arch/i386/pci/mmconfig.c +++ b/arch/i386/pci/mmconfig.c @@ -154,38 +154,6 @@ static struct pci_raw_ops pci_mmcfg = { .write = pci_mmcfg_write, }; - -static __init void pci_mmcfg_insert_resources(void) -{ -#define PCI_MMCFG_RESOURCE_NAME_LEN 19 - int i; - struct resource *res; - char *names; - unsigned num_buses; - - res = kcalloc(PCI_MMCFG_RESOURCE_NAME_LEN + sizeof(*res), - pci_mmcfg_config_num, GFP_KERNEL); - - if (!res) { - printk(KERN_ERR "PCI: Unable to allocate MMCONFIG resources\n"); - return; - } - - names = (void *)&res[pci_mmcfg_config_num]; - for (i = 0; i < pci_mmcfg_config_num; i++, res++) { - num_buses = pci_mmcfg_config[i].end_bus_number - - pci_mmcfg_config[i].start_bus_number + 1; - res->name = names; - snprintf(names, PCI_MMCFG_RESOURCE_NAME_LEN, "PCI MMCONFIG %u", - pci_mmcfg_config[i].pci_segment_group_number); - res->start = pci_mmcfg_config[i].base_address; - res->end = res->start + (num_buses << 20) - 1; - res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; - insert_resource(&iomem_resource, res); - names += PCI_MMCFG_RESOURCE_NAME_LEN; - } -} - /* K8 systems have some devices (typically in the builtin northbridge) that are only accessible using type1 Normally this can be expressed in the MCFG by not listing them @@ -222,8 +190,6 @@ static __init void unreachable_devices(void) } } - - void __init pci_mmcfg_init(int type) { if ((pci_probe & PCI_PROBE_MMCONF) == 0) @@ -251,5 +217,4 @@ void __init pci_mmcfg_init(int type) pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF; unreachable_devices(); - pci_mmcfg_insert_resources(); } -- cgit v1.2.1 From d654c673d6394bc26e159b1057b357371b4ce1dc Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Wed, 8 Nov 2006 17:44:37 -0800 Subject: [PATCH] Regression in 2.6.19-rc microcode driver If the microcode driver is built in (rather than module) there are some, ehm, interesting effects happening due to the new "call out to userspace" behavior that is introduced.. and which runs too early. The result is a boot hang; which is really nasty. The patch below is a minimally safe patch to fix this regression for 2.6.19 by just not requesting actual microcode updates during early boot. (That is a good idea in general anyway) The "real" fix is a lot more complex given the entire cpu hotplug scenario (during cpu hotplug you normally need to load the microcode as well); but the interactions for that are just really messy at this point; this fix at least makes it work and avoids a full detangle of hotplug. Signed-off-by: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/microcode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/microcode.c b/arch/i386/kernel/microcode.c index c4d0291b519f..23f5984d0654 100644 --- a/arch/i386/kernel/microcode.c +++ b/arch/i386/kernel/microcode.c @@ -577,7 +577,7 @@ static void microcode_init_cpu(int cpu) set_cpus_allowed(current, cpumask_of_cpu(cpu)); mutex_lock(µcode_mutex); collect_cpu_info(cpu); - if (uci->valid) + if (uci->valid && system_state == SYSTEM_RUNNING) cpu_request_microcode(cpu); mutex_unlock(µcode_mutex); set_cpus_allowed(current, old); -- cgit v1.2.1 From c06cb8b1c4d25e5b4d7a2d7c2462619de1e0dbc4 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 8 Nov 2006 17:44:41 -0800 Subject: [PATCH] i386: Force data segment to be 4K aligned o Currently there is no specific alignment restriction in linker script and in some cases it can be placed non 4K aligned addresses. This fails kexec which checks that segment to be loaded is page aligned. o I guess, it does not harm data segment to be 4K aligned. Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/vmlinux.lds.S | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/i386') diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S index adc1f232afee..c6f84a0322ba 100644 --- a/arch/i386/kernel/vmlinux.lds.S +++ b/arch/i386/kernel/vmlinux.lds.S @@ -51,6 +51,7 @@ SECTIONS __tracedata_end = .; /* writeable */ + . = ALIGN(4096); .data : AT(ADDR(.data) - LOAD_OFFSET) { /* Data */ *(.data) CONSTRUCTORS -- cgit v1.2.1 From 8bdc052eccdc7893d075d3f1f7103594a458c8c4 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 8 Nov 2006 17:44:49 -0800 Subject: [PATCH] kretprobe: fix kretprobe-booster to save regs and set status There are two bugs in the kretprobe-booster. 1) It doesn't make room for gs registers. 2) It doesn't change status of the current kprobe. This status will effect the fault handling. This patch fixes these bugs and, additionally, saves skipped registers for compatibility with the original kretprobe. Signed-off-by: Masami Hiramatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/kprobes.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/kprobes.c b/arch/i386/kernel/kprobes.c index d98e44b16fe2..fc79e1e859c4 100644 --- a/arch/i386/kernel/kprobes.c +++ b/arch/i386/kernel/kprobes.c @@ -361,8 +361,11 @@ no_kprobe: asm volatile ( ".global kretprobe_trampoline\n" "kretprobe_trampoline: \n" " pushf\n" - /* skip cs, eip, orig_eax, es, ds */ - " subl $20, %esp\n" + /* skip cs, eip, orig_eax */ + " subl $12, %esp\n" + " pushl %gs\n" + " pushl %ds\n" + " pushl %es\n" " pushl %eax\n" " pushl %ebp\n" " pushl %edi\n" @@ -373,10 +376,10 @@ no_kprobe: " movl %esp, %eax\n" " call trampoline_handler\n" /* move eflags to cs */ - " movl 48(%esp), %edx\n" - " movl %edx, 44(%esp)\n" + " movl 52(%esp), %edx\n" + " movl %edx, 48(%esp)\n" /* save true return address on eflags */ - " movl %eax, 48(%esp)\n" + " movl %eax, 52(%esp)\n" " popl %ebx\n" " popl %ecx\n" " popl %edx\n" @@ -384,8 +387,8 @@ no_kprobe: " popl %edi\n" " popl %ebp\n" " popl %eax\n" - /* skip eip, orig_eax, es, ds */ - " addl $16, %esp\n" + /* skip eip, orig_eax, es, ds, gs */ + " addl $20, %esp\n" " popf\n" " ret\n"); } @@ -404,6 +407,10 @@ fastcall void *__kprobes trampoline_handler(struct pt_regs *regs) INIT_HLIST_HEAD(&empty_rp); spin_lock_irqsave(&kretprobe_lock, flags); head = kretprobe_inst_table_head(current); + /* fixup registers */ + regs->xcs = __KERNEL_CS; + regs->eip = trampoline_address; + regs->orig_eax = 0xffffffff; /* * It is possible to have multiple instances associated with a given @@ -425,6 +432,7 @@ fastcall void *__kprobes trampoline_handler(struct pt_regs *regs) if (ri->rp && ri->rp->handler){ __get_cpu_var(current_kprobe) = &ri->rp->kp; + get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE; ri->rp->handler(ri, regs); __get_cpu_var(current_kprobe) = NULL; } -- cgit v1.2.1 From ec68307cc5a8dc499e48693843bb42f6b6028458 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 8 Nov 2006 17:44:57 -0800 Subject: [PATCH] htirq: refactor so we only have one function that writes to the chip This refactoring actually optimizes the code a little by caching the value that we think the device is programmed with instead of reading it back from the hardware. Which simplifies the code a little and should speed things up a bit. This patch introduces the concept of a ht_irq_msg and modifies the architecture read/write routines to update this code. There is a minor consistency fix here as well as x86_64 forgot to initialize the htirq as masked. Signed-off-by: Eric W. Biederman Cc: Andi Kleen Acked-by: Bryan O'Sullivan Cc: Cc: Roland Dreier Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/io_apic.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c index 507983c513c3..ad84bc2802a6 100644 --- a/arch/i386/kernel/io_apic.c +++ b/arch/i386/kernel/io_apic.c @@ -2624,18 +2624,16 @@ void arch_teardown_msi_irq(unsigned int irq) static void target_ht_irq(unsigned int irq, unsigned int dest) { - u32 low, high; - low = read_ht_irq_low(irq); - high = read_ht_irq_high(irq); + struct ht_irq_msg msg; + fetch_ht_irq_msg(irq, &msg); - low &= ~(HT_IRQ_LOW_DEST_ID_MASK); - high &= ~(HT_IRQ_HIGH_DEST_ID_MASK); + msg.address_lo &= ~(HT_IRQ_LOW_DEST_ID_MASK); + msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK); - low |= HT_IRQ_LOW_DEST_ID(dest); - high |= HT_IRQ_HIGH_DEST_ID(dest); + msg.address_lo |= HT_IRQ_LOW_DEST_ID(dest); + msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest); - write_ht_irq_low(irq, low); - write_ht_irq_high(irq, high); + write_ht_irq_msg(irq, &msg); } static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) @@ -2673,7 +2671,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) vector = assign_irq_vector(irq); if (vector >= 0) { - u32 low, high; + struct ht_irq_msg msg; unsigned dest; cpumask_t tmp; @@ -2681,9 +2679,10 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) cpu_set(vector >> 8, tmp); dest = cpu_mask_to_apicid(tmp); - high = HT_IRQ_HIGH_DEST_ID(dest); + msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); - low = HT_IRQ_LOW_BASE | + msg.address_lo = + HT_IRQ_LOW_BASE | HT_IRQ_LOW_DEST_ID(dest) | HT_IRQ_LOW_VECTOR(vector) | ((INT_DEST_MODE == 0) ? @@ -2695,8 +2694,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) HT_IRQ_LOW_MT_ARBITRATED) | HT_IRQ_LOW_IRQ_MASKED; - write_ht_irq_low(irq, low); - write_ht_irq_high(irq, high); + write_ht_irq_msg(irq, &msg); set_irq_chip_and_handler_name(irq, &ht_irq_chip, handle_edge_irq, "edge"); -- cgit v1.2.1 From fa18f477d0987c011cce047a7c3cd1284f547a14 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 14 Nov 2006 16:57:46 +0100 Subject: [PATCH] x86: Add acpi_user_timer_override option for Asus boards Timer overrides are normally disabled on Nvidia board because they are commonly wrong, except on new ones with HPET support. Unfortunately there are quite some Asus boards around that don't have HPET, but need a timer override. We don't know yet how to handle this transparently, but at least add a command line option to force the timer override and let them boot. Cc: len.brown@intel.com Signed-off-by: Andi Kleen --- arch/i386/kernel/acpi/boot.c | 8 ++++++++ arch/i386/kernel/acpi/earlyquirk.c | 8 +++++++- 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/acpi/boot.c b/arch/i386/kernel/acpi/boot.c index 22e4c466e5a3..d12fb97a5337 100644 --- a/arch/i386/kernel/acpi/boot.c +++ b/arch/i386/kernel/acpi/boot.c @@ -82,6 +82,7 @@ EXPORT_SYMBOL(acpi_strict); acpi_interrupt_flags acpi_sci_flags __initdata; int acpi_sci_override_gsi __initdata; int acpi_skip_timer_override __initdata; +int acpi_use_timer_override __initdata; #ifdef CONFIG_X86_LOCAL_APIC static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; @@ -1300,6 +1301,13 @@ static int __init parse_acpi_skip_timer_override(char *arg) return 0; } early_param("acpi_skip_timer_override", parse_acpi_skip_timer_override); + +static int __init parse_acpi_use_timer_override(char *arg) +{ + acpi_use_timer_override = 1; + return 0; +} +early_param("acpi_use_timer_override", parse_acpi_use_timer_override); #endif /* CONFIG_X86_IO_APIC */ static int __init setup_acpi_sci(char *s) diff --git a/arch/i386/kernel/acpi/earlyquirk.c b/arch/i386/kernel/acpi/earlyquirk.c index fe799b11ac0a..c9841692bb7c 100644 --- a/arch/i386/kernel/acpi/earlyquirk.c +++ b/arch/i386/kernel/acpi/earlyquirk.c @@ -27,11 +27,17 @@ static int __init check_bridge(int vendor, int device) #ifdef CONFIG_ACPI /* According to Nvidia all timer overrides are bogus unless HPET is enabled. */ - if (vendor == PCI_VENDOR_ID_NVIDIA) { + if (!acpi_use_timer_override && vendor == PCI_VENDOR_ID_NVIDIA) { nvidia_hpet_detected = 0; acpi_table_parse(ACPI_HPET, nvidia_hpet_check); if (nvidia_hpet_detected == 0) { acpi_skip_timer_override = 1; + printk(KERN_INFO "Nvidia board " + "detected. Ignoring ACPI " + "timer override.\n"); + printk(KERN_INFO "If you got timer trouble " + "try acpi_use_timer_override\n"); + } } #endif -- cgit v1.2.1 From f3ac84324fd949f671e6cf5620add5de02498386 Mon Sep 17 00:00:00 2001 From: Daniel Ritz Date: Tue, 14 Nov 2006 02:03:25 -0800 Subject: [PATCH] fix via586 irq routing for pirq 5 Fix interrupt routing for via 586 bridges. pirq can be 5 which needs to be mapped to INTD. But currently the access functions can handle only pirq 1-4. this is similar to the other via chipsets where pirq 4 and 5 are both mapped to INTD. Fixes bugzilla #7490 Cc: Daniel Paschka Cc: Adrian Bunk Signed-off-by: Daniel Ritz Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/pci/irq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/pci/irq.c b/arch/i386/pci/irq.c index dbc4aae91959..69163998adeb 100644 --- a/arch/i386/pci/irq.c +++ b/arch/i386/pci/irq.c @@ -255,13 +255,13 @@ static int pirq_via_set(struct pci_dev *router, struct pci_dev *dev, int pirq, i */ static int pirq_via586_get(struct pci_dev *router, struct pci_dev *dev, int pirq) { - static const unsigned int pirqmap[4] = { 3, 2, 5, 1 }; + static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 }; return read_config_nybble(router, 0x55, pirqmap[pirq-1]); } static int pirq_via586_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) { - static const unsigned int pirqmap[4] = { 3, 2, 5, 1 }; + static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 }; write_config_nybble(router, 0x55, pirqmap[pirq-1], irq); return 1; } -- cgit v1.2.1 From 45c99533252ef2297f37c5fdd672a3e0eb566870 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 14 Nov 2006 10:52:12 -0700 Subject: [PATCH] Use delayed disable mode of ioapic edge triggered interrupts Komuro reports that ISA interrupts do not work after a disable_irq(), causing some PCMCIA drivers to not work, with messages like eth0: Asix AX88190: io 0x300, irq 3, hw_addr xx:xx:xx:xx:xx:xx eth0: found link beat eth0: autonegotiation complete: 100baseT-FD selected eth0: interrupt(s) dropped! eth0: interrupt(s) dropped! eth0: interrupt(s) dropped! ... Linus Torvalds said: "Now, edge-triggered interrupts are a _lot_ harder to mask, because the Intel APIC is an unbelievable piece of sh*t, and has the edge-detect logic _before_ the mask logic, so if a edge happens _while_ the device is masked, you'll never ever see the edge ever again (unmasking will not cause a new edge, so you simply lost the interrupt). So when you "mask" an edge-triggered IRQ, you can't really mask it at all, because if you did that, you'd lose it forever if the IRQ comes in while you masked it. Instead, we're supposed to leave it active, and set a flag, and IF the IRQ comes in, we just remember it, and mask it at that point instead, and then on unmasking, we have to replay it by sending a self-IPI." This trivial patch solves the problem. Signed-off-by: Eric W. Biederman Cc: Ingo Molnar Acked-by: Komuro Signed-off-by: Linus Torvalds --- arch/i386/kernel/io_apic.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c index ad84bc2802a6..3b7a63e0ed1a 100644 --- a/arch/i386/kernel/io_apic.c +++ b/arch/i386/kernel/io_apic.c @@ -1287,9 +1287,11 @@ static void ioapic_register_intr(int irq, int vector, unsigned long trigger) trigger == IOAPIC_LEVEL) set_irq_chip_and_handler_name(irq, &ioapic_chip, handle_fasteoi_irq, "fasteoi"); - else + else { + irq_desc[irq].status |= IRQ_DELAYED_DISABLE; set_irq_chip_and_handler_name(irq, &ioapic_chip, handle_edge_irq, "edge"); + } set_intr_gate(vector, interrupt[irq]); } -- cgit v1.2.1 From dc1829a4c378d793fb3b95d56135d89a0d7ff72a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 17 Nov 2006 14:26:18 +0100 Subject: [PATCH] i386/x86_64: ACPI cpu_idle_wait() fix The scheduler on Andreas Friedrich's hyperthreading system stopped working properly: the scheduler would never move tasks to another CPU! The lask known working kernel was 2.6.8. After a couple of attempts to corner the bug, the following smoking gun was found: BIOS reported wrong ACPI idfor the processor CPU#1: set_cpus_allowed(), swapper:1, 3 -> 2 [] show_trace_log_lvl+0x34/0x4a [] show_trace+0x2c/0x2e [] dump_stack+0x2b/0x2d [] set_cpus_allowed+0x52/0xec [] cpu_idle_wait+0x2e/0x100 [] acpi_processor_power_exit+0x45/0x58 [] acpi_processor_remove+0x46/0xea [] acpi_start_single_object+0x47/0x54 [] acpi_bus_register_driver+0xa4/0xd3 [] acpi_processor_init+0x57/0x77 [] init+0x146/0x2fd [] kernel_thread_helper+0x7/0x10 a quick look at cpu_idle_wait() shows how broken that code is on i386: it changes the init task's affinity map but never restores it ... and because all userspace tasks get forked by init, they all inherited that single-CPU affinity mask. x86_64 cloned this bug too. Signed-off-by: Ingo Molnar Cc: Andreas Friedrich Cc: Wolfgang Erig Cc: Andrew Morton Cc: Adrian Bunk Signed-off-by: Linus Torvalds --- arch/i386/kernel/process.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index 1e1fa3e391a3..dd53c58f64f1 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -205,7 +205,7 @@ void cpu_idle(void) void cpu_idle_wait(void) { unsigned int cpu, this_cpu = get_cpu(); - cpumask_t map; + cpumask_t map, tmp = current->cpus_allowed; set_cpus_allowed(current, cpumask_of_cpu(this_cpu)); put_cpu(); @@ -227,6 +227,8 @@ void cpu_idle_wait(void) } cpus_and(map, map, cpu_online_map); } while (!cpus_empty(map)); + + set_cpus_allowed(current, tmp); } EXPORT_SYMBOL_GPL(cpu_idle_wait); -- cgit v1.2.1 From 808dbbb6bb61173bf52946a28f99089d2efa4c55 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 17 Nov 2006 11:14:56 -0800 Subject: x86: be more careful when walking back the frame pointer chain When showing the stack backtrace, make sure that we never accept not only an unchanging frame pointer, but also a frame pointer that moves back down the stack frame. It must always grow up (toward older stack frames). I doubt this has triggered, but a subtly corrupt stack with extremely unlucky contents could cause us to loop forever on a bogus endless frame pointer chain. This review was triggered by much worse problems happening in some of the other stack unwinding code. Signed-off-by: Linus Torvalds --- arch/i386/kernel/traps.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index 00489b706d27..fe9c5e8e7e6f 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -129,15 +129,19 @@ static inline unsigned long print_context_stack(struct thread_info *tinfo, #ifdef CONFIG_FRAME_POINTER while (valid_stack_ptr(tinfo, (void *)ebp)) { + unsigned long new_ebp; addr = *(unsigned long *)(ebp + 4); ops->address(data, addr); /* * break out of recursive entries (such as - * end_of_stack_stop_unwind_function): + * end_of_stack_stop_unwind_function). Also, + * we can never allow a frame pointer to + * move downwards! */ - if (ebp == *(unsigned long *)ebp) + new_ebp = *(unsigned long *)ebp; + if (new_ebp <= ebp) break; - ebp = *(unsigned long *)ebp; + ebp = new_ebp; } #else while (valid_stack_ptr(tinfo, stack)) { -- cgit v1.2.1 From c4028958b6ecad064b1a6303a6a5906d4fe48d73 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 22 Nov 2006 14:57:56 +0000 Subject: WorkStruct: make allyesconfig Fix up for make allyesconfig. Signed-Off-By: David Howells --- arch/i386/kernel/cpu/mcheck/non-fatal.c | 6 +++--- arch/i386/kernel/smpboot.c | 11 ++++++----- arch/i386/kernel/tsc.c | 4 ++-- 3 files changed, 11 insertions(+), 10 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/cpu/mcheck/non-fatal.c b/arch/i386/kernel/cpu/mcheck/non-fatal.c index 1f9153ae5b03..6b5d3518a1c0 100644 --- a/arch/i386/kernel/cpu/mcheck/non-fatal.c +++ b/arch/i386/kernel/cpu/mcheck/non-fatal.c @@ -51,10 +51,10 @@ static void mce_checkregs (void *info) } } -static void mce_work_fn(void *data); -static DECLARE_WORK(mce_work, mce_work_fn, NULL); +static void mce_work_fn(struct work_struct *work); +static DECLARE_DELAYED_WORK(mce_work, mce_work_fn); -static void mce_work_fn(void *data) +static void mce_work_fn(struct work_struct *work) { on_each_cpu(mce_checkregs, NULL, 1, 1); schedule_delayed_work(&mce_work, MCE_RATE); diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index 4bb8b77cd65b..02a9b66b6ac3 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -1049,13 +1049,15 @@ void cpu_exit_clear(void) struct warm_boot_cpu_info { struct completion *complete; + struct work_struct task; int apicid; int cpu; }; -static void __cpuinit do_warm_boot_cpu(void *p) +static void __cpuinit do_warm_boot_cpu(struct work_struct *work) { - struct warm_boot_cpu_info *info = p; + struct warm_boot_cpu_info *info = + container_of(work, struct warm_boot_cpu_info, task); do_boot_cpu(info->apicid, info->cpu); complete(info->complete); } @@ -1064,7 +1066,6 @@ static int __cpuinit __smp_prepare_cpu(int cpu) { DECLARE_COMPLETION_ONSTACK(done); struct warm_boot_cpu_info info; - struct work_struct task; int apicid, ret; struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); @@ -1089,7 +1090,7 @@ static int __cpuinit __smp_prepare_cpu(int cpu) info.complete = &done; info.apicid = apicid; info.cpu = cpu; - INIT_WORK(&task, do_warm_boot_cpu, &info); + INIT_WORK(&info.task, do_warm_boot_cpu); tsc_sync_disabled = 1; @@ -1097,7 +1098,7 @@ static int __cpuinit __smp_prepare_cpu(int cpu) clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, KERNEL_PGD_PTRS); flush_tlb_all(); - schedule_work(&task); + schedule_work(&info.task); wait_for_completion(&done); tsc_sync_disabled = 0; diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c index fbc95828cd74..9810c8c90750 100644 --- a/arch/i386/kernel/tsc.c +++ b/arch/i386/kernel/tsc.c @@ -217,7 +217,7 @@ static unsigned int cpufreq_delayed_issched = 0; static unsigned int cpufreq_init = 0; static struct work_struct cpufreq_delayed_get_work; -static void handle_cpufreq_delayed_get(void *v) +static void handle_cpufreq_delayed_get(struct work_struct *work) { unsigned int cpu; @@ -306,7 +306,7 @@ static int __init cpufreq_tsc(void) { int ret; - INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get, NULL); + INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get); ret = cpufreq_register_notifier(&time_cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); if (!ret) -- cgit v1.2.1 From 368c73d4f689dae0807d0a2aa74c61fd2b9b075f Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Wed, 4 Oct 2006 00:41:26 +0100 Subject: PCI: quirks: fix the festering mess that claims to handle IDE quirks The number of permutations of crap we do is amazing and almost all of it has the wrong effect in 2.6. At the heart of this is the PCI SFF magic which says that compatibility mode PCI IDE controllers use ISA IRQ routing and hard coded addresses not the BAR values. The old quirks variously clears them, sets them, adjusts them and then IDE ignores the result. In order to drive all this garbage out and to do it portably we need to handle the SFF rules directly and properly. Because we know the device BAR 0-3 are not used in compatibility mode we load them with the values that are implied (and indeed which many controllers actually thoughtfully put there in this mode anyway). This removes special cases in the IDE layer and libata which now knows that bar 0/1/2/3 always contain the correct address. It means our resource allocation map is accurate from boot, not "mostly accurate" after ide is loaded, and it shoots lots of code. There is also lots more code and magic constant knowledge to shoot once this is in and settled. Been in my test tree for a while both with drivers/ide and with libata. Wants some -mm shakedown in case I've missed something dumb or there are corner cases lurking. Signed-off-by: Alan Cox Signed-off-by: Greg Kroah-Hartman --- arch/i386/pci/fixup.c | 46 ---------------------------------------------- 1 file changed, 46 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/pci/fixup.c b/arch/i386/pci/fixup.c index c1949ff38d61..cde1170b01a1 100644 --- a/arch/i386/pci/fixup.c +++ b/arch/i386/pci/fixup.c @@ -74,52 +74,6 @@ static void __devinit pci_fixup_ncr53c810(struct pci_dev *d) } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NCR, PCI_DEVICE_ID_NCR_53C810, pci_fixup_ncr53c810); -static void __devinit pci_fixup_ide_bases(struct pci_dev *d) -{ - int i; - - /* - * PCI IDE controllers use non-standard I/O port decoding, respect it. - */ - if ((d->class >> 8) != PCI_CLASS_STORAGE_IDE) - return; - DBG("PCI: IDE base address fixup for %s\n", pci_name(d)); - for(i=0; i<4; i++) { - struct resource *r = &d->resource[i]; - if ((r->start & ~0x80) == 0x374) { - r->start |= 2; - r->end = r->start; - } - } -} -DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, pci_fixup_ide_bases); - -static void __devinit pci_fixup_ide_trash(struct pci_dev *d) -{ - int i; - - /* - * Runs the fixup only for the first IDE controller - * (Shai Fultheim - shai@ftcon.com) - */ - static int called = 0; - if (called) - return; - called = 1; - - /* - * There exist PCI IDE controllers which have utter garbage - * in first four base registers. Ignore that. - */ - DBG("PCI: IDE base address trash cleared for %s\n", pci_name(d)); - for(i=0; i<4; i++) - d->resource[i].start = d->resource[i].end = d->resource[i].flags = 0; -} -DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_SI, PCI_DEVICE_ID_SI_5513, pci_fixup_ide_trash); -DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_10, pci_fixup_ide_trash); -DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_11, pci_fixup_ide_trash); -DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801DB_9, pci_fixup_ide_trash); - static void __devinit pci_fixup_latency(struct pci_dev *d) { /* -- cgit v1.2.1 From 7edab2f0876ff6a38e10a88c9aca20180aad307c Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 17 Oct 2006 10:17:58 -0700 Subject: pci/i386: style cleanups Mostly CodingStyle cleanups for arch/i386/pci/i386.c: - fit in 80 columns; - use a #defined value instead of an inline constant; Also change one resource_size_t (DBG) printk from %08lx to %lx since it can be more than 32 bits (more than 8 hexits). Signed-off-by: Randy Dunlap Signed-off-by: Greg Kroah-Hartman --- arch/i386/pci/i386.c | 64 +++++++++++++++++++++++++++++++++++----------------- 1 file changed, 43 insertions(+), 21 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/pci/i386.c b/arch/i386/pci/i386.c index 98580292f0d4..43005f044424 100644 --- a/arch/i386/pci/i386.c +++ b/arch/i386/pci/i386.c @@ -104,16 +104,24 @@ static void __init pcibios_allocate_bus_resources(struct list_head *bus_list) /* Depth-First Search on bus tree */ list_for_each_entry(bus, bus_list, node) { if ((dev = bus->self)) { - for (idx = PCI_BRIDGE_RESOURCES; idx < PCI_NUM_RESOURCES; idx++) { + for (idx = PCI_BRIDGE_RESOURCES; + idx < PCI_NUM_RESOURCES; idx++) { r = &dev->resource[idx]; if (!r->flags) continue; pr = pci_find_parent_resource(dev, r); - if (!r->start || !pr || request_resource(pr, r) < 0) { - printk(KERN_ERR "PCI: Cannot allocate resource region %d of bridge %s\n", idx, pci_name(dev)); - /* Something is wrong with the region. - Invalidate the resource to prevent child - resource allocations in this range. */ + if (!r->start || !pr || + request_resource(pr, r) < 0) { + printk(KERN_ERR "PCI: Cannot allocate " + "resource region %d " + "of bridge %s\n", + idx, pci_name(dev)); + /* + * Something is wrong with the region. + * Invalidate the resource to prevent + * child resource allocations in this + * range. + */ r->flags = 0; } } @@ -131,7 +139,7 @@ static void __init pcibios_allocate_resources(int pass) for_each_pci_dev(dev) { pci_read_config_word(dev, PCI_COMMAND, &command); - for(idx = 0; idx < 6; idx++) { + for (idx = 0; idx < PCI_ROM_RESOURCE; idx++) { r = &dev->resource[idx]; if (r->parent) /* Already allocated */ continue; @@ -142,11 +150,15 @@ static void __init pcibios_allocate_resources(int pass) else disabled = !(command & PCI_COMMAND_MEMORY); if (pass == disabled) { - DBG("PCI: Resource %08lx-%08lx (f=%lx, d=%d, p=%d)\n", + DBG("PCI: Resource %08lx-%08lx " + "(f=%lx, d=%d, p=%d)\n", r->start, r->end, r->flags, disabled, pass); pr = pci_find_parent_resource(dev, r); if (!pr || request_resource(pr, r) < 0) { - printk(KERN_ERR "PCI: Cannot allocate resource region %d of device %s\n", idx, pci_name(dev)); + printk(KERN_ERR "PCI: Cannot allocate " + "resource region %d " + "of device %s\n", + idx, pci_name(dev)); /* We'll assign a new address later */ r->end -= r->start; r->start = 0; @@ -156,12 +168,16 @@ static void __init pcibios_allocate_resources(int pass) if (!pass) { r = &dev->resource[PCI_ROM_RESOURCE]; if (r->flags & IORESOURCE_ROM_ENABLE) { - /* Turn the ROM off, leave the resource region, but keep it unregistered. */ + /* Turn the ROM off, leave the resource region, + * but keep it unregistered. */ u32 reg; - DBG("PCI: Switching off ROM of %s\n", pci_name(dev)); + DBG("PCI: Switching off ROM of %s\n", + pci_name(dev)); r->flags &= ~IORESOURCE_ROM_ENABLE; - pci_read_config_dword(dev, dev->rom_base_reg, ®); - pci_write_config_dword(dev, dev->rom_base_reg, reg & ~PCI_ROM_ADDRESS_ENABLE); + pci_read_config_dword(dev, + dev->rom_base_reg, ®); + pci_write_config_dword(dev, dev->rom_base_reg, + reg & ~PCI_ROM_ADDRESS_ENABLE); } } } @@ -173,9 +189,11 @@ static int __init pcibios_assign_resources(void) struct resource *r, *pr; if (!(pci_probe & PCI_ASSIGN_ROMS)) { - /* Try to use BIOS settings for ROMs, otherwise let - pci_assign_unassigned_resources() allocate the new - addresses. */ + /* + * Try to use BIOS settings for ROMs, otherwise let + * pci_assign_unassigned_resources() allocate the new + * addresses. + */ for_each_pci_dev(dev) { r = &dev->resource[PCI_ROM_RESOURCE]; if (!r->flags || !r->start) @@ -215,9 +233,9 @@ int pcibios_enable_resources(struct pci_dev *dev, int mask) pci_read_config_word(dev, PCI_COMMAND, &cmd); old_cmd = cmd; - for(idx = 0; idx < PCI_NUM_RESOURCES; idx++) { + for (idx = 0; idx < PCI_NUM_RESOURCES; idx++) { /* Only set up the requested stuff */ - if (!(mask & (1<resource[idx]; @@ -227,7 +245,9 @@ int pcibios_enable_resources(struct pci_dev *dev, int mask) (!(r->flags & IORESOURCE_ROM_ENABLE))) continue; if (!r->start && r->end) { - printk(KERN_ERR "PCI: Device %s not available because of resource collisions\n", pci_name(dev)); + printk(KERN_ERR "PCI: Device %s not available " + "because of resource collisions\n", + pci_name(dev)); return -EINVAL; } if (r->flags & IORESOURCE_IO) @@ -236,7 +256,8 @@ int pcibios_enable_resources(struct pci_dev *dev, int mask) cmd |= PCI_COMMAND_MEMORY; } if (cmd != old_cmd) { - printk("PCI: Enabling device %s (%04x -> %04x)\n", pci_name(dev), old_cmd, cmd); + printk("PCI: Enabling device %s (%04x -> %04x)\n", + pci_name(dev), old_cmd, cmd); pci_write_config_word(dev, PCI_COMMAND, cmd); } return 0; @@ -258,7 +279,8 @@ void pcibios_set_master(struct pci_dev *dev) lat = pcibios_max_latency; else return; - printk(KERN_DEBUG "PCI: Setting latency timer of device %s to %d\n", pci_name(dev), lat); + printk(KERN_DEBUG "PCI: Setting latency timer of device %s to %d\n", + pci_name(dev), lat); pci_write_config_byte(dev, PCI_LATENCY_TIMER, lat); } -- cgit v1.2.1 From 039d09a845209122c5193e650ab2d8b3c849ca7c Mon Sep 17 00:00:00 2001 From: Amol Lad Date: Tue, 17 Oct 2006 10:02:50 +0530 Subject: PCI: arch/i386/kernel/pci-dma.c: ioremap balanced with iounmap ioremap must be balanced by an iounmap and failing to do so can result in a memory leak. Tested (compilation only): - using allmodconfig - making sure the files are compiling without any warning/error due to new changes Signed-off-by: Amol Lad Signed-off-by: Greg Kroah-Hartman --- arch/i386/kernel/pci-dma.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/pci-dma.c b/arch/i386/kernel/pci-dma.c index 25fe66853934..5c8c6ef1fc5e 100644 --- a/arch/i386/kernel/pci-dma.c +++ b/arch/i386/kernel/pci-dma.c @@ -75,7 +75,7 @@ EXPORT_SYMBOL(dma_free_coherent); int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr, dma_addr_t device_addr, size_t size, int flags) { - void __iomem *mem_base; + void __iomem *mem_base = NULL; int pages = size >> PAGE_SHIFT; int bitmap_size = (pages + 31)/32; @@ -114,6 +114,8 @@ int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr, free1_out: kfree(dev->dma_mem->bitmap); out: + if (mem_base) + iounmap(mem_base); return 0; } EXPORT_SYMBOL(dma_declare_coherent_memory); -- cgit v1.2.1 From 3b59d52d8c7925e7a9a396f2e31a66eb060c6c37 Mon Sep 17 00:00:00 2001 From: Jason Gaston Date: Wed, 22 Nov 2006 15:15:08 -0800 Subject: PCI: irq: irq and pci_ids patch for Intel ICH9 This updated patch adds the Intel ICH9 LPC and SMBus Controller DID's. Signed-off-by: Jason Gaston Signed-off-by: Greg Kroah-Hartman --- arch/i386/pci/irq.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/i386') diff --git a/arch/i386/pci/irq.c b/arch/i386/pci/irq.c index 69163998adeb..e65551cd8216 100644 --- a/arch/i386/pci/irq.c +++ b/arch/i386/pci/irq.c @@ -543,6 +543,12 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route case PCI_DEVICE_ID_INTEL_ICH8_2: case PCI_DEVICE_ID_INTEL_ICH8_3: case PCI_DEVICE_ID_INTEL_ICH8_4: + case PCI_DEVICE_ID_INTEL_ICH9_0: + case PCI_DEVICE_ID_INTEL_ICH9_1: + case PCI_DEVICE_ID_INTEL_ICH9_2: + case PCI_DEVICE_ID_INTEL_ICH9_3: + case PCI_DEVICE_ID_INTEL_ICH9_4: + case PCI_DEVICE_ID_INTEL_ICH9_5: r->name = "PIIX/ICH"; r->get = pirq_piix_get; r->set = pirq_piix_set; -- cgit v1.2.1 From 2b290da053608692ea206507d993b70c39d2cdea Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Thu, 16 Nov 2006 13:16:23 +0100 Subject: PCI: make arch/i386/pci/common.c:pci_bf_sort static This patch makes the needlessly global pci_bf_sort static. Signed-off-by: Adrian Bunk Acked-by: Matt Domsch Signed-off-by: Greg Kroah-Hartman --- arch/i386/pci/common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/i386') diff --git a/arch/i386/pci/common.c b/arch/i386/pci/common.c index cdfcf971098b..53ca6e897984 100644 --- a/arch/i386/pci/common.c +++ b/arch/i386/pci/common.c @@ -20,7 +20,7 @@ unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2 | PCI_PROBE_MMCONF; -int pci_bf_sort; +static int pci_bf_sort; int pci_routeirq; int pcibios_last_bus = -1; unsigned long pirq_table_addr; -- cgit v1.2.1 From a271aaf15f492d202def1b34e447107b60fe4ece Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 7 Aug 2006 22:19:37 -0700 Subject: Driver core: convert msr code to use struct device Converts from using struct "class_device" to "struct device" making everything show up properly in /sys/devices/ with symlinks from the /sys/class directory. Signed-off-by: Greg Kroah-Hartman --- arch/i386/kernel/msr.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/msr.c b/arch/i386/kernel/msr.c index d535cdbbfd25..a773f776c9ea 100644 --- a/arch/i386/kernel/msr.c +++ b/arch/i386/kernel/msr.c @@ -239,14 +239,14 @@ static struct file_operations msr_fops = { .open = msr_open, }; -static int msr_class_device_create(int i) +static int msr_device_create(int i) { int err = 0; - struct class_device *class_err; + struct device *dev; - class_err = class_device_create(msr_class, NULL, MKDEV(MSR_MAJOR, i), NULL, "msr%d",i); - if (IS_ERR(class_err)) - err = PTR_ERR(class_err); + dev = device_create(msr_class, NULL, MKDEV(MSR_MAJOR, i), "msr%d",i); + if (IS_ERR(dev)) + err = PTR_ERR(dev); return err; } @@ -258,10 +258,10 @@ static int msr_class_cpu_callback(struct notifier_block *nfb, switch (action) { case CPU_ONLINE: - msr_class_device_create(cpu); + msr_device_create(cpu); break; case CPU_DEAD: - class_device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu)); + device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu)); break; } return NOTIFY_OK; @@ -290,7 +290,7 @@ static int __init msr_init(void) goto out_chrdev; } for_each_online_cpu(i) { - err = msr_class_device_create(i); + err = msr_device_create(i); if (err != 0) goto out_class; } @@ -302,7 +302,7 @@ static int __init msr_init(void) out_class: i = 0; for_each_online_cpu(i) - class_device_destroy(msr_class, MKDEV(MSR_MAJOR, i)); + device_destroy(msr_class, MKDEV(MSR_MAJOR, i)); class_destroy(msr_class); out_chrdev: unregister_chrdev(MSR_MAJOR, "cpu/msr"); @@ -314,7 +314,7 @@ static void __exit msr_exit(void) { int cpu = 0; for_each_online_cpu(cpu) - class_device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu)); + device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu)); class_destroy(msr_class); unregister_chrdev(MSR_MAJOR, "cpu/msr"); unregister_hotcpu_notifier(&msr_class_cpu_notifier); -- cgit v1.2.1 From 07accdc18e014cad945013886ee34e09f859f88a Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 7 Aug 2006 22:19:37 -0700 Subject: Driver core: convert cpuid code to use struct device Converts from using struct "class_device" to "struct device" making everything show up properly in /sys/devices/ with symlinks from the /sys/class directory. Signed-off-by: Greg Kroah-Hartman --- arch/i386/kernel/cpuid.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/cpuid.c b/arch/i386/kernel/cpuid.c index fde8bea85cee..ab0c327e79dc 100644 --- a/arch/i386/kernel/cpuid.c +++ b/arch/i386/kernel/cpuid.c @@ -156,14 +156,14 @@ static struct file_operations cpuid_fops = { .open = cpuid_open, }; -static int cpuid_class_device_create(int i) +static int cpuid_device_create(int i) { int err = 0; - struct class_device *class_err; + struct device *dev; - class_err = class_device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, i), NULL, "cpu%d",i); - if (IS_ERR(class_err)) - err = PTR_ERR(class_err); + dev = device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, i), "cpu%d",i); + if (IS_ERR(dev)) + err = PTR_ERR(dev); return err; } @@ -174,10 +174,10 @@ static int cpuid_class_cpu_callback(struct notifier_block *nfb, unsigned long ac switch (action) { case CPU_ONLINE: - cpuid_class_device_create(cpu); + cpuid_device_create(cpu); break; case CPU_DEAD: - class_device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, cpu)); + device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, cpu)); break; } return NOTIFY_OK; @@ -206,7 +206,7 @@ static int __init cpuid_init(void) goto out_chrdev; } for_each_online_cpu(i) { - err = cpuid_class_device_create(i); + err = cpuid_device_create(i); if (err != 0) goto out_class; } @@ -218,7 +218,7 @@ static int __init cpuid_init(void) out_class: i = 0; for_each_online_cpu(i) { - class_device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, i)); + device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, i)); } class_destroy(cpuid_class); out_chrdev: @@ -232,7 +232,7 @@ static void __exit cpuid_exit(void) int cpu = 0; for_each_online_cpu(cpu) - class_device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, cpu)); + device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, cpu)); class_destroy(cpuid_class); unregister_chrdev(CPUID_MAJOR, "cpu/cpuid"); unregister_hotcpu_notifier(&cpuid_class_cpu_notifier); -- cgit v1.2.1 From 7bdd21cef9e5dbc3d3a718c55bb3d0da024644da Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sat, 2 Dec 2006 02:27:46 -0500 Subject: Revert "ACPI: SCI interrupt source override" This reverts commit 281ea49b0c294649a6de47a6f8fbe5611137726b, which broke ACPI Interrupt source overrides that move the SCI from one IRQ in PIC mode to another in IOAPIC mode. If the SCI shared an interrupt line with another device, this would result in a "irq 18: nobody cared" type failure. http://bugzilla.kernel.org/show_bug.cgi?id=7601 Signed-off-by: Len Brown --- arch/i386/kernel/acpi/boot.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/acpi/boot.c b/arch/i386/kernel/acpi/boot.c index d12fb97a5337..c8f96cff07c6 100644 --- a/arch/i386/kernel/acpi/boot.c +++ b/arch/i386/kernel/acpi/boot.c @@ -333,7 +333,7 @@ acpi_parse_ioapic(acpi_table_entry_header * header, const unsigned long end) /* * Parse Interrupt Source Override for the ACPI SCI */ -static void acpi_sci_ioapic_setup(u32 bus_irq, u32 gsi, u16 polarity, u16 trigger) +static void acpi_sci_ioapic_setup(u32 gsi, u16 polarity, u16 trigger) { if (trigger == 0) /* compatible SCI trigger is level */ trigger = 3; @@ -353,13 +353,13 @@ static void acpi_sci_ioapic_setup(u32 bus_irq, u32 gsi, u16 polarity, u16 trigge * If GSI is < 16, this will update its flags, * else it will create a new mp_irqs[] entry. */ - mp_override_legacy_irq(bus_irq, polarity, trigger, gsi); + mp_override_legacy_irq(gsi, polarity, trigger, gsi); /* * stash over-ride to indicate we've been here * and for later update of acpi_fadt */ - acpi_sci_override_gsi = bus_irq; + acpi_sci_override_gsi = gsi; return; } @@ -377,7 +377,7 @@ acpi_parse_int_src_ovr(acpi_table_entry_header * header, acpi_table_print_madt_entry(header); if (intsrc->bus_irq == acpi_fadt.sci_int) { - acpi_sci_ioapic_setup(intsrc->bus_irq, intsrc->global_irq, + acpi_sci_ioapic_setup(intsrc->global_irq, intsrc->flags.polarity, intsrc->flags.trigger); return 0; @@ -880,7 +880,7 @@ static int __init acpi_parse_madt_ioapic_entries(void) * pretend we got one so we can set the SCI flags. */ if (!acpi_sci_override_gsi) - acpi_sci_ioapic_setup(acpi_fadt.sci_int, acpi_fadt.sci_int, 0, 0); + acpi_sci_ioapic_setup(acpi_fadt.sci_int, 0, 0); /* Fill in identity legacy mapings where no override */ mp_config_acpi_legacy_irqs(); -- cgit v1.2.1 From f6a570333e554b48ad589e7137c77c57809eee81 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 18 Oct 2006 01:47:25 -0400 Subject: [PATCH] severing module.h->sched.h Signed-off-by: Al Viro --- arch/i386/kernel/alternative.c | 1 + arch/i386/kernel/cpu/mcheck/therm_throt.c | 1 + 2 files changed, 2 insertions(+) (limited to 'arch/i386') diff --git a/arch/i386/kernel/alternative.c b/arch/i386/kernel/alternative.c index 583c238e17fb..535f9794fba1 100644 --- a/arch/i386/kernel/alternative.c +++ b/arch/i386/kernel/alternative.c @@ -1,4 +1,5 @@ #include +#include #include #include #include diff --git a/arch/i386/kernel/cpu/mcheck/therm_throt.c b/arch/i386/kernel/cpu/mcheck/therm_throt.c index 2d8703b7ce65..bad8b4420709 100644 --- a/arch/i386/kernel/cpu/mcheck/therm_throt.c +++ b/arch/i386/kernel/cpu/mcheck/therm_throt.c @@ -20,6 +20,7 @@ #include #include #include +#include #include /* How long to wait between reporting thermal events */ -- cgit v1.2.1 From 914e26379decf1fd984b22e51fd2e4209b7a7f1b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 18 Oct 2006 13:55:46 -0400 Subject: [PATCH] severing fs.h, radix-tree.h -> sched.h Signed-off-by: Al Viro --- arch/i386/kernel/acpi/cstate.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/i386') diff --git a/arch/i386/kernel/acpi/cstate.c b/arch/i386/kernel/acpi/cstate.c index 20563e52c622..4664b55f623e 100644 --- a/arch/i386/kernel/acpi/cstate.c +++ b/arch/i386/kernel/acpi/cstate.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include -- cgit v1.2.1 From dc3d1742543fffc79dc4d680ab64d2059e97d809 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 7 Dec 2006 02:14:00 +0100 Subject: [PATCH] i386: Update defconfig Signed-off-by: Andi Kleen --- arch/i386/defconfig | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/defconfig b/arch/i386/defconfig index 97aacd6bd7d8..65891f11aced 100644 --- a/arch/i386/defconfig +++ b/arch/i386/defconfig @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.19-rc2-git4 -# Sat Oct 21 03:38:56 2006 +# Linux kernel version: 2.6.19-git7 +# Wed Dec 6 23:50:49 2006 # CONFIG_X86_32=y CONFIG_GENERIC_TIME=y @@ -40,13 +40,14 @@ CONFIG_POSIX_MQUEUE=y CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y # CONFIG_CPUSETS is not set +CONFIG_SYSFS_DEPRECATED=y # CONFIG_RELAY is not set CONFIG_INITRAMFS_SOURCE="" CONFIG_CC_OPTIMIZE_FOR_SIZE=y CONFIG_SYSCTL=y # CONFIG_EMBEDDED is not set CONFIG_UID16=y -# CONFIG_SYSCTL_SYSCALL is not set +CONFIG_SYSCTL_SYSCALL=y CONFIG_KALLSYMS=y CONFIG_KALLSYMS_ALL=y # CONFIG_KALLSYMS_EXTRA_PASS is not set @@ -110,6 +111,7 @@ CONFIG_SMP=y # CONFIG_X86_VISWS is not set CONFIG_X86_GENERICARCH=y # CONFIG_X86_ES7000 is not set +# CONFIG_PARAVIRT is not set CONFIG_X86_CYCLONE_TIMER=y # CONFIG_M386 is not set # CONFIG_M486 is not set @@ -120,6 +122,7 @@ CONFIG_X86_CYCLONE_TIMER=y # CONFIG_MPENTIUMII is not set CONFIG_MPENTIUMIII=y # CONFIG_MPENTIUMM is not set +# CONFIG_MCORE2 is not set # CONFIG_MPENTIUM4 is not set # CONFIG_MK6 is not set # CONFIG_MK7 is not set @@ -197,7 +200,6 @@ CONFIG_RESOURCES_64BIT=y CONFIG_MTRR=y # CONFIG_EFI is not set # CONFIG_IRQBALANCE is not set -CONFIG_REGPARM=y CONFIG_SECCOMP=y # CONFIG_HZ_100 is not set CONFIG_HZ_250=y @@ -205,7 +207,8 @@ CONFIG_HZ_250=y CONFIG_HZ=250 # CONFIG_KEXEC is not set # CONFIG_CRASH_DUMP is not set -CONFIG_PHYSICAL_START=0x100000 +# CONFIG_RELOCATABLE is not set +CONFIG_PHYSICAL_ALIGN=0x100000 # CONFIG_HOTPLUG_CPU is not set CONFIG_COMPAT_VDSO=y CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y @@ -367,6 +370,7 @@ CONFIG_INET_TCP_DIAG=y # CONFIG_TCP_CONG_ADVANCED is not set CONFIG_TCP_CONG_CUBIC=y CONFIG_DEFAULT_TCP_CONG="cubic" +# CONFIG_TCP_MD5SIG is not set CONFIG_IPV6=y # CONFIG_IPV6_PRIVACY is not set # CONFIG_IPV6_ROUTER_PREF is not set @@ -677,6 +681,7 @@ CONFIG_SATA_INTEL_COMBINED=y # CONFIG_PATA_IT821X is not set # CONFIG_PATA_JMICRON is not set # CONFIG_PATA_TRIFLEX is not set +# CONFIG_PATA_MARVELL is not set # CONFIG_PATA_MPIIX is not set # CONFIG_PATA_OLDPIIX is not set # CONFIG_PATA_NETCELL is not set @@ -850,6 +855,7 @@ CONFIG_BNX2=y # CONFIG_IXGB is not set # CONFIG_S2IO is not set # CONFIG_MYRI10GE is not set +# CONFIG_NETXEN_NIC is not set # # Token Ring devices @@ -984,10 +990,6 @@ CONFIG_RTC=y # CONFIG_R3964 is not set # CONFIG_APPLICOM is not set # CONFIG_SONYPI is not set - -# -# Ftape, the floppy tape device driver -# CONFIG_AGP=y # CONFIG_AGP_ALI is not set # CONFIG_AGP_ATI is not set @@ -1108,6 +1110,7 @@ CONFIG_USB_DEVICEFS=y # CONFIG_USB_BANDWIDTH is not set # CONFIG_USB_DYNAMIC_MINORS is not set # CONFIG_USB_SUSPEND is not set +# CONFIG_USB_MULTITHREAD_PROBE is not set # CONFIG_USB_OTG is not set # @@ -1185,6 +1188,7 @@ CONFIG_USB_HIDINPUT=y # CONFIG_USB_KAWETH is not set # CONFIG_USB_PEGASUS is not set # CONFIG_USB_RTL8150 is not set +# CONFIG_USB_USBNET_MII is not set # CONFIG_USB_USBNET is not set CONFIG_USB_MON=y -- cgit v1.2.1 From 9b483417527f2e47985856867c5716df013227c7 Mon Sep 17 00:00:00 2001 From: Andreas Mohr Date: Thu, 7 Dec 2006 02:14:00 +0100 Subject: [PATCH] i386: fix buggy MTRR address checks Fix checks that failed to realize that values are 4-kB-unit-sized (note the format strings in this same diff context which *do* realize the unit size, via appended "000"!). Also fix an incorrect below-1MB area check (as gathered from Jan Beulich's unapplied patch at http://www.ussg.iu.edu/hypermail/linux/kernel/0411.1/1378.html ) Update mtrr_add_page() docu to make 4-kB-sized calculation more obvious. Given several further items mentioned in Jan's patch mail, all in all MTRR code seems surprisingly buggy, for a surprisingly long period of time (many years). Further work/investigation would be useful. TBD Note that my patch is pretty much UNTESTED, since I can only verify that it TBD successfully boots my machine, but I cannot test against actual buggy TBD hardware which would require these (formerly broken) checks. Long -mm TBD simmering would make sense, especially since these now-working checks might TBD turn out to have adverse effects on unaffected hardware. Signed-off-by: Andreas Mohr Signed-off-by: Andi Kleen Acked-by: Jan Beulich Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/i386/kernel/cpu/mtrr/generic.c | 4 ++-- arch/i386/kernel/cpu/mtrr/main.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/cpu/mtrr/generic.c b/arch/i386/kernel/cpu/mtrr/generic.c index 0b61eed8bbd8..ee8dc6753953 100644 --- a/arch/i386/kernel/cpu/mtrr/generic.c +++ b/arch/i386/kernel/cpu/mtrr/generic.c @@ -366,7 +366,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size, unsigned i printk(KERN_WARNING "mtrr: base(0x%lx000) is not 4 MiB aligned\n", base); return -EINVAL; } - if (!(base + size < 0x70000000 || base > 0x7003FFFF) && + if (!(base + size < 0x70000 || base > 0x7003F) && (type == MTRR_TYPE_WRCOMB || type == MTRR_TYPE_WRBACK)) { printk(KERN_WARNING "mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n"); @@ -374,7 +374,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size, unsigned i } } - if (base + size < 0x100) { + if (base < 0x100) { printk(KERN_WARNING "mtrr: cannot set region below 1 MiB (0x%lx000,0x%lx000)\n", base, size); return -EINVAL; diff --git a/arch/i386/kernel/cpu/mtrr/main.c b/arch/i386/kernel/cpu/mtrr/main.c index fff90bda4733..2b8b0b361ccb 100644 --- a/arch/i386/kernel/cpu/mtrr/main.c +++ b/arch/i386/kernel/cpu/mtrr/main.c @@ -263,8 +263,8 @@ static void set_mtrr(unsigned int reg, unsigned long base, /** * mtrr_add_page - Add a memory type region - * @base: Physical base address of region in pages (4 KB) - * @size: Physical size of region in pages (4 KB) + * @base: Physical base address of region in pages (in units of 4 kB!) + * @size: Physical size of region in pages (4 kB) * @type: Type of MTRR desired * @increment: If this is true do usage counting on the region * -- cgit v1.2.1 From b615ebdac97c648a2ae7d23c5a0bbb3972adf928 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 7 Dec 2006 02:14:00 +0100 Subject: [PATCH] x86: shorten lines in unwinder to be <= 80 characters Andrew complained about > 80 character lines in the new unwinder. Fix that. Signed-off-by: Andi Kleen --- arch/i386/kernel/traps.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index fe9c5e8e7e6f..fe81d89c50d3 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -173,6 +173,8 @@ dump_trace_unwind(struct unwind_frame_info *info, void *data) return n; } +#define MSG(msg) ops->warning(data, msg) + void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, struct stacktrace_ops *ops, void *data) @@ -191,29 +193,31 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, if (unwind_init_frame_info(&info, task, regs) == 0) unw_ret = dump_trace_unwind(&info, &oad); } else if (task == current) - unw_ret = unwind_init_running(&info, dump_trace_unwind, &oad); + unw_ret = unwind_init_running(&info, dump_trace_unwind, + &oad); else { if (unwind_init_blocked(&info, task) == 0) unw_ret = dump_trace_unwind(&info, &oad); } if (unw_ret > 0) { if (call_trace == 1 && !arch_unw_user_mode(&info)) { - ops->warning_symbol(data, "DWARF2 unwinder stuck at %s\n", + ops->warning_symbol(data, + "DWARF2 unwinder stuck at %s\n", UNW_PC(&info)); if (UNW_SP(&info) >= PAGE_OFFSET) { - ops->warning(data, "Leftover inexact backtrace:\n"); + MSG("Leftover inexact backtrace:\n"); stack = (void *)UNW_SP(&info); if (!stack) return; ebp = UNW_FP(&info); } else - ops->warning(data, "Full inexact backtrace again:\n"); + MSG("Full inexact backtrace again:\n"); } else if (call_trace >= 1) return; else - ops->warning(data, "Full inexact backtrace again:\n"); + MSG("Full inexact backtrace again:\n"); } else - ops->warning(data, "Inexact backtrace:\n"); + MSG("Inexact backtrace:\n"); } if (!stack) { unsigned long dummy; -- cgit v1.2.1 From dd315df1767cf56bd4fb8d730fdff4a3d7e15d84 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 7 Dec 2006 02:14:00 +0100 Subject: [PATCH] x86: Compress stack unwinder output The unwinder has some extra newlines, which eat up loads of screen space when it spews. (See https://bugzilla.redhat.com/bugzilla/attachment.cgi?id=137900 for a nasty example). warning_symbol-> and warning-> already printk a newline, so don't add one in the strings passed to them. [AK: redone for new code] Signed-off-by: Dave Jones Signed-off-by: Andi Kleen --- arch/i386/kernel/traps.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index fe81d89c50d3..396041a46e3d 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -202,22 +202,22 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, if (unw_ret > 0) { if (call_trace == 1 && !arch_unw_user_mode(&info)) { ops->warning_symbol(data, - "DWARF2 unwinder stuck at %s\n", + "DWARF2 unwinder stuck at %s", UNW_PC(&info)); if (UNW_SP(&info) >= PAGE_OFFSET) { - MSG("Leftover inexact backtrace:\n"); + MSG("Leftover inexact backtrace:"); stack = (void *)UNW_SP(&info); if (!stack) return; ebp = UNW_FP(&info); } else - MSG("Full inexact backtrace again:\n"); + MSG("Full inexact backtrace again:"); } else if (call_trace >= 1) return; else - MSG("Full inexact backtrace again:\n"); + MSG("Full inexact backtrace again:"); } else - MSG("Inexact backtrace:\n"); + MSG("Inexact backtrace:"); } if (!stack) { unsigned long dummy; -- cgit v1.2.1 From d606f88fa5e529e9dc72be97e79db1e36a6261cb Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 7 Dec 2006 02:14:00 +0100 Subject: [PATCH] i386: fix must_checks Fix __must_check warnings in i386/math-emu. Signed-off-by: Randy Dunlap Signed-off-by: Andi Kleen --- arch/i386/math-emu/fpu_emu.h | 1 + arch/i386/math-emu/fpu_entry.c | 3 ++- arch/i386/math-emu/fpu_system.h | 1 + arch/i386/math-emu/load_store.c | 2 ++ arch/i386/math-emu/reg_ld_str.c | 15 ++++++++++----- 5 files changed, 16 insertions(+), 6 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/math-emu/fpu_emu.h b/arch/i386/math-emu/fpu_emu.h index d62b20a3e660..65120f523853 100644 --- a/arch/i386/math-emu/fpu_emu.h +++ b/arch/i386/math-emu/fpu_emu.h @@ -57,6 +57,7 @@ #define TAG_Special Const(2) /* De-normal, + or - infinity, or Not a Number */ #define TAG_Empty Const(3) /* empty */ +#define TAG_Error Const(0x80) /* probably need to abort */ #define LOADED_DATA Const(10101) /* Special st() number to identify loaded data (not on stack). */ diff --git a/arch/i386/math-emu/fpu_entry.c b/arch/i386/math-emu/fpu_entry.c index d93f16ef828f..ddf8fa3bbd01 100644 --- a/arch/i386/math-emu/fpu_entry.c +++ b/arch/i386/math-emu/fpu_entry.c @@ -742,7 +742,8 @@ int save_i387_soft(void *s387, struct _fpstate __user * buf) S387->fcs &= ~0xf8000000; S387->fos |= 0xffff0000; #endif /* PECULIAR_486 */ - __copy_to_user(d, &S387->cwd, 7*4); + if (__copy_to_user(d, &S387->cwd, 7*4)) + return -1; RE_ENTRANT_CHECK_ON; d += 7*4; diff --git a/arch/i386/math-emu/fpu_system.h b/arch/i386/math-emu/fpu_system.h index bf26341c8bde..a3ae28c49ddd 100644 --- a/arch/i386/math-emu/fpu_system.h +++ b/arch/i386/math-emu/fpu_system.h @@ -68,6 +68,7 @@ #define FPU_access_ok(x,y,z) if ( !access_ok(x,y,z) ) \ math_abort(FPU_info,SIGSEGV) +#define FPU_abort math_abort(FPU_info, SIGSEGV) #undef FPU_IGNORE_CODE_SEGV #ifdef FPU_IGNORE_CODE_SEGV diff --git a/arch/i386/math-emu/load_store.c b/arch/i386/math-emu/load_store.c index 85314be2fef8..eebd6fb1c8a8 100644 --- a/arch/i386/math-emu/load_store.c +++ b/arch/i386/math-emu/load_store.c @@ -227,6 +227,8 @@ int FPU_load_store(u_char type, fpu_addr_modes addr_modes, case 027: /* fild m64int */ clear_C1(); loaded_tag = FPU_load_int64((long long __user *)data_address); + if (loaded_tag == TAG_Error) + return 0; FPU_settag0(loaded_tag); break; case 030: /* fstenv m14/28byte */ diff --git a/arch/i386/math-emu/reg_ld_str.c b/arch/i386/math-emu/reg_ld_str.c index f06ed41d191d..e976caef6498 100644 --- a/arch/i386/math-emu/reg_ld_str.c +++ b/arch/i386/math-emu/reg_ld_str.c @@ -244,7 +244,8 @@ int FPU_load_int64(long long __user *_s) RE_ENTRANT_CHECK_OFF; FPU_access_ok(VERIFY_READ, _s, 8); - copy_from_user(&s,_s,8); + if (copy_from_user(&s,_s,8)) + FPU_abort; RE_ENTRANT_CHECK_ON; if (s == 0) @@ -907,7 +908,8 @@ int FPU_store_int64(FPU_REG *st0_ptr, u_char st0_tag, long long __user *d) RE_ENTRANT_CHECK_OFF; FPU_access_ok(VERIFY_WRITE,d,8); - copy_to_user(d, &tll, 8); + if (copy_to_user(d, &tll, 8)) + FPU_abort; RE_ENTRANT_CHECK_ON; return 1; @@ -1336,7 +1338,8 @@ u_char __user *fstenv(fpu_addr_modes addr_modes, u_char __user *d) I387.soft.fcs &= ~0xf8000000; I387.soft.fos |= 0xffff0000; #endif /* PECULIAR_486 */ - __copy_to_user(d, &control_word, 7*4); + if (__copy_to_user(d, &control_word, 7*4)) + FPU_abort; RE_ENTRANT_CHECK_ON; d += 0x1c; } @@ -1359,9 +1362,11 @@ void fsave(fpu_addr_modes addr_modes, u_char __user *data_address) FPU_access_ok(VERIFY_WRITE,d,80); /* Copy all registers in stack order. */ - __copy_to_user(d, register_base+offset, other); + if (__copy_to_user(d, register_base+offset, other)) + FPU_abort; if ( offset ) - __copy_to_user(d+other, register_base, offset); + if (__copy_to_user(d+other, register_base, offset)) + FPU_abort; RE_ENTRANT_CHECK_ON; finit(); -- cgit v1.2.1 From a63954b5cad5765e52870bb649992bf636f32a6b Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Thu, 7 Dec 2006 02:14:00 +0100 Subject: [PATCH] i386: remove pointless printk from i386 oops output This just got removed on x86-64, do the same on 32bit. It always annoyed me when this ate a line of oops output pushing interesting stuff off the screen. Signed-off-by: Dave Jones Signed-off-by: Andi Kleen --- arch/i386/kernel/traps.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index 396041a46e3d..48ebfab661b7 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -777,7 +777,6 @@ void __kprobes die_nmi(struct pt_regs *regs, const char *msg) printk(" on CPU%d, eip %08lx, registers:\n", smp_processor_id(), regs->eip); show_registers(regs); - printk(KERN_EMERG "console shuts up ...\n"); console_silent(); spin_unlock(&nmi_print_lock); bust_spinlocks(0); -- cgit v1.2.1 From 42ed458aa51337357d7632c64aed4528f923e829 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 7 Dec 2006 02:14:01 +0100 Subject: [PATCH] i386: i386 add X86_FEATURE_PEBS and detection Here is a patch (used by perfmon2) to detect the presence of the Precise Event Based Sampling (PEBS) feature for i386. The patch also adds the cpu_has_pebs macro. - adds X86_FEATURE_PEBS - adds cpu_has_pebs to test for X86_FEATURE_PEBS Signed-off-by: stephane eranian Signed-off-by: Andi Kleen Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/i386/kernel/cpu/intel.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/cpu/intel.c b/arch/i386/kernel/cpu/intel.c index 94a95aa5227e..798c2f617e87 100644 --- a/arch/i386/kernel/cpu/intel.c +++ b/arch/i386/kernel/cpu/intel.c @@ -195,8 +195,14 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) if ((c->x86 == 0xf && c->x86_model >= 0x03) || (c->x86 == 0x6 && c->x86_model >= 0x0e)) set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability); -} + if (cpu_has_ds) { + unsigned int l1; + rdmsr(MSR_IA32_MISC_ENABLE, l1, l2); + if (!(l1 & (1<<12))) + set_bit(X86_FEATURE_PEBS, c->x86_capability); + } +} static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 * c, unsigned int size) { -- cgit v1.2.1 From e5e3a0428968dcc1f9318ce1c941a918e99f8b84 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 7 Dec 2006 02:14:01 +0100 Subject: [PATCH] i386: remove default_ldt, and simplify ldt-setting. This patch removes the default_ldt[] array, as it has been unused since iBCS stopped being supported. This means it is now possible to actually set an empty LDT segment. In order to deal with this, the set_ldt_desc/load_LDT pair has been replaced with a single set_ldt() operation which is responsible for both setting up the LDT descriptor in the GDT, and reloading the LDT register. If there are no LDT entries, the LDT register is loaded with a NULL descriptor. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Cc: Andi Kleen Acked-by: Zachary Amsden Signed-off-by: Andrew Morton --- arch/i386/kernel/ldt.c | 4 +--- arch/i386/kernel/traps.c | 3 --- 2 files changed, 1 insertion(+), 6 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/ldt.c b/arch/i386/kernel/ldt.c index 445211eb2d57..b410e5fb034f 100644 --- a/arch/i386/kernel/ldt.c +++ b/arch/i386/kernel/ldt.c @@ -160,16 +160,14 @@ static int read_default_ldt(void __user * ptr, unsigned long bytecount) { int err; unsigned long size; - void *address; err = 0; - address = &default_ldt[0]; size = 5*sizeof(struct desc_struct); if (size > bytecount) size = bytecount; err = size; - if (copy_to_user(ptr, address, size)) + if (clear_user(ptr, size)) err = -EFAULT; return err; diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index 48ebfab661b7..56655ea8d98f 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -61,9 +61,6 @@ int panic_on_unrecovered_nmi; asmlinkage int system_call(void); -struct desc_struct default_ldt[] = { { 0, 0 }, { 0, 0 }, { 0, 0 }, - { 0, 0 }, { 0, 0 } }; - /* Do we ignore FPU interrupts ? */ char ignore_fpu_irq = 0; -- cgit v1.2.1 From bb81a09e55eaf7e5f798468ab971469b6f66a259 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 7 Dec 2006 02:14:01 +0100 Subject: [PATCH] x86: all cpu backtrace When a spinlock lockup occurs, arrange for the NMI code to emit an all-cpu backtrace, so we get to see which CPU is holding the lock, and where. Cc: Andi Kleen Cc: Ingo Molnar Cc: Badari Pulavarty Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- arch/i386/kernel/nmi.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'arch/i386') diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c index eaafe233a5da..171194ccb7bc 100644 --- a/arch/i386/kernel/nmi.c +++ b/arch/i386/kernel/nmi.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -42,6 +43,8 @@ int nmi_watchdog_enabled; static DEFINE_PER_CPU(unsigned long, perfctr_nmi_owner); static DEFINE_PER_CPU(unsigned long, evntsel_nmi_owner[3]); +static cpumask_t backtrace_mask = CPU_MASK_NONE; + /* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's * offset from MSR_P4_BSU_ESCR0. It will be the max for all platforms (for now) */ @@ -907,6 +910,16 @@ __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) touched = 1; } + if (cpu_isset(cpu, backtrace_mask)) { + static DEFINE_SPINLOCK(lock); /* Serialise the printks */ + + spin_lock(&lock); + printk("NMI backtrace for cpu %d\n", cpu); + dump_stack(); + spin_unlock(&lock); + cpu_clear(cpu, backtrace_mask); + } + sum = per_cpu(irq_stat, cpu).apic_timer_irqs; /* if the apic timer isn't firing, this cpu isn't doing much */ @@ -1033,6 +1046,19 @@ int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, #endif +void __trigger_all_cpu_backtrace(void) +{ + int i; + + backtrace_mask = cpu_online_map; + /* Wait for up to 10 seconds for all CPUs to do the backtrace */ + for (i = 0; i < 10 * 1000; i++) { + if (cpus_empty(backtrace_mask)) + break; + mdelay(1); + } +} + EXPORT_SYMBOL(nmi_active); EXPORT_SYMBOL(nmi_watchdog); EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi); -- cgit v1.2.1 From be44d2aabce2d62f72d5751d1871b6212bf7a1c7 Mon Sep 17 00:00:00 2001 From: Stas Sergeev Date: Thu, 7 Dec 2006 02:14:01 +0100 Subject: [PATCH] i386: espfix cleanup Clean up the espfix code: - Introduced PER_CPU() macro to be used from asm - Introduced GET_DESC_BASE() macro to be used from asm - Rewrote the fixup code in asm, as calling a C code with the altered %ss appeared to be unsafe - No longer altering the stack from a .fixup section - 16bit per-cpu stack is no longer used, instead the stack segment base is patched the way so that the high word of the kernel and user %esp are the same. - Added the limit-patching for the espfix segment. (Chuck Ebbert) [jeremy@goop.org: use the x86 scaling addressing mode rather than shifting] Signed-off-by: Stas Sergeev Signed-off-by: Andi Kleen Acked-by: Zachary Amsden Acked-by: Chuck Ebbert <76306.1226@compuserve.com> Acked-by: Jan Beulich Cc: Andi Kleen Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andrew Morton --- arch/i386/kernel/asm-offsets.c | 5 +++ arch/i386/kernel/cpu/common.c | 11 ------- arch/i386/kernel/entry.S | 73 +++++++++++++++++++----------------------- arch/i386/kernel/head.S | 2 +- arch/i386/kernel/traps.c | 57 +++++++++------------------------ 5 files changed, 55 insertions(+), 93 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c index c80271f8f084..e94d910a28bd 100644 --- a/arch/i386/kernel/asm-offsets.c +++ b/arch/i386/kernel/asm-offsets.c @@ -58,6 +58,11 @@ void foo(void) OFFSET(TI_sysenter_return, thread_info, sysenter_return); BLANK(); + OFFSET(GDS_size, Xgt_desc_struct, size); + OFFSET(GDS_address, Xgt_desc_struct, address); + OFFSET(GDS_pad, Xgt_desc_struct, pad); + BLANK(); + OFFSET(EXEC_DOMAIN_handler, exec_domain, handler); OFFSET(RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext); BLANK(); diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c index d9f3e3c31f05..5532fc4e1bf0 100644 --- a/arch/i386/kernel/cpu/common.c +++ b/arch/i386/kernel/cpu/common.c @@ -24,9 +24,6 @@ DEFINE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr); EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr); -DEFINE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]); -EXPORT_PER_CPU_SYMBOL(cpu_16bit_stack); - static int cachesize_override __cpuinitdata = -1; static int disable_x86_fxsr __cpuinitdata; static int disable_x86_serial_nr __cpuinitdata = 1; @@ -603,7 +600,6 @@ void __cpuinit cpu_init(void) struct tss_struct * t = &per_cpu(init_tss, cpu); struct thread_struct *thread = ¤t->thread; struct desc_struct *gdt; - __u32 stk16_off = (__u32)&per_cpu(cpu_16bit_stack, cpu); struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); if (cpu_test_and_set(cpu, cpu_initialized)) { @@ -651,13 +647,6 @@ old_gdt: * and set up the GDT descriptor: */ memcpy(gdt, cpu_gdt_table, GDT_SIZE); - - /* Set up GDT entry for 16bit stack */ - *(__u64 *)(&gdt[GDT_ENTRY_ESPFIX_SS]) |= - ((((__u64)stk16_off) << 16) & 0x000000ffffff0000ULL) | - ((((__u64)stk16_off) << 32) & 0xff00000000000000ULL) | - (CPU_16BIT_STACK_SIZE - 1); - cpu_gdt_descr->size = GDT_SIZE - 1; cpu_gdt_descr->address = (unsigned long)gdt; diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index 5a63d6fdb70e..c38d801ba0bb 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -48,6 +48,7 @@ #include #include #include +#include #include #include "irq_vectors.h" @@ -418,23 +419,18 @@ ldt_ss: * This is an "official" bug of all the x86-compatible * CPUs, which we can try to work around to make * dosemu and wine happy. */ - subl $8, %esp # reserve space for switch16 pointer - CFI_ADJUST_CFA_OFFSET 8 + movl OLDESP(%esp), %eax + movl %esp, %edx + call patch_espfix_desc + pushl $__ESPFIX_SS + CFI_ADJUST_CFA_OFFSET 4 + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 DISABLE_INTERRUPTS TRACE_IRQS_OFF - movl %esp, %eax - /* Set up the 16bit stack frame with switch32 pointer on top, - * and a switch16 pointer on top of the current frame. */ - call setup_x86_bogus_stack - CFI_ADJUST_CFA_OFFSET -8 # frame has moved - TRACE_IRQS_IRET - RESTORE_REGS - lss 20+4(%esp), %esp # switch to 16bit stack -1: INTERRUPT_RETURN -.section __ex_table,"a" - .align 4 - .long 1b,iret_exc -.previous + lss (%esp), %esp + CFI_ADJUST_CFA_OFFSET -8 + jmp restore_nocheck CFI_ENDPROC # perform work that needs to be done immediately before resumption @@ -524,30 +520,30 @@ syscall_badsys: CFI_ENDPROC #define FIXUP_ESPFIX_STACK \ - movl %esp, %eax; \ - /* switch to 32bit stack using the pointer on top of 16bit stack */ \ - lss %ss:CPU_16BIT_STACK_SIZE-8, %esp; \ - /* copy data from 16bit stack to 32bit stack */ \ - call fixup_x86_bogus_stack; \ - /* put ESP to the proper location */ \ - movl %eax, %esp; -#define UNWIND_ESPFIX_STACK \ + /* since we are on a wrong stack, we cant make it a C code :( */ \ + GET_THREAD_INFO(%ebp); \ + movl TI_cpu(%ebp), %ebx; \ + PER_CPU(cpu_gdt_descr, %ebx); \ + movl GDS_address(%ebx), %ebx; \ + GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \ + addl %esp, %eax; \ + pushl $__KERNEL_DS; \ + CFI_ADJUST_CFA_OFFSET 4; \ pushl %eax; \ CFI_ADJUST_CFA_OFFSET 4; \ + lss (%esp), %esp; \ + CFI_ADJUST_CFA_OFFSET -8; +#define UNWIND_ESPFIX_STACK \ movl %ss, %eax; \ - /* see if on 16bit stack */ \ + /* see if on espfix stack */ \ cmpw $__ESPFIX_SS, %ax; \ - je 28f; \ -27: popl %eax; \ - CFI_ADJUST_CFA_OFFSET -4; \ -.section .fixup,"ax"; \ -28: movl $__KERNEL_DS, %eax; \ + jne 27f; \ + movl $__KERNEL_DS, %eax; \ movl %eax, %ds; \ movl %eax, %es; \ - /* switch to 32bit stack */ \ + /* switch to normal stack */ \ FIXUP_ESPFIX_STACK; \ - jmp 27b; \ -.previous +27:; /* * Build the entry stubs and pointer table with @@ -614,7 +610,6 @@ error_code: pushl %eax CFI_ADJUST_CFA_OFFSET 4 CFI_REL_OFFSET eax, 0 - xorl %eax, %eax pushl %ebp CFI_ADJUST_CFA_OFFSET 4 CFI_REL_OFFSET ebp, 0 @@ -627,7 +622,6 @@ error_code: pushl %edx CFI_ADJUST_CFA_OFFSET 4 CFI_REL_OFFSET edx, 0 - decl %eax # eax = -1 pushl %ecx CFI_ADJUST_CFA_OFFSET 4 CFI_REL_OFFSET ecx, 0 @@ -644,7 +638,7 @@ error_code: /*CFI_REGISTER es, ecx*/ movl ES(%esp), %edi # get the function address movl ORIG_EAX(%esp), %edx # get the error code - movl %eax, ORIG_EAX(%esp) + movl $-1, ORIG_EAX(%esp) movl %ecx, ES(%esp) /*CFI_REL_OFFSET es, ES*/ movl $(__USER_DS), %ecx @@ -754,7 +748,7 @@ KPROBE_ENTRY(nmi) cmpw $__ESPFIX_SS, %ax popl %eax CFI_ADJUST_CFA_OFFSET -4 - je nmi_16bit_stack + je nmi_espfix_stack cmpl $sysenter_entry,(%esp) je nmi_stack_fixup pushl %eax @@ -797,7 +791,7 @@ nmi_debug_stack_check: FIX_STACK(24,nmi_stack_correct, 1) jmp nmi_stack_correct -nmi_16bit_stack: +nmi_espfix_stack: /* We have a RING0_INT_FRAME here. * * create the pointer to lss back @@ -806,7 +800,6 @@ nmi_16bit_stack: CFI_ADJUST_CFA_OFFSET 4 pushl %esp CFI_ADJUST_CFA_OFFSET 4 - movzwl %sp, %esp addw $4, (%esp) /* copy the iret frame of 12 bytes */ .rept 3 @@ -817,11 +810,11 @@ nmi_16bit_stack: CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL FIXUP_ESPFIX_STACK # %eax == %esp - CFI_ADJUST_CFA_OFFSET -20 # the frame has now moved xorl %edx,%edx # zero error code call do_nmi RESTORE_REGS - lss 12+4(%esp), %esp # back to 16bit stack + lss 12+4(%esp), %esp # back to espfix stack + CFI_ADJUST_CFA_OFFSET -24 1: INTERRUPT_RETURN CFI_ENDPROC .section __ex_table,"a" diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S index ca31f18d277c..b1f1df11fcc6 100644 --- a/arch/i386/kernel/head.S +++ b/arch/i386/kernel/head.S @@ -584,7 +584,7 @@ ENTRY(cpu_gdt_table) .quad 0x00009a000000ffff /* 0xc0 APM CS 16 code (16 bit) */ .quad 0x004092000000ffff /* 0xc8 APM DS data */ - .quad 0x0000920000000000 /* 0xd0 - ESPFIX 16-bit SS */ + .quad 0x00c0920000000000 /* 0xd0 - ESPFIX SS */ .quad 0x0000000000000000 /* 0xd8 - unused */ .quad 0x0000000000000000 /* 0xe0 - unused */ .quad 0x0000000000000000 /* 0xe8 - unused */ diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index 56655ea8d98f..f9bb1f89d687 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -1088,49 +1088,24 @@ fastcall void do_spurious_interrupt_bug(struct pt_regs * regs, #endif } -fastcall void setup_x86_bogus_stack(unsigned char * stk) +fastcall unsigned long patch_espfix_desc(unsigned long uesp, + unsigned long kesp) { - unsigned long *switch16_ptr, *switch32_ptr; - struct pt_regs *regs; - unsigned long stack_top, stack_bot; - unsigned short iret_frame16_off; int cpu = smp_processor_id(); - /* reserve the space on 32bit stack for the magic switch16 pointer */ - memmove(stk, stk + 8, sizeof(struct pt_regs)); - switch16_ptr = (unsigned long *)(stk + sizeof(struct pt_regs)); - regs = (struct pt_regs *)stk; - /* now the switch32 on 16bit stack */ - stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu); - stack_top = stack_bot + CPU_16BIT_STACK_SIZE; - switch32_ptr = (unsigned long *)(stack_top - 8); - iret_frame16_off = CPU_16BIT_STACK_SIZE - 8 - 20; - /* copy iret frame on 16bit stack */ - memcpy((void *)(stack_bot + iret_frame16_off), ®s->eip, 20); - /* fill in the switch pointers */ - switch16_ptr[0] = (regs->esp & 0xffff0000) | iret_frame16_off; - switch16_ptr[1] = __ESPFIX_SS; - switch32_ptr[0] = (unsigned long)stk + sizeof(struct pt_regs) + - 8 - CPU_16BIT_STACK_SIZE; - switch32_ptr[1] = __KERNEL_DS; -} - -fastcall unsigned char * fixup_x86_bogus_stack(unsigned short sp) -{ - unsigned long *switch32_ptr; - unsigned char *stack16, *stack32; - unsigned long stack_top, stack_bot; - int len; - int cpu = smp_processor_id(); - stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu); - stack_top = stack_bot + CPU_16BIT_STACK_SIZE; - switch32_ptr = (unsigned long *)(stack_top - 8); - /* copy the data from 16bit stack to 32bit stack */ - len = CPU_16BIT_STACK_SIZE - 8 - sp; - stack16 = (unsigned char *)(stack_bot + sp); - stack32 = (unsigned char *) - (switch32_ptr[0] + CPU_16BIT_STACK_SIZE - 8 - len); - memcpy(stack32, stack16, len); - return stack32; + struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); + struct desc_struct *gdt = (struct desc_struct *)cpu_gdt_descr->address; + unsigned long base = (kesp - uesp) & -THREAD_SIZE; + unsigned long new_kesp = kesp - base; + unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT; + __u64 desc = *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS]; + /* Set up base for espfix segment */ + desc &= 0x00f0ff0000000000ULL; + desc |= ((((__u64)base) << 16) & 0x000000ffffff0000ULL) | + ((((__u64)base) << 32) & 0xff00000000000000ULL) | + ((((__u64)lim_pages) << 32) & 0x000f000000000000ULL) | + (lim_pages & 0xffff); + *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS] = desc; + return new_kesp; } /* -- cgit v1.2.1 From acc207616a91a413a50fdd8847a747c4a7324167 Mon Sep 17 00:00:00 2001 From: Chuck Ebbert <76306.1226@compuserve.com> Date: Thu, 7 Dec 2006 02:14:01 +0100 Subject: [PATCH] i386: add sleazy FPU optimization i386 port of the sLeAZY-fpu feature. Chuck reports that this gives him a +/- 0.4% improvement on his simple benchmark x86_64 description follows: Right now the kernel on x86-64 has a 100% lazy fpu behavior: after *every* context switch a trap is taken for the first FPU use to restore the FPU context lazily. This is of course great for applications that have very sporadic or no FPU use (since then you avoid doing the expensive save/restore all the time). However for very frequent FPU users... you take an extra trap every context switch. The patch below adds a simple heuristic to this code: After 5 consecutive context switches of FPU use, the lazy behavior is disabled and the context gets restored every context switch. If the app indeed uses the FPU, the trap is avoided. (the chance of the 6th time slice using FPU after the previous 5 having done so are quite high obviously). After 256 switches, this is reset and lazy behavior is returned (until there are 5 consecutive ones again). The reason for this is to give apps that do longer bursts of FPU use still the lazy behavior back after some time. Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com> Signed-off-by: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- arch/i386/kernel/process.c | 12 ++++++++++++ arch/i386/kernel/traps.c | 3 ++- 2 files changed, 14 insertions(+), 1 deletion(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index dd53c58f64f1..ae924c416b68 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -648,6 +648,11 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas __unlazy_fpu(prev_p); + + /* we're going to use this soon, after a few expensive things */ + if (next_p->fpu_counter > 5) + prefetch(&next->i387.fxsave); + /* * Reload esp0. */ @@ -697,6 +702,13 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas disable_tsc(prev_p, next_p); + /* If the task has used fpu the last 5 timeslices, just do a full + * restore of the math state immediately to avoid the trap; the + * chances of needing FPU soon are obviously high now + */ + if (next_p->fpu_counter > 5) + math_state_restore(); + return prev_p; } diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index f9bb1f89d687..4a6fa2837df2 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -1118,7 +1118,7 @@ fastcall unsigned long patch_espfix_desc(unsigned long uesp, * Must be called with kernel preemption disabled (in this case, * local interrupts are disabled at the call-site in entry.S). */ -asmlinkage void math_state_restore(struct pt_regs regs) +asmlinkage void math_state_restore(void) { struct thread_info *thread = current_thread_info(); struct task_struct *tsk = thread->task; @@ -1128,6 +1128,7 @@ asmlinkage void math_state_restore(struct pt_regs regs) init_fpu(tsk); restore_fpu(tsk); thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ + tsk->fpu_counter++; } #ifndef CONFIG_MATH_EMULATION -- cgit v1.2.1 From c0e84b9901c0924e2503c0aab3772a4469ba4aef Mon Sep 17 00:00:00 2001 From: Amol Lad Date: Thu, 7 Dec 2006 02:14:02 +0100 Subject: [PATCH] i386: Add iounmap in error paths in hpet code Signed-off-by: Amol Lad Signed-off-by: Andi Kleen --- arch/i386/kernel/time_hpet.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/time_hpet.c b/arch/i386/kernel/time_hpet.c index 1a2a979cf6a3..1e4702dfcd01 100644 --- a/arch/i386/kernel/time_hpet.c +++ b/arch/i386/kernel/time_hpet.c @@ -132,14 +132,20 @@ int __init hpet_enable(void) * the single HPET timer for system time. */ #ifdef CONFIG_HPET_EMULATE_RTC - if (!(id & HPET_ID_NUMBER)) + if (!(id & HPET_ID_NUMBER)) { + iounmap(hpet_virt_address); + hpet_virt_address = NULL; return -1; + } #endif hpet_period = hpet_readl(HPET_PERIOD); - if ((hpet_period < HPET_MIN_PERIOD) || (hpet_period > HPET_MAX_PERIOD)) + if ((hpet_period < HPET_MIN_PERIOD) || (hpet_period > HPET_MAX_PERIOD)) { + iounmap(hpet_virt_address); + hpet_virt_address = NULL; return -1; + } /* * 64 bit math @@ -156,8 +162,11 @@ int __init hpet_enable(void) hpet_use_timer = id & HPET_ID_LEGSUP; - if (hpet_timer_stop_set_go(hpet_tick)) + if (hpet_timer_stop_set_go(hpet_tick)) { + iounmap(hpet_virt_address); + hpet_virt_address = NULL; return -1; + } use_hpet = 1; -- cgit v1.2.1 From fa5cecd111d235819a1d807d43216ae459a0dd6f Mon Sep 17 00:00:00 2001 From: Amol Lad Date: Thu, 7 Dec 2006 02:14:02 +0100 Subject: [PATCH] i386: add missing iounmap in i386 hpet clocksource code ioremap must be balanced by an iounmap and failing to do so can result in a memory leak. Tested (compilation only): - using allmodconfig - making sure the files are compiling without any warning/error due to new changes Signed-off-by: Amol Lad Signed-off-by: Andi Kleen --- arch/i386/kernel/hpet.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/hpet.c b/arch/i386/kernel/hpet.c index 17647a530b2f..45a8685bb60b 100644 --- a/arch/i386/kernel/hpet.c +++ b/arch/i386/kernel/hpet.c @@ -34,6 +34,7 @@ static int __init init_hpet_clocksource(void) unsigned long hpet_period; void __iomem* hpet_base; u64 tmp; + int err; if (!is_hpet_enabled()) return -ENODEV; @@ -61,7 +62,11 @@ static int __init init_hpet_clocksource(void) do_div(tmp, FSEC_PER_NSEC); clocksource_hpet.mult = (u32)tmp; - return clocksource_register(&clocksource_hpet); + err = clocksource_register(&clocksource_hpet); + if (err) + iounmap(hpet_base); + + return err; } module_init(init_hpet_clocksource); -- cgit v1.2.1 From eb5b7b9d86f46b45ba1f986302fdf7df84fb8297 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 7 Dec 2006 02:14:02 +0100 Subject: [PATCH] i386: Use asm-offsets for the offsets of registers into the pt_regs struct Use asm-offsets for the offsets of registers into the pt_regs struct, rather than having hard-coded constants I left the constants in the comments of entry.S because they're useful for reference; the code in entry.S is very dependent on the layout of pt_regs, even when using asm-offsets. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Cc: Keith Owens Signed-off-by: Andrew Morton --- arch/i386/kernel/asm-offsets.c | 17 ++++++ arch/i386/kernel/entry.S | 120 ++++++++++++++++++----------------------- 2 files changed, 69 insertions(+), 68 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c index e94d910a28bd..70b19807acf9 100644 --- a/arch/i386/kernel/asm-offsets.c +++ b/arch/i386/kernel/asm-offsets.c @@ -63,6 +63,23 @@ void foo(void) OFFSET(GDS_pad, Xgt_desc_struct, pad); BLANK(); + OFFSET(PT_EBX, pt_regs, ebx); + OFFSET(PT_ECX, pt_regs, ecx); + OFFSET(PT_EDX, pt_regs, edx); + OFFSET(PT_ESI, pt_regs, esi); + OFFSET(PT_EDI, pt_regs, edi); + OFFSET(PT_EBP, pt_regs, ebp); + OFFSET(PT_EAX, pt_regs, eax); + OFFSET(PT_DS, pt_regs, xds); + OFFSET(PT_ES, pt_regs, xes); + OFFSET(PT_ORIG_EAX, pt_regs, orig_eax); + OFFSET(PT_EIP, pt_regs, eip); + OFFSET(PT_CS, pt_regs, xcs); + OFFSET(PT_EFLAGS, pt_regs, eflags); + OFFSET(PT_OLDESP, pt_regs, esp); + OFFSET(PT_OLDSS, pt_regs, xss); + BLANK(); + OFFSET(EXEC_DOMAIN_handler, exec_domain, handler); OFFSET(RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext); BLANK(); diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index c38d801ba0bb..0069bf01603e 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -54,22 +54,6 @@ #define nr_syscalls ((syscall_table_size)/4) -EBX = 0x00 -ECX = 0x04 -EDX = 0x08 -ESI = 0x0C -EDI = 0x10 -EBP = 0x14 -EAX = 0x18 -DS = 0x1C -ES = 0x20 -ORIG_EAX = 0x24 -EIP = 0x28 -CS = 0x2C -EFLAGS = 0x30 -OLDESP = 0x34 -OLDSS = 0x38 - CF_MASK = 0x00000001 TF_MASK = 0x00000100 IF_MASK = 0x00000200 @@ -93,7 +77,7 @@ VM_MASK = 0x00020000 .macro TRACE_IRQS_IRET #ifdef CONFIG_TRACE_IRQFLAGS - testl $IF_MASK,EFLAGS(%esp) # interrupts off? + testl $IF_MASK,PT_EFLAGS(%esp) # interrupts off? jz 1f TRACE_IRQS_ON 1: @@ -199,18 +183,18 @@ VM_MASK = 0x00020000 #define RING0_PTREGS_FRAME \ CFI_STARTPROC simple;\ CFI_SIGNAL_FRAME;\ - CFI_DEF_CFA esp, OLDESP-EBX;\ - /*CFI_OFFSET cs, CS-OLDESP;*/\ - CFI_OFFSET eip, EIP-OLDESP;\ - /*CFI_OFFSET es, ES-OLDESP;*/\ - /*CFI_OFFSET ds, DS-OLDESP;*/\ - CFI_OFFSET eax, EAX-OLDESP;\ - CFI_OFFSET ebp, EBP-OLDESP;\ - CFI_OFFSET edi, EDI-OLDESP;\ - CFI_OFFSET esi, ESI-OLDESP;\ - CFI_OFFSET edx, EDX-OLDESP;\ - CFI_OFFSET ecx, ECX-OLDESP;\ - CFI_OFFSET ebx, EBX-OLDESP + CFI_DEF_CFA esp, PT_OLDESP-PT_EBX;\ + /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/\ + CFI_OFFSET eip, PT_EIP-PT_OLDESP;\ + /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/\ + /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/\ + CFI_OFFSET eax, PT_EAX-PT_OLDESP;\ + CFI_OFFSET ebp, PT_EBP-PT_OLDESP;\ + CFI_OFFSET edi, PT_EDI-PT_OLDESP;\ + CFI_OFFSET esi, PT_ESI-PT_OLDESP;\ + CFI_OFFSET edx, PT_EDX-PT_OLDESP;\ + CFI_OFFSET ecx, PT_ECX-PT_OLDESP;\ + CFI_OFFSET ebx, PT_EBX-PT_OLDESP ENTRY(ret_from_fork) CFI_STARTPROC @@ -242,8 +226,8 @@ ret_from_exception: ret_from_intr: GET_THREAD_INFO(%ebp) check_userspace: - movl EFLAGS(%esp), %eax # mix EFLAGS and CS - movb CS(%esp), %al + movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS + movb PT_CS(%esp), %al andl $(VM_MASK | SEGMENT_RPL_MASK), %eax cmpl $USER_RPL, %eax jb resume_kernel # not returning to v8086 or userspace @@ -266,7 +250,7 @@ need_resched: movl TI_flags(%ebp), %ecx # need_resched set ? testb $_TIF_NEED_RESCHED, %cl jz restore_all - testl $IF_MASK,EFLAGS(%esp) # interrupts off (exception path) ? + testl $IF_MASK,PT_EFLAGS(%esp) # interrupts off (exception path) ? jz restore_all call preempt_schedule_irq jmp need_resched @@ -332,15 +316,15 @@ sysenter_past_esp: cmpl $(nr_syscalls), %eax jae syscall_badsys call *sys_call_table(,%eax,4) - movl %eax,EAX(%esp) + movl %eax,PT_EAX(%esp) DISABLE_INTERRUPTS TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx testw $_TIF_ALLWORK_MASK, %cx jne syscall_exit_work /* if something modifies registers it must also disable sysexit */ - movl EIP(%esp), %edx - movl OLDESP(%esp), %ecx + movl PT_EIP(%esp), %edx + movl PT_OLDESP(%esp), %ecx xorl %ebp,%ebp TRACE_IRQS_ON ENABLE_INTERRUPTS_SYSEXIT @@ -354,7 +338,7 @@ ENTRY(system_call) CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL GET_THREAD_INFO(%ebp) - testl $TF_MASK,EFLAGS(%esp) + testl $TF_MASK,PT_EFLAGS(%esp) jz no_singlestep orl $_TIF_SINGLESTEP,TI_flags(%ebp) no_singlestep: @@ -366,7 +350,7 @@ no_singlestep: jae syscall_badsys syscall_call: call *sys_call_table(,%eax,4) - movl %eax,EAX(%esp) # store the return value + movl %eax,PT_EAX(%esp) # store the return value syscall_exit: DISABLE_INTERRUPTS # make sure we don't miss an interrupt # setting need_resched or sigpending @@ -377,12 +361,12 @@ syscall_exit: jne syscall_exit_work restore_all: - movl EFLAGS(%esp), %eax # mix EFLAGS, SS and CS - # Warning: OLDSS(%esp) contains the wrong/random values if we + movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS + # Warning: PT_OLDSS(%esp) contains the wrong/random values if we # are returning to the kernel. # See comments in process.c:copy_thread() for details. - movb OLDSS(%esp), %ah - movb CS(%esp), %al + movb PT_OLDSS(%esp), %ah + movb PT_CS(%esp), %al andl $(VM_MASK | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax CFI_REMEMBER_STATE @@ -409,7 +393,7 @@ iret_exc: CFI_RESTORE_STATE ldt_ss: - larl OLDSS(%esp), %eax + larl PT_OLDSS(%esp), %eax jnz restore_nocheck testl $0x00400000, %eax # returning to 32bit stack? jnz restore_nocheck # allright, normal return @@ -419,7 +403,7 @@ ldt_ss: * This is an "official" bug of all the x86-compatible * CPUs, which we can try to work around to make * dosemu and wine happy. */ - movl OLDESP(%esp), %eax + movl PT_OLDESP(%esp), %eax movl %esp, %edx call patch_espfix_desc pushl $__ESPFIX_SS @@ -454,7 +438,7 @@ work_resched: work_notifysig: # deal with pending signals and # notify-resume requests - testl $VM_MASK, EFLAGS(%esp) + testl $VM_MASK, PT_EFLAGS(%esp) movl %esp, %eax jne work_notifysig_v86 # returning to kernel-space or # vm86-space @@ -479,14 +463,14 @@ work_notifysig_v86: # perform syscall exit tracing ALIGN syscall_trace_entry: - movl $-ENOSYS,EAX(%esp) + movl $-ENOSYS,PT_EAX(%esp) movl %esp, %eax xorl %edx,%edx call do_syscall_trace cmpl $0, %eax jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU, # so must skip actual syscall - movl ORIG_EAX(%esp), %eax + movl PT_ORIG_EAX(%esp), %eax cmpl $(nr_syscalls), %eax jnae syscall_call jmp syscall_exit @@ -511,11 +495,11 @@ syscall_fault: CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL GET_THREAD_INFO(%ebp) - movl $-EFAULT,EAX(%esp) + movl $-EFAULT,PT_EAX(%esp) jmp resume_userspace syscall_badsys: - movl $-ENOSYS,EAX(%esp) + movl $-ENOSYS,PT_EAX(%esp) jmp resume_userspace CFI_ENDPROC @@ -636,10 +620,10 @@ error_code: popl %ecx CFI_ADJUST_CFA_OFFSET -4 /*CFI_REGISTER es, ecx*/ - movl ES(%esp), %edi # get the function address - movl ORIG_EAX(%esp), %edx # get the error code - movl $-1, ORIG_EAX(%esp) - movl %ecx, ES(%esp) + movl PT_ES(%esp), %edi # get the function address + movl PT_ORIG_EAX(%esp), %edx # get the error code + movl $-1, PT_ORIG_EAX(%esp) + movl %ecx, PT_ES(%esp) /*CFI_REL_OFFSET es, ES*/ movl $(__USER_DS), %ecx movl %ecx, %ds @@ -942,26 +926,26 @@ ENTRY(arch_unwind_init_running) movl 4(%esp), %edx movl (%esp), %ecx leal 4(%esp), %eax - movl %ebx, EBX(%edx) + movl %ebx, PT_EBX(%edx) xorl %ebx, %ebx - movl %ebx, ECX(%edx) - movl %ebx, EDX(%edx) - movl %esi, ESI(%edx) - movl %edi, EDI(%edx) - movl %ebp, EBP(%edx) - movl %ebx, EAX(%edx) - movl $__USER_DS, DS(%edx) - movl $__USER_DS, ES(%edx) - movl %ebx, ORIG_EAX(%edx) - movl %ecx, EIP(%edx) + movl %ebx, PT_ECX(%edx) + movl %ebx, PT_EDX(%edx) + movl %esi, PT_ESI(%edx) + movl %edi, PT_EDI(%edx) + movl %ebp, PT_EBP(%edx) + movl %ebx, PT_EAX(%edx) + movl $__USER_DS, PT_DS(%edx) + movl $__USER_DS, PT_ES(%edx) + movl %ebx, PT_ORIG_EAX(%edx) + movl %ecx, PT_EIP(%edx) movl 12(%esp), %ecx - movl $__KERNEL_CS, CS(%edx) - movl %ebx, EFLAGS(%edx) - movl %eax, OLDESP(%edx) + movl $__KERNEL_CS, PT_CS(%edx) + movl %ebx, PT_EFLAGS(%edx) + movl %eax, PT_OLDESP(%edx) movl 8(%esp), %eax movl %ecx, 8(%esp) - movl EBX(%edx), %ebx - movl $__KERNEL_DS, OLDSS(%edx) + movl PT_EBX(%edx), %ebx + movl $__KERNEL_DS, PT_OLDSS(%edx) jmpl *%eax CFI_ENDPROC ENDPROC(arch_unwind_init_running) -- cgit v1.2.1 From 9ca36101a8d74704d78f10910f89d62de96f9dc8 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 7 Dec 2006 02:14:02 +0100 Subject: [PATCH] i386: Basic definitions for i386-pda This patch has the basic definitions of struct i386_pda, and the segment selector in the GDT. asm-i386/pda.h is more or less a direct copy of asm-x86_64/pda.h. The most interesting difference is the use of _proxy_pda, which is used to give gcc a model for the actual memory operations on the real pda structure. No actual reference is ever made to _proxy_pda, so it is never defined. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Cc: Chuck Ebbert <76306.1226@compuserve.com> Cc: Zachary Amsden Cc: Jan Beulich Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/i386/kernel/head.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S index b1f1df11fcc6..4a83384c5a61 100644 --- a/arch/i386/kernel/head.S +++ b/arch/i386/kernel/head.S @@ -585,7 +585,7 @@ ENTRY(cpu_gdt_table) .quad 0x004092000000ffff /* 0xc8 APM DS data */ .quad 0x00c0920000000000 /* 0xd0 - ESPFIX SS */ - .quad 0x0000000000000000 /* 0xd8 - unused */ + .quad 0x0000000000000000 /* 0xd8 - PDA */ .quad 0x0000000000000000 /* 0xe0 - unused */ .quad 0x0000000000000000 /* 0xe8 - unused */ .quad 0x0000000000000000 /* 0xf0 - unused */ -- cgit v1.2.1 From 62111195800d80c66cdc69063ea3145878c99fbf Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 7 Dec 2006 02:14:02 +0100 Subject: [PATCH] i386: Initialize the per-CPU data area When a CPU is brought up, a PDA and GDT are allocated for it. The GDT's __KERNEL_PDA entry is pointed to the allocated PDA memory, so that all references using this segment descriptor will refer to the PDA. This patch rearranges CPU initialization a bit, so that the GDT/PDA are set up as early as possible in cpu_init(). Also for secondary CPUs, GDT+PDA are preallocated and initialized so all the secondary CPU needs to do is set up the ldt and load %gs. This will be important once smp_processor_id() and current use the PDA. In all cases, the PDA is set up in head.S, before a CPU starts running C code, so the PDA is always available. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Cc: Chuck Ebbert <76306.1226@compuserve.com> Cc: Zachary Amsden Cc: Jan Beulich Cc: Andi Kleen Cc: James Bottomley Cc: Matt Tolentino Signed-off-by: Andrew Morton --- arch/i386/kernel/cpu/common.c | 177 ++++++++++++++++++++++++++--------- arch/i386/kernel/smpboot.c | 28 ++++-- arch/i386/mach-voyager/voyager_smp.c | 14 ++- 3 files changed, 169 insertions(+), 50 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c index 5532fc4e1bf0..2534e25ed745 100644 --- a/arch/i386/kernel/cpu/common.c +++ b/arch/i386/kernel/cpu/common.c @@ -18,12 +18,16 @@ #include #include #endif +#include #include "cpu.h" DEFINE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr); EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr); +struct i386_pda *_cpu_pda[NR_CPUS] __read_mostly; +EXPORT_SYMBOL(_cpu_pda); + static int cachesize_override __cpuinitdata = -1; static int disable_x86_fxsr __cpuinitdata; static int disable_x86_serial_nr __cpuinitdata = 1; @@ -588,41 +592,16 @@ void __init early_cpu_init(void) disable_pse = 1; #endif } -/* - * cpu_init() initializes state that is per-CPU. Some data is already - * initialized (naturally) in the bootstrap process, such as the GDT - * and IDT. We reload them nevertheless, this function acts as a - * 'CPU state barrier', nothing should get across. - */ -void __cpuinit cpu_init(void) + +__cpuinit int alloc_gdt(int cpu) { - int cpu = smp_processor_id(); - struct tss_struct * t = &per_cpu(init_tss, cpu); - struct thread_struct *thread = ¤t->thread; - struct desc_struct *gdt; struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); + struct desc_struct *gdt; + struct i386_pda *pda; - if (cpu_test_and_set(cpu, cpu_initialized)) { - printk(KERN_WARNING "CPU#%d already initialized!\n", cpu); - for (;;) local_irq_enable(); - } - printk(KERN_INFO "Initializing CPU#%d\n", cpu); + gdt = (struct desc_struct *)cpu_gdt_descr->address; + pda = cpu_pda(cpu); - if (cpu_has_vme || cpu_has_tsc || cpu_has_de) - clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); - if (tsc_disable && cpu_has_tsc) { - printk(KERN_NOTICE "Disabling TSC...\n"); - /**** FIX-HPA: DOES THIS REALLY BELONG HERE? ****/ - clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability); - set_in_cr4(X86_CR4_TSD); - } - - /* The CPU hotplug case */ - if (cpu_gdt_descr->address) { - gdt = (struct desc_struct *)cpu_gdt_descr->address; - memset(gdt, 0, PAGE_SIZE); - goto old_gdt; - } /* * This is a horrible hack to allocate the GDT. The problem * is that cpu_init() is called really early for the boot CPU @@ -630,36 +609,117 @@ void __cpuinit cpu_init(void) * CPUs, when bootmem will have gone away */ if (NODE_DATA(0)->bdata->node_bootmem_map) { - gdt = (struct desc_struct *)alloc_bootmem_pages(PAGE_SIZE); - /* alloc_bootmem_pages panics on failure, so no check */ + BUG_ON(gdt != NULL || pda != NULL); + + gdt = alloc_bootmem_pages(PAGE_SIZE); + pda = alloc_bootmem(sizeof(*pda)); + /* alloc_bootmem(_pages) panics on failure, so no check */ + memset(gdt, 0, PAGE_SIZE); + memset(pda, 0, sizeof(*pda)); } else { - gdt = (struct desc_struct *)get_zeroed_page(GFP_KERNEL); - if (unlikely(!gdt)) { - printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu); - for (;;) - local_irq_enable(); + /* GDT and PDA might already have been allocated if + this is a CPU hotplug re-insertion. */ + if (gdt == NULL) + gdt = (struct desc_struct *)get_zeroed_page(GFP_KERNEL); + + if (pda == NULL) + pda = kmalloc_node(sizeof(*pda), GFP_KERNEL, cpu_to_node(cpu)); + + if (unlikely(!gdt || !pda)) { + free_pages((unsigned long)gdt, 0); + kfree(pda); + return 0; } } -old_gdt: + + cpu_gdt_descr->address = (unsigned long)gdt; + cpu_pda(cpu) = pda; + + return 1; +} + +/* Initial PDA used by boot CPU */ +struct i386_pda boot_pda = { + ._pda = &boot_pda, +}; + +/* Initialize the CPU's GDT and PDA. The boot CPU does this for + itself, but secondaries find this done for them. */ +__cpuinit int init_gdt(int cpu, struct task_struct *idle) +{ + struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); + struct desc_struct *gdt; + struct i386_pda *pda; + + /* For non-boot CPUs, the GDT and PDA should already have been + allocated. */ + if (!alloc_gdt(cpu)) { + printk(KERN_CRIT "CPU%d failed to allocate GDT or PDA\n", cpu); + return 0; + } + + gdt = (struct desc_struct *)cpu_gdt_descr->address; + pda = cpu_pda(cpu); + + BUG_ON(gdt == NULL || pda == NULL); + /* * Initialize the per-CPU GDT with the boot GDT, * and set up the GDT descriptor: */ memcpy(gdt, cpu_gdt_table, GDT_SIZE); cpu_gdt_descr->size = GDT_SIZE - 1; - cpu_gdt_descr->address = (unsigned long)gdt; + pack_descriptor((u32 *)&gdt[GDT_ENTRY_PDA].a, + (u32 *)&gdt[GDT_ENTRY_PDA].b, + (unsigned long)pda, sizeof(*pda) - 1, + 0x80 | DESCTYPE_S | 0x2, 0); /* present read-write data segment */ + + memset(pda, 0, sizeof(*pda)); + pda->_pda = pda; + + return 1; +} + +/* Common CPU init for both boot and secondary CPUs */ +static void __cpuinit _cpu_init(int cpu, struct task_struct *curr) +{ + struct tss_struct * t = &per_cpu(init_tss, cpu); + struct thread_struct *thread = &curr->thread; + struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); + + /* Reinit these anyway, even if they've already been done (on + the boot CPU, this will transition from the boot gdt+pda to + the real ones). */ load_gdt(cpu_gdt_descr); + + if (cpu_test_and_set(cpu, cpu_initialized)) { + printk(KERN_WARNING "CPU#%d already initialized!\n", cpu); + for (;;) local_irq_enable(); + } + + printk(KERN_INFO "Initializing CPU#%d\n", cpu); + + if (cpu_has_vme || cpu_has_tsc || cpu_has_de) + clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); + if (tsc_disable && cpu_has_tsc) { + printk(KERN_NOTICE "Disabling TSC...\n"); + /**** FIX-HPA: DOES THIS REALLY BELONG HERE? ****/ + clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability); + set_in_cr4(X86_CR4_TSD); + } + load_idt(&idt_descr); /* * Set up and load the per-CPU TSS and LDT */ atomic_inc(&init_mm.mm_count); - current->active_mm = &init_mm; - BUG_ON(current->mm); - enter_lazy_tlb(&init_mm, current); + curr->active_mm = &init_mm; + if (curr->mm) + BUG(); + enter_lazy_tlb(&init_mm, curr); load_esp0(t, thread); set_tss_desc(cpu,t); @@ -690,6 +750,37 @@ old_gdt: mxcsr_feature_mask_init(); } +/* Entrypoint to initialize secondary CPU */ +void __cpuinit secondary_cpu_init(void) +{ + int cpu = smp_processor_id(); + struct task_struct *curr = current; + + _cpu_init(cpu, curr); +} + +/* + * cpu_init() initializes state that is per-CPU. Some data is already + * initialized (naturally) in the bootstrap process, such as the GDT + * and IDT. We reload them nevertheless, this function acts as a + * 'CPU state barrier', nothing should get across. + */ +void __cpuinit cpu_init(void) +{ + int cpu = smp_processor_id(); + struct task_struct *curr = current; + + /* Set up the real GDT and PDA, so we can transition from the + boot versions. */ + if (!init_gdt(cpu, curr)) { + /* failed to allocate something; not much we can do... */ + for (;;) + local_irq_enable(); + } + + _cpu_init(cpu, curr); +} + #ifdef CONFIG_HOTPLUG_CPU void __cpuinit cpu_uninit(void) { diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index 4bb8b77cd65b..095636620fa2 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include @@ -536,11 +537,11 @@ set_cpu_sibling_map(int cpu) static void __devinit start_secondary(void *unused) { /* - * Dont put anything before smp_callin(), SMP + * Don't put *anything* before secondary_cpu_init(), SMP * booting is too fragile that we want to limit the * things done here to the most necessary things. */ - cpu_init(); + secondary_cpu_init(); preempt_disable(); smp_callin(); while (!cpu_isset(smp_processor_id(), smp_commenced_mask)) @@ -599,13 +600,16 @@ void __devinit initialize_secondary(void) "movl %0,%%esp\n\t" "jmp *%1" : - :"r" (current->thread.esp),"r" (current->thread.eip)); + :"m" (current->thread.esp),"m" (current->thread.eip)); } +/* Static state in head.S used to set up a CPU */ extern struct { void * esp; unsigned short ss; } stack_start; +extern struct i386_pda *start_pda; +extern struct Xgt_desc_struct cpu_gdt_descr; #ifdef CONFIG_NUMA @@ -936,9 +940,6 @@ static int __devinit do_boot_cpu(int apicid, int cpu) unsigned long start_eip; unsigned short nmi_high = 0, nmi_low = 0; - ++cpucount; - alternatives_smp_switch(1); - /* * We can't use kernel_thread since we must avoid to * reschedule the child. @@ -946,15 +947,30 @@ static int __devinit do_boot_cpu(int apicid, int cpu) idle = alloc_idle_task(cpu); if (IS_ERR(idle)) panic("failed fork for CPU %d", cpu); + + /* Pre-allocate and initialize the CPU's GDT and PDA so it + doesn't have to do any memory allocation during the + delicate CPU-bringup phase. */ + if (!init_gdt(cpu, idle)) { + printk(KERN_INFO "Couldn't allocate GDT/PDA for CPU %d\n", cpu); + return -1; /* ? */ + } + idle->thread.eip = (unsigned long) start_secondary; /* start_eip had better be page-aligned! */ start_eip = setup_trampoline(); + ++cpucount; + alternatives_smp_switch(1); + /* So we see what's up */ printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip); /* Stack for startup_32 can be just as for start_secondary onwards */ stack_start.esp = (void *) idle->thread.esp; + start_pda = cpu_pda(cpu); + cpu_gdt_descr = per_cpu(cpu_gdt_descr, cpu); + irq_ctx_init(cpu); x86_cpu_to_apicid[cpu] = apicid; diff --git a/arch/i386/mach-voyager/voyager_smp.c b/arch/i386/mach-voyager/voyager_smp.c index f3fea2ad50fe..55428e656a3f 100644 --- a/arch/i386/mach-voyager/voyager_smp.c +++ b/arch/i386/mach-voyager/voyager_smp.c @@ -28,6 +28,7 @@ #include #include #include +#include /* TLB state -- visible externally, indexed physically */ DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) ____cacheline_aligned = { &init_mm, 0 }; @@ -422,6 +423,7 @@ find_smp_config(void) VOYAGER_SUS_IN_CONTROL_PORT); current_thread_info()->cpu = boot_cpu_id; + write_pda(cpu_number, boot_cpu_id); } /* @@ -458,7 +460,7 @@ start_secondary(void *unused) /* external functions not defined in the headers */ extern void calibrate_delay(void); - cpu_init(); + secondary_cpu_init(); /* OK, we're in the routine */ ack_CPI(VIC_CPU_BOOT_CPI); @@ -578,6 +580,15 @@ do_boot_cpu(__u8 cpu) /* init_tasks (in sched.c) is indexed logically */ stack_start.esp = (void *) idle->thread.esp; + /* Pre-allocate and initialize the CPU's GDT and PDA so it + doesn't have to do any memory allocation during the + delicate CPU-bringup phase. */ + if (!init_gdt(cpu, idle)) { + printk(KERN_INFO "Couldn't allocate GDT/PDA for CPU %d\n", cpu); + cpucount--; + return; + } + irq_ctx_init(cpu); /* Note: Don't modify initial ss override */ @@ -1963,4 +1974,5 @@ void __init smp_setup_processor_id(void) { current_thread_info()->cpu = hard_smp_processor_id(); + write_pda(cpu_number, hard_smp_processor_id()); } -- cgit v1.2.1 From f95d47caae5302a63d92be9a0292abc90e2a14e1 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 7 Dec 2006 02:14:02 +0100 Subject: [PATCH] i386: Use %gs as the PDA base-segment in the kernel This patch is the meat of the PDA change. This patch makes several related changes: 1: Most significantly, %gs is now used in the kernel. This means that on entry, the old value of %gs is saved away, and it is reloaded with __KERNEL_PDA. 2: entry.S constructs the stack in the shape of struct pt_regs, and this is passed around the kernel so that the process's saved register state can be accessed. Unfortunately struct pt_regs doesn't currently have space for %gs (or %fs). This patch extends pt_regs to add space for gs (no space is allocated for %fs, since it won't be used, and it would just complicate the code in entry.S to work around the space). 3: Because %gs is now saved on the stack like %ds, %es and the integer registers, there are a number of places where it no longer needs to be handled specially; namely context switch, and saving/restoring the register state in a signal context. 4: And since kernel threads run in kernel space and call normal kernel code, they need to be created with their %gs == __KERNEL_PDA. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Cc: Chuck Ebbert <76306.1226@compuserve.com> Cc: Zachary Amsden Cc: Jan Beulich Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/i386/kernel/asm-offsets.c | 1 + arch/i386/kernel/cpu/common.c | 21 +++++++++++-- arch/i386/kernel/entry.S | 70 +++++++++++++++++++++++++++++------------- arch/i386/kernel/head.S | 31 ++++++++++++++++--- arch/i386/kernel/process.c | 26 ++++++++-------- arch/i386/kernel/signal.c | 6 ++-- 6 files changed, 109 insertions(+), 46 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c index 70b19807acf9..9620872d3534 100644 --- a/arch/i386/kernel/asm-offsets.c +++ b/arch/i386/kernel/asm-offsets.c @@ -72,6 +72,7 @@ void foo(void) OFFSET(PT_EAX, pt_regs, eax); OFFSET(PT_DS, pt_regs, xds); OFFSET(PT_ES, pt_regs, xes); + OFFSET(PT_GS, pt_regs, xgs); OFFSET(PT_ORIG_EAX, pt_regs, orig_eax); OFFSET(PT_EIP, pt_regs, eip); OFFSET(PT_CS, pt_regs, xcs); diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c index 2534e25ed745..4e63d8ce602b 100644 --- a/arch/i386/kernel/cpu/common.c +++ b/arch/i386/kernel/cpu/common.c @@ -593,6 +593,14 @@ void __init early_cpu_init(void) #endif } +/* Make sure %gs is initialized properly in idle threads */ +struct pt_regs * __devinit idle_regs(struct pt_regs *regs) +{ + memset(regs, 0, sizeof(struct pt_regs)); + regs->xgs = __KERNEL_PDA; + return regs; +} + __cpuinit int alloc_gdt(int cpu) { struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); @@ -644,6 +652,14 @@ struct i386_pda boot_pda = { ._pda = &boot_pda, }; +static inline void set_kernel_gs(void) +{ + /* Set %gs for this CPU's PDA. Memory clobber is to create a + barrier with respect to any PDA operations, so the compiler + doesn't move any before here. */ + asm volatile ("mov %0, %%gs" : : "r" (__KERNEL_PDA) : "memory"); +} + /* Initialize the CPU's GDT and PDA. The boot CPU does this for itself, but secondaries find this done for them. */ __cpuinit int init_gdt(int cpu, struct task_struct *idle) @@ -693,6 +709,7 @@ static void __cpuinit _cpu_init(int cpu, struct task_struct *curr) the boot CPU, this will transition from the boot gdt+pda to the real ones). */ load_gdt(cpu_gdt_descr); + set_kernel_gs(); if (cpu_test_and_set(cpu, cpu_initialized)) { printk(KERN_WARNING "CPU#%d already initialized!\n", cpu); @@ -731,8 +748,8 @@ static void __cpuinit _cpu_init(int cpu, struct task_struct *curr) __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss); #endif - /* Clear %fs and %gs. */ - asm volatile ("movl %0, %%fs; movl %0, %%gs" : : "r" (0)); + /* Clear %fs. */ + asm volatile ("mov %0, %%fs" : : "r" (0)); /* Clear all 6 debug registers: */ set_debugreg(0, 0); diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index 0069bf01603e..b99d4a160078 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -30,12 +30,13 @@ * 18(%esp) - %eax * 1C(%esp) - %ds * 20(%esp) - %es - * 24(%esp) - orig_eax - * 28(%esp) - %eip - * 2C(%esp) - %cs - * 30(%esp) - %eflags - * 34(%esp) - %oldesp - * 38(%esp) - %oldss + * 24(%esp) - %gs + * 28(%esp) - orig_eax + * 2C(%esp) - %eip + * 30(%esp) - %cs + * 34(%esp) - %eflags + * 38(%esp) - %oldesp + * 3C(%esp) - %oldss * * "current" is in register %ebx during any slow entries. */ @@ -92,6 +93,9 @@ VM_MASK = 0x00020000 #define SAVE_ALL \ cld; \ + pushl %gs; \ + CFI_ADJUST_CFA_OFFSET 4;\ + /*CFI_REL_OFFSET gs, 0;*/\ pushl %es; \ CFI_ADJUST_CFA_OFFSET 4;\ /*CFI_REL_OFFSET es, 0;*/\ @@ -121,7 +125,9 @@ VM_MASK = 0x00020000 CFI_REL_OFFSET ebx, 0;\ movl $(__USER_DS), %edx; \ movl %edx, %ds; \ - movl %edx, %es; + movl %edx, %es; \ + movl $(__KERNEL_PDA), %edx; \ + movl %edx, %gs #define RESTORE_INT_REGS \ popl %ebx; \ @@ -154,17 +160,22 @@ VM_MASK = 0x00020000 2: popl %es; \ CFI_ADJUST_CFA_OFFSET -4;\ /*CFI_RESTORE es;*/\ -.section .fixup,"ax"; \ -3: movl $0,(%esp); \ - jmp 1b; \ +3: popl %gs; \ + CFI_ADJUST_CFA_OFFSET -4;\ + /*CFI_RESTORE gs;*/\ +.pushsection .fixup,"ax"; \ 4: movl $0,(%esp); \ + jmp 1b; \ +5: movl $0,(%esp); \ jmp 2b; \ -.previous; \ +6: movl $0,(%esp); \ + jmp 3b; \ .section __ex_table,"a";\ .align 4; \ - .long 1b,3b; \ - .long 2b,4b; \ -.previous + .long 1b,4b; \ + .long 2b,5b; \ + .long 3b,6b; \ +.popsection #define RING0_INT_FRAME \ CFI_STARTPROC simple;\ @@ -231,6 +242,7 @@ check_userspace: andl $(VM_MASK | SEGMENT_RPL_MASK), %eax cmpl $USER_RPL, %eax jb resume_kernel # not returning to v8086 or userspace + ENTRY(resume_userspace) DISABLE_INTERRUPTS # make sure we don't miss an interrupt # setting need_resched or sigpending @@ -327,9 +339,16 @@ sysenter_past_esp: movl PT_OLDESP(%esp), %ecx xorl %ebp,%ebp TRACE_IRQS_ON +1: mov PT_GS(%esp), %gs ENABLE_INTERRUPTS_SYSEXIT CFI_ENDPROC - +.pushsection .fixup,"ax" +2: movl $0,PT_GS(%esp) + jmp 1b +.section __ex_table,"a" + .align 4 + .long 1b,2b +.popsection # system call handler stub ENTRY(system_call) @@ -375,7 +394,7 @@ restore_nocheck: TRACE_IRQS_IRET restore_nocheck_notrace: RESTORE_REGS - addl $4, %esp + addl $4, %esp # skip orig_eax/error_code CFI_ADJUST_CFA_OFFSET -4 1: INTERRUPT_RETURN .section .fixup,"ax" @@ -588,6 +607,10 @@ KPROBE_ENTRY(page_fault) CFI_ADJUST_CFA_OFFSET 4 ALIGN error_code: + /* the function address is in %gs's slot on the stack */ + pushl %es + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET es, 0*/ pushl %ds CFI_ADJUST_CFA_OFFSET 4 /*CFI_REL_OFFSET ds, 0*/ @@ -613,18 +636,20 @@ error_code: CFI_ADJUST_CFA_OFFSET 4 CFI_REL_OFFSET ebx, 0 cld - pushl %es + pushl %gs CFI_ADJUST_CFA_OFFSET 4 - /*CFI_REL_OFFSET es, 0*/ + /*CFI_REL_OFFSET gs, 0*/ + movl $(__KERNEL_PDA), %ecx + movl %ecx, %gs UNWIND_ESPFIX_STACK popl %ecx CFI_ADJUST_CFA_OFFSET -4 /*CFI_REGISTER es, ecx*/ - movl PT_ES(%esp), %edi # get the function address + movl PT_GS(%esp), %edi # get the function address movl PT_ORIG_EAX(%esp), %edx # get the error code - movl $-1, PT_ORIG_EAX(%esp) - movl %ecx, PT_ES(%esp) - /*CFI_REL_OFFSET es, ES*/ + movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart + mov %ecx, PT_GS(%esp) + /*CFI_REL_OFFSET gs, ES*/ movl $(__USER_DS), %ecx movl %ecx, %ds movl %ecx, %es @@ -936,6 +961,7 @@ ENTRY(arch_unwind_init_running) movl %ebx, PT_EAX(%edx) movl $__USER_DS, PT_DS(%edx) movl $__USER_DS, PT_ES(%edx) + movl $0, PT_GS(%edx) movl %ebx, PT_ORIG_EAX(%edx) movl %ecx, PT_EIP(%edx) movl 12(%esp), %ecx diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S index 4a83384c5a61..5b14e95ac8b9 100644 --- a/arch/i386/kernel/head.S +++ b/arch/i386/kernel/head.S @@ -302,6 +302,7 @@ is386: movl $2,%ecx # set MP movl %eax,%cr0 call check_x87 + call setup_pda lgdt cpu_gdt_descr lidt idt_descr ljmp $(__KERNEL_CS),$1f @@ -312,10 +313,13 @@ is386: movl $2,%ecx # set MP movl %eax,%ds movl %eax,%es - xorl %eax,%eax # Clear FS/GS and LDT + xorl %eax,%eax # Clear FS and LDT movl %eax,%fs - movl %eax,%gs lldt %ax + + movl $(__KERNEL_PDA),%eax + mov %eax,%gs + cld # gcc2 wants the direction flag cleared at all times pushl $0 # fake return address for unwinder #ifdef CONFIG_SMP @@ -345,6 +349,23 @@ check_x87: .byte 0xDB,0xE4 /* fsetpm for 287, ignored by 387 */ ret +/* + * Point the GDT at this CPU's PDA. On boot this will be + * cpu_gdt_table and boot_pda; for secondary CPUs, these will be + * that CPU's GDT and PDA. + */ +setup_pda: + /* get the PDA pointer */ + movl start_pda, %eax + + /* slot the PDA address into the GDT */ + mov cpu_gdt_descr+2, %ecx + mov %ax, (__KERNEL_PDA+0+2)(%ecx) /* base & 0x0000ffff */ + shr $16, %eax + mov %al, (__KERNEL_PDA+4+0)(%ecx) /* base & 0x00ff0000 */ + mov %ah, (__KERNEL_PDA+4+3)(%ecx) /* base & 0xff000000 */ + ret + /* * setup_idt * @@ -484,6 +505,8 @@ ENTRY(empty_zero_page) * This starts the data section. */ .data +ENTRY(start_pda) + .long boot_pda ENTRY(stack_start) .long init_thread_union+THREAD_SIZE @@ -525,7 +548,7 @@ idt_descr: # boot GDT descriptor (later on used by CPU#0): .word 0 # 32 bit align gdt_desc.address -cpu_gdt_descr: +ENTRY(cpu_gdt_descr) .word GDT_ENTRIES*8-1 .long cpu_gdt_table @@ -585,7 +608,7 @@ ENTRY(cpu_gdt_table) .quad 0x004092000000ffff /* 0xc8 APM DS data */ .quad 0x00c0920000000000 /* 0xd0 - ESPFIX SS */ - .quad 0x0000000000000000 /* 0xd8 - PDA */ + .quad 0x00cf92000000ffff /* 0xd8 - PDA */ .quad 0x0000000000000000 /* 0xe0 - unused */ .quad 0x0000000000000000 /* 0xe8 - unused */ .quad 0x0000000000000000 /* 0xf0 - unused */ diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index ae924c416b68..905364d42847 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -56,6 +56,7 @@ #include #include +#include asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); @@ -346,6 +347,7 @@ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) regs.xds = __USER_DS; regs.xes = __USER_DS; + regs.xgs = __KERNEL_PDA; regs.orig_eax = -1; regs.eip = (unsigned long) kernel_thread_helper; regs.xcs = __KERNEL_CS | get_kernel_rpl(); @@ -431,7 +433,6 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long esp, p->thread.eip = (unsigned long) ret_from_fork; savesegment(fs,p->thread.fs); - savesegment(gs,p->thread.gs); tsk = current; if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { @@ -659,16 +660,16 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas load_esp0(tss, next); /* - * Save away %fs and %gs. No need to save %es and %ds, as - * those are always kernel segments while inside the kernel. - * Doing this before setting the new TLS descriptors avoids - * the situation where we temporarily have non-reloadable - * segments in %fs and %gs. This could be an issue if the - * NMI handler ever used %fs or %gs (it does not today), or - * if the kernel is running inside of a hypervisor layer. + * Save away %fs. No need to save %gs, as it was saved on the + * stack on entry. No need to save %es and %ds, as those are + * always kernel segments while inside the kernel. Doing this + * before setting the new TLS descriptors avoids the situation + * where we temporarily have non-reloadable segments in %fs + * and %gs. This could be an issue if the NMI handler ever + * used %fs or %gs (it does not today), or if the kernel is + * running inside of a hypervisor layer. */ savesegment(fs, prev->fs); - savesegment(gs, prev->gs); /* * Load the per-thread Thread-Local Storage descriptor. @@ -676,16 +677,13 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas load_TLS(next, cpu); /* - * Restore %fs and %gs if needed. + * Restore %fs if needed. * - * Glibc normally makes %fs be zero, and %gs is one of - * the TLS segments. + * Glibc normally makes %fs be zero. */ if (unlikely(prev->fs | next->fs)) loadsegment(fs, next->fs); - if (prev->gs | next->gs) - loadsegment(gs, next->gs); /* * Restore IOPL if needed. diff --git a/arch/i386/kernel/signal.c b/arch/i386/kernel/signal.c index 43002cfb40c4..65d7620eaa09 100644 --- a/arch/i386/kernel/signal.c +++ b/arch/i386/kernel/signal.c @@ -128,7 +128,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax X86_EFLAGS_TF | X86_EFLAGS_SF | X86_EFLAGS_ZF | \ X86_EFLAGS_AF | X86_EFLAGS_PF | X86_EFLAGS_CF) - GET_SEG(gs); + COPY_SEG(gs); GET_SEG(fs); COPY_SEG(es); COPY_SEG(ds); @@ -244,9 +244,7 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate, { int tmp, err = 0; - tmp = 0; - savesegment(gs, tmp); - err |= __put_user(tmp, (unsigned int __user *)&sc->gs); + err |= __put_user(regs->xgs, (unsigned int __user *)&sc->gs); savesegment(fs, tmp); err |= __put_user(tmp, (unsigned int __user *)&sc->fs); -- cgit v1.2.1 From 66e10a44d724f1464b5e8b5a3eae1e2cbbc2cca6 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 7 Dec 2006 02:14:02 +0100 Subject: [PATCH] i386: Fix places where using %gs changes the usermode ABI There are a few places where the change in struct pt_regs and the use of %gs affect the userspace ABI. These are primarily debugging interfaces where thread state can be inspected or extracted. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Cc: Chuck Ebbert <76306.1226@compuserve.com> Cc: Zachary Amsden Cc: Jan Beulich Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/i386/kernel/process.c | 6 +++--- arch/i386/kernel/ptrace.c | 18 ++++++------------ 2 files changed, 9 insertions(+), 15 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index 905364d42847..dc4272545179 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -315,8 +315,8 @@ void show_regs(struct pt_regs * regs) regs->eax,regs->ebx,regs->ecx,regs->edx); printk("ESI: %08lx EDI: %08lx EBP: %08lx", regs->esi, regs->edi, regs->ebp); - printk(" DS: %04x ES: %04x\n", - 0xffff & regs->xds,0xffff & regs->xes); + printk(" DS: %04x ES: %04x GS: %04x\n", + 0xffff & regs->xds,0xffff & regs->xes, 0xffff & regs->xgs); cr0 = read_cr0(); cr2 = read_cr2(); @@ -509,7 +509,7 @@ void dump_thread(struct pt_regs * regs, struct user * dump) dump->regs.ds = regs->xds; dump->regs.es = regs->xes; savesegment(fs,dump->regs.fs); - savesegment(gs,dump->regs.gs); + dump->regs.gs = regs->xgs; dump->regs.orig_eax = regs->orig_eax; dump->regs.eip = regs->eip; dump->regs.cs = regs->xcs; diff --git a/arch/i386/kernel/ptrace.c b/arch/i386/kernel/ptrace.c index 775f50e9395b..f3f94ac5736a 100644 --- a/arch/i386/kernel/ptrace.c +++ b/arch/i386/kernel/ptrace.c @@ -94,13 +94,9 @@ static int putreg(struct task_struct *child, return -EIO; child->thread.fs = value; return 0; - case GS: - if (value && (value & 3) != 3) - return -EIO; - child->thread.gs = value; - return 0; case DS: case ES: + case GS: if (value && (value & 3) != 3) return -EIO; value &= 0xffff; @@ -116,8 +112,8 @@ static int putreg(struct task_struct *child, value |= get_stack_long(child, EFL_OFFSET) & ~FLAG_MASK; break; } - if (regno > GS*4) - regno -= 2*4; + if (regno > ES*4) + regno -= 1*4; put_stack_long(child, regno - sizeof(struct pt_regs), value); return 0; } @@ -131,18 +127,16 @@ static unsigned long getreg(struct task_struct *child, case FS: retval = child->thread.fs; break; - case GS: - retval = child->thread.gs; - break; case DS: case ES: + case GS: case SS: case CS: retval = 0xffff; /* fall through */ default: - if (regno > GS*4) - regno -= 2*4; + if (regno > ES*4) + regno -= 1*4; regno = regno - sizeof(struct pt_regs); retval &= get_stack_long(child, regno); } -- cgit v1.2.1 From 49d26b6eaa8e970c8cf6e299e6ccba2474191bf5 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 7 Dec 2006 02:14:03 +0100 Subject: [PATCH] i386: Update sys_vm86 to cope with changed pt_regs and %gs usage sys_vm86 uses a struct kernel_vm86_regs, which is identical to pt_regs, but adds an extra space for all the segment registers. Previously this structure was completely independent, so changes in pt_regs had to be reflected in kernel_vm86_regs. This changes just embeds pt_regs in kernel_vm86_regs, and makes the appropriate changes to vm86.c to deal with the new naming. Also, since %gs is dealt with differently in the kernel, this change adjusts vm86.c to reflect this. While making these changes, I also cleaned up some frankly bizarre code which was added when auditing was added to sys_vm86. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Cc: Chuck Ebbert <76306.1226@compuserve.com> Cc: Zachary Amsden Cc: Jan Beulich Cc: Andi Kleen Cc: Al Viro Cc: Jason Baron Cc: Chris Wright Signed-off-by: Andrew Morton --- arch/i386/kernel/vm86.c | 121 +++++++++++++++++++++++++++++------------------- 1 file changed, 74 insertions(+), 47 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/vm86.c b/arch/i386/kernel/vm86.c index cbcd61d6120b..be2f96e67f78 100644 --- a/arch/i386/kernel/vm86.c +++ b/arch/i386/kernel/vm86.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include @@ -72,10 +73,10 @@ /* * 8- and 16-bit register defines.. */ -#define AL(regs) (((unsigned char *)&((regs)->eax))[0]) -#define AH(regs) (((unsigned char *)&((regs)->eax))[1]) -#define IP(regs) (*(unsigned short *)&((regs)->eip)) -#define SP(regs) (*(unsigned short *)&((regs)->esp)) +#define AL(regs) (((unsigned char *)&((regs)->pt.eax))[0]) +#define AH(regs) (((unsigned char *)&((regs)->pt.eax))[1]) +#define IP(regs) (*(unsigned short *)&((regs)->pt.eip)) +#define SP(regs) (*(unsigned short *)&((regs)->pt.esp)) /* * virtual flags (16 and 32-bit versions) @@ -89,10 +90,37 @@ #define SAFE_MASK (0xDD5) #define RETURN_MASK (0xDFF) -#define VM86_REGS_PART2 orig_eax -#define VM86_REGS_SIZE1 \ - ( (unsigned)( & (((struct kernel_vm86_regs *)0)->VM86_REGS_PART2) ) ) -#define VM86_REGS_SIZE2 (sizeof(struct kernel_vm86_regs) - VM86_REGS_SIZE1) +/* convert kernel_vm86_regs to vm86_regs */ +static int copy_vm86_regs_to_user(struct vm86_regs __user *user, + const struct kernel_vm86_regs *regs) +{ + int ret = 0; + + /* kernel_vm86_regs is missing xfs, so copy everything up to + (but not including) xgs, and then rest after xgs. */ + ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.xgs)); + ret += copy_to_user(&user->__null_gs, ®s->pt.xgs, + sizeof(struct kernel_vm86_regs) - + offsetof(struct kernel_vm86_regs, pt.xgs)); + + return ret; +} + +/* convert vm86_regs to kernel_vm86_regs */ +static int copy_vm86_regs_from_user(struct kernel_vm86_regs *regs, + const struct vm86_regs __user *user, + unsigned extra) +{ + int ret = 0; + + ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.xgs)); + ret += copy_from_user(®s->pt.xgs, &user->__null_gs, + sizeof(struct kernel_vm86_regs) - + offsetof(struct kernel_vm86_regs, pt.xgs) + + extra); + + return ret; +} struct pt_regs * FASTCALL(save_v86_state(struct kernel_vm86_regs * regs)); struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs) @@ -112,10 +140,8 @@ struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs) printk("no vm86_info: BAD\n"); do_exit(SIGSEGV); } - set_flags(regs->eflags, VEFLAGS, VIF_MASK | current->thread.v86mask); - tmp = copy_to_user(¤t->thread.vm86_info->regs,regs, VM86_REGS_SIZE1); - tmp += copy_to_user(¤t->thread.vm86_info->regs.VM86_REGS_PART2, - ®s->VM86_REGS_PART2, VM86_REGS_SIZE2); + set_flags(regs->pt.eflags, VEFLAGS, VIF_MASK | current->thread.v86mask); + tmp = copy_vm86_regs_to_user(¤t->thread.vm86_info->regs,regs); tmp += put_user(current->thread.screen_bitmap,¤t->thread.vm86_info->screen_bitmap); if (tmp) { printk("vm86: could not access userspace vm86_info\n"); @@ -129,9 +155,11 @@ struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs) current->thread.saved_esp0 = 0; put_cpu(); - loadsegment(fs, current->thread.saved_fs); - loadsegment(gs, current->thread.saved_gs); ret = KVM86->regs32; + + loadsegment(fs, current->thread.saved_fs); + ret->xgs = current->thread.saved_gs; + return ret; } @@ -183,9 +211,9 @@ asmlinkage int sys_vm86old(struct pt_regs regs) tsk = current; if (tsk->thread.saved_esp0) goto out; - tmp = copy_from_user(&info, v86, VM86_REGS_SIZE1); - tmp += copy_from_user(&info.regs.VM86_REGS_PART2, &v86->regs.VM86_REGS_PART2, - (long)&info.vm86plus - (long)&info.regs.VM86_REGS_PART2); + tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs, + offsetof(struct kernel_vm86_struct, vm86plus) - + sizeof(info.regs)); ret = -EFAULT; if (tmp) goto out; @@ -233,9 +261,9 @@ asmlinkage int sys_vm86(struct pt_regs regs) if (tsk->thread.saved_esp0) goto out; v86 = (struct vm86plus_struct __user *)regs.ecx; - tmp = copy_from_user(&info, v86, VM86_REGS_SIZE1); - tmp += copy_from_user(&info.regs.VM86_REGS_PART2, &v86->regs.VM86_REGS_PART2, - (long)&info.regs32 - (long)&info.regs.VM86_REGS_PART2); + tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs, + offsetof(struct kernel_vm86_struct, regs32) - + sizeof(info.regs)); ret = -EFAULT; if (tmp) goto out; @@ -252,15 +280,15 @@ out: static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk) { struct tss_struct *tss; - long eax; /* * make sure the vm86() system call doesn't try to do anything silly */ - info->regs.__null_ds = 0; - info->regs.__null_es = 0; + info->regs.pt.xds = 0; + info->regs.pt.xes = 0; + info->regs.pt.xgs = 0; -/* we are clearing fs,gs later just before "jmp resume_userspace", - * because starting with Linux 2.1.x they aren't no longer saved/restored +/* we are clearing fs later just before "jmp resume_userspace", + * because it is not saved/restored. */ /* @@ -268,10 +296,10 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk * has set it up safely, so this makes sure interrupt etc flags are * inherited from protected mode. */ - VEFLAGS = info->regs.eflags; - info->regs.eflags &= SAFE_MASK; - info->regs.eflags |= info->regs32->eflags & ~SAFE_MASK; - info->regs.eflags |= VM_MASK; + VEFLAGS = info->regs.pt.eflags; + info->regs.pt.eflags &= SAFE_MASK; + info->regs.pt.eflags |= info->regs32->eflags & ~SAFE_MASK; + info->regs.pt.eflags |= VM_MASK; switch (info->cpu_type) { case CPU_286: @@ -294,7 +322,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk info->regs32->eax = 0; tsk->thread.saved_esp0 = tsk->thread.esp0; savesegment(fs, tsk->thread.saved_fs); - savesegment(gs, tsk->thread.saved_gs); + tsk->thread.saved_gs = info->regs32->xgs; tss = &per_cpu(init_tss, get_cpu()); tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0; @@ -306,19 +334,18 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk tsk->thread.screen_bitmap = info->screen_bitmap; if (info->flags & VM86_SCREEN_BITMAP) mark_screen_rdonly(tsk->mm); - __asm__ __volatile__("xorl %eax,%eax; movl %eax,%fs; movl %eax,%gs\n\t"); - __asm__ __volatile__("movl %%eax, %0\n" :"=r"(eax)); /*call audit_syscall_exit since we do not exit via the normal paths */ if (unlikely(current->audit_context)) - audit_syscall_exit(AUDITSC_RESULT(eax), eax); + audit_syscall_exit(AUDITSC_RESULT(0), 0); __asm__ __volatile__( "movl %0,%%esp\n\t" "movl %1,%%ebp\n\t" + "mov %2, %%fs\n\t" "jmp resume_userspace" : /* no outputs */ - :"r" (&info->regs), "r" (task_thread_info(tsk))); + :"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0)); /* we never return here */ } @@ -348,12 +375,12 @@ static inline void clear_IF(struct kernel_vm86_regs * regs) static inline void clear_TF(struct kernel_vm86_regs * regs) { - regs->eflags &= ~TF_MASK; + regs->pt.eflags &= ~TF_MASK; } static inline void clear_AC(struct kernel_vm86_regs * regs) { - regs->eflags &= ~AC_MASK; + regs->pt.eflags &= ~AC_MASK; } /* It is correct to call set_IF(regs) from the set_vflags_* @@ -370,7 +397,7 @@ static inline void clear_AC(struct kernel_vm86_regs * regs) static inline void set_vflags_long(unsigned long eflags, struct kernel_vm86_regs * regs) { set_flags(VEFLAGS, eflags, current->thread.v86mask); - set_flags(regs->eflags, eflags, SAFE_MASK); + set_flags(regs->pt.eflags, eflags, SAFE_MASK); if (eflags & IF_MASK) set_IF(regs); else @@ -380,7 +407,7 @@ static inline void set_vflags_long(unsigned long eflags, struct kernel_vm86_regs static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_regs * regs) { set_flags(VFLAGS, flags, current->thread.v86mask); - set_flags(regs->eflags, flags, SAFE_MASK); + set_flags(regs->pt.eflags, flags, SAFE_MASK); if (flags & IF_MASK) set_IF(regs); else @@ -389,7 +416,7 @@ static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_reg static inline unsigned long get_vflags(struct kernel_vm86_regs * regs) { - unsigned long flags = regs->eflags & RETURN_MASK; + unsigned long flags = regs->pt.eflags & RETURN_MASK; if (VEFLAGS & VIF_MASK) flags |= IF_MASK; @@ -493,7 +520,7 @@ static void do_int(struct kernel_vm86_regs *regs, int i, unsigned long __user *intr_ptr; unsigned long segoffs; - if (regs->cs == BIOSSEG) + if (regs->pt.xcs == BIOSSEG) goto cannot_handle; if (is_revectored(i, &KVM86->int_revectored)) goto cannot_handle; @@ -505,9 +532,9 @@ static void do_int(struct kernel_vm86_regs *regs, int i, if ((segoffs >> 16) == BIOSSEG) goto cannot_handle; pushw(ssp, sp, get_vflags(regs), cannot_handle); - pushw(ssp, sp, regs->cs, cannot_handle); + pushw(ssp, sp, regs->pt.xcs, cannot_handle); pushw(ssp, sp, IP(regs), cannot_handle); - regs->cs = segoffs >> 16; + regs->pt.xcs = segoffs >> 16; SP(regs) -= 6; IP(regs) = segoffs & 0xffff; clear_TF(regs); @@ -524,7 +551,7 @@ int handle_vm86_trap(struct kernel_vm86_regs * regs, long error_code, int trapno if (VMPI.is_vm86pus) { if ( (trapno==3) || (trapno==1) ) return_to_32bit(regs, VM86_TRAP + (trapno << 8)); - do_int(regs, trapno, (unsigned char __user *) (regs->ss << 4), SP(regs)); + do_int(regs, trapno, (unsigned char __user *) (regs->pt.xss << 4), SP(regs)); return 0; } if (trapno !=1) @@ -560,10 +587,10 @@ void handle_vm86_fault(struct kernel_vm86_regs * regs, long error_code) handle_vm86_trap(regs, 0, 1); \ return; } while (0) - orig_flags = *(unsigned short *)®s->eflags; + orig_flags = *(unsigned short *)®s->pt.eflags; - csp = (unsigned char __user *) (regs->cs << 4); - ssp = (unsigned char __user *) (regs->ss << 4); + csp = (unsigned char __user *) (regs->pt.xcs << 4); + ssp = (unsigned char __user *) (regs->pt.xss << 4); sp = SP(regs); ip = IP(regs); @@ -650,7 +677,7 @@ void handle_vm86_fault(struct kernel_vm86_regs * regs, long error_code) SP(regs) += 6; } IP(regs) = newip; - regs->cs = newcs; + regs->pt.xcs = newcs; CHECK_IF_IN_TRAP; if (data32) { set_vflags_long(newflags, regs); -- cgit v1.2.1 From b2938f880890ebfcccad356275e0000193153623 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 7 Dec 2006 02:14:03 +0100 Subject: [PATCH] i386: Implement smp_processor_id() with the PDA Use the cpu_number in the PDA to implement raw_smp_processor_id. This is a little simpler than using thread_info, though the cpu field in thread_info cannot be removed since it is used for things other than getting the current CPU in common code. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Cc: Chuck Ebbert <76306.1226@compuserve.com> Cc: Zachary Amsden Cc: Jan Beulich Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/i386/kernel/asm-offsets.c | 4 +++- arch/i386/kernel/cpu/common.c | 2 ++ arch/i386/kernel/entry.S | 3 +-- 3 files changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c index 9620872d3534..85f1b038e9c3 100644 --- a/arch/i386/kernel/asm-offsets.c +++ b/arch/i386/kernel/asm-offsets.c @@ -51,7 +51,6 @@ void foo(void) OFFSET(TI_exec_domain, thread_info, exec_domain); OFFSET(TI_flags, thread_info, flags); OFFSET(TI_status, thread_info, status); - OFFSET(TI_cpu, thread_info, cpu); OFFSET(TI_preempt_count, thread_info, preempt_count); OFFSET(TI_addr_limit, thread_info, addr_limit); OFFSET(TI_restart_block, thread_info, restart_block); @@ -97,4 +96,7 @@ void foo(void) DEFINE(VDSO_PRELINK, VDSO_PRELINK); OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); + + BLANK(); + OFFSET(PDA_cpu, i386_pda, cpu_number); } diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c index 4e63d8ce602b..e476202b887f 100644 --- a/arch/i386/kernel/cpu/common.c +++ b/arch/i386/kernel/cpu/common.c @@ -650,6 +650,7 @@ __cpuinit int alloc_gdt(int cpu) /* Initial PDA used by boot CPU */ struct i386_pda boot_pda = { ._pda = &boot_pda, + .cpu_number = 0, }; static inline void set_kernel_gs(void) @@ -694,6 +695,7 @@ __cpuinit int init_gdt(int cpu, struct task_struct *idle) memset(pda, 0, sizeof(*pda)); pda->_pda = pda; + pda->cpu_number = cpu; return 1; } diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index b99d4a160078..d7423efaeea4 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -524,8 +524,7 @@ syscall_badsys: #define FIXUP_ESPFIX_STACK \ /* since we are on a wrong stack, we cant make it a C code :( */ \ - GET_THREAD_INFO(%ebp); \ - movl TI_cpu(%ebp), %ebx; \ + movl %gs:PDA_cpu, %ebx; \ PER_CPU(cpu_gdt_descr, %ebx); \ movl GDS_address(%ebx), %ebx; \ GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \ -- cgit v1.2.1 From ec7fcaabbfb3c5bd5189f857b6ac7bb9745ef291 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 7 Dec 2006 02:14:03 +0100 Subject: [PATCH] i386: Implement "current" with the PDA Use the pcurrent field in the PDA to implement the "current" macro. This ends up compiling down to a single instruction to get the current task. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Cc: Chuck Ebbert <76306.1226@compuserve.com> Cc: Zachary Amsden Cc: Jan Beulich Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/i386/kernel/asm-offsets.c | 2 ++ arch/i386/kernel/cpu/common.c | 2 ++ arch/i386/kernel/process.c | 1 + 3 files changed, 5 insertions(+) (limited to 'arch/i386') diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c index 85f1b038e9c3..0666eb0ed7bc 100644 --- a/arch/i386/kernel/asm-offsets.c +++ b/arch/i386/kernel/asm-offsets.c @@ -15,6 +15,7 @@ #include #include #include +#include #define DEFINE(sym, val) \ asm volatile("\n->" #sym " %0 " #val : : "i" (val)) @@ -99,4 +100,5 @@ void foo(void) BLANK(); OFFSET(PDA_cpu, i386_pda, cpu_number); + OFFSET(PDA_pcurrent, i386_pda, pcurrent); } diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c index e476202b887f..6958ae5e2fa5 100644 --- a/arch/i386/kernel/cpu/common.c +++ b/arch/i386/kernel/cpu/common.c @@ -651,6 +651,7 @@ __cpuinit int alloc_gdt(int cpu) struct i386_pda boot_pda = { ._pda = &boot_pda, .cpu_number = 0, + .pcurrent = &init_task, }; static inline void set_kernel_gs(void) @@ -696,6 +697,7 @@ __cpuinit int init_gdt(int cpu, struct task_struct *idle) memset(pda, 0, sizeof(*pda)); pda->_pda = pda; pda->cpu_number = cpu; + pda->pcurrent = idle; return 1; } diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index dc4272545179..8749b10d3805 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -684,6 +684,7 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas if (unlikely(prev->fs | next->fs)) loadsegment(fs, next->fs); + write_pda(pcurrent, next_p); /* * Restore IOPL if needed. -- cgit v1.2.1 From 72690a21188586022a9e65cb6f1cc8845167555a Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 7 Dec 2006 02:14:03 +0100 Subject: [PATCH] x86: Don't use nested idle loops Currently the idle loop has two nested loops -- one high level in cpu_idle and in some low level idle functions another one. Looping in the low level idle functions breaks the idle notifiers because interrupts waking up sleep states need to execute exit_idle() which is only in cpu_idle(). So don't do that, only loop in cpu_idle(). This only removes code. In some cases e.g. poll_idle the idle loop is a little longer now because cpu_idle checks more things. I hope that isn't a problem ACPI idle doesn't change behaviour because it never looped anyways. Cc: len.brown@intel.com Cc: eranian@hpl.hp.com Signed-off-by: Andi Kleen --- arch/i386/kernel/process.c | 30 +++++++++--------------------- 1 file changed, 9 insertions(+), 21 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index 8749b10d3805..8f42659ef9d2 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -100,22 +100,18 @@ EXPORT_SYMBOL(enable_hlt); */ void default_idle(void) { - local_irq_enable(); - if (!hlt_counter && boot_cpu_data.hlt_works_ok) { current_thread_info()->status &= ~TS_POLLING; smp_mb__after_clear_bit(); - while (!need_resched()) { - local_irq_disable(); - if (!need_resched()) - safe_halt(); - else - local_irq_enable(); - } + local_irq_disable(); + if (!need_resched()) + safe_halt(); /* enables interrupts racelessly */ + else + local_irq_enable(); current_thread_info()->status |= TS_POLLING; } else { - while (!need_resched()) - cpu_relax(); + /* loop is done by the caller */ + cpu_relax(); } } #ifdef CONFIG_APM_MODULE @@ -129,14 +125,7 @@ EXPORT_SYMBOL(default_idle); */ static void poll_idle (void) { - local_irq_enable(); - - asm volatile( - "2:" - "testl %0, %1;" - "rep; nop;" - "je 2b;" - : : "i"(_TIF_NEED_RESCHED), "m" (current_thread_info()->flags)); + cpu_relax(); } #ifdef CONFIG_HOTPLUG_CPU @@ -257,8 +246,7 @@ void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) static void mwait_idle(void) { local_irq_enable(); - while (!need_resched()) - mwait_idle_with_hints(0, 0); + mwait_idle_with_hints(0, 0); } void __devinit select_idle_routine(const struct cpuinfo_x86 *c) -- cgit v1.2.1 From 7cd8b6861eb586aabe4c725cc0c259ce2e653695 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Thu, 7 Dec 2006 02:14:03 +0100 Subject: [PATCH] x86: remove last two pci_find offenders in the core code Resending as I believe the discussion about them established they were correct. Signed-off-by: Alan Cox Signed-off-by: Andi Kleen Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/i386/pci/irq.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/i386') diff --git a/arch/i386/pci/irq.c b/arch/i386/pci/irq.c index e65551cd8216..f2cb942f8281 100644 --- a/arch/i386/pci/irq.c +++ b/arch/i386/pci/irq.c @@ -764,7 +764,7 @@ static void __init pirq_find_router(struct irq_router *r) DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for %04x:%04x\n", rt->rtr_vendor, rt->rtr_device); - pirq_router_dev = pci_find_slot(rt->rtr_bus, rt->rtr_devfn); + pirq_router_dev = pci_get_bus_and_slot(rt->rtr_bus, rt->rtr_devfn); if (!pirq_router_dev) { DBG(KERN_DEBUG "PCI: Interrupt router not found at " "%02x:%02x\n", rt->rtr_bus, rt->rtr_devfn); @@ -784,6 +784,8 @@ static void __init pirq_find_router(struct irq_router *r) pirq_router_dev->vendor, pirq_router_dev->device, pci_name(pirq_router_dev)); + + /* The device remains referenced for the kernel lifetime */ } static struct irq_info *pirq_get_info(struct pci_dev *dev) -- cgit v1.2.1 From 9c5f8be4625e73f17e28fea89399ed871a30e064 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 7 Dec 2006 02:14:03 +0100 Subject: [PATCH] x86: Mention PCI instead of RAM in NMI parity error message On modern systems RAM errors don't cause NMIs, but it's usually caused by PCI SERR. Mention PCI instead of RAM in the printk. Reported by r_hayashi@ctc-g.co.jp (Ryutaro Hayashi) Cc: r_hayashi@ctc-g.co.jp Signed-off-by: Andi Kleen --- arch/i386/kernel/traps.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index 4a6fa2837df2..237f4884a1e1 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -708,8 +708,7 @@ mem_parity_error(unsigned char reason, struct pt_regs * regs) { printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on " "CPU %d.\n", reason, smp_processor_id()); - printk(KERN_EMERG "You probably have a hardware problem with your RAM " - "chips\n"); + printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n"); if (panic_on_unrecovered_nmi) panic("NMI: Not continuing"); -- cgit v1.2.1 From 6569580de7ae367def89b7671029cb97c1965574 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 7 Dec 2006 02:14:03 +0100 Subject: [PATCH] i386: Distinguish absolute symbols Ld knows about 2 kinds of symbols, absolute and section relative. Section relative symbols symbols change value when a section is moved and absolute symbols do not. Currently in the linker script we have several labels marking the beginning and ending of sections that are outside of sections, making them absolute symbols. Having a mixture of absolute and section relative symbols refereing to the same data is currently harmless but it is confusing. This must be done carefully as newer revs of ld do not place symbols that appear in sections without data and instead ld makes those symbols global :( My ultimate goal is to build a relocatable kernel. The safest and least intrusive technique is to generate relocation entries so the kernel can be relocated at load time. The only penalty would be an increase in the size of the kernel binary. The problem is that if absolute and relocatable symbols are not properly specified absolute symbols will be relocated or section relative symbols won't be, which is fatal. The practical motivation is that when generating kernels that will run from a reserved area for analyzing what caused a kernel panic, it is simpler if you don't need to hard code the physical memory location they will run at, especially for the distributions. [AK: and merged:] o Also put a message so that in future people can be aware of it and avoid introducing absolute symbols. Signed-off-by: Eric W. Biederman Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen --- arch/i386/kernel/vmlinux.lds.S | 113 +++++++++++++++++++++++------------------ 1 file changed, 63 insertions(+), 50 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S index c6f84a0322ba..cbd24860fbb7 100644 --- a/arch/i386/kernel/vmlinux.lds.S +++ b/arch/i386/kernel/vmlinux.lds.S @@ -1,5 +1,11 @@ /* ld script to make i386 Linux kernel * Written by Martin Mares ; + * + * Don't define absolute symbols until and unless you know that symbol + * value is should remain constant even if kernel image is relocated + * at run time. Absolute symbols are not relocated. If symbol value should + * change if kernel is relocated, make the symbol section relative and + * put it inside the section definition. */ #define LOAD_OFFSET __PAGE_OFFSET @@ -24,31 +30,32 @@ SECTIONS . = __KERNEL_START; phys_startup_32 = startup_32 - LOAD_OFFSET; /* read-only */ - _text = .; /* Text and read-only data */ .text : AT(ADDR(.text) - LOAD_OFFSET) { + _text = .; /* Text and read-only data */ *(.text) SCHED_TEXT LOCK_TEXT KPROBES_TEXT *(.fixup) *(.gnu.warning) - } :text = 0x9090 - - _etext = .; /* End of text section */ + _etext = .; /* End of text section */ + } :text = 0x9090 . = ALIGN(16); /* Exception table */ - __start___ex_table = .; - __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) } - __stop___ex_table = .; + __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { + __start___ex_table = .; + *(__ex_table) + __stop___ex_table = .; + } RODATA . = ALIGN(4); - __tracedata_start = .; .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) { + __tracedata_start = .; *(.tracedata) + __tracedata_end = .; } - __tracedata_end = .; /* writeable */ . = ALIGN(4096); @@ -58,10 +65,12 @@ SECTIONS } :data . = ALIGN(4096); - __nosave_begin = .; - .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { *(.data.nosave) } - . = ALIGN(4096); - __nosave_end = .; + .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { + __nosave_begin = .; + *(.data.nosave) + . = ALIGN(4096); + __nosave_end = .; + } . = ALIGN(4096); .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { @@ -75,8 +84,10 @@ SECTIONS /* rarely changed data like cpu maps */ . = ALIGN(32); - .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { *(.data.read_mostly) } - _edata = .; /* End of data section */ + .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { + *(.data.read_mostly) + _edata = .; /* End of data section */ + } #ifdef CONFIG_STACK_UNWIND . = ALIGN(4); @@ -94,54 +105,56 @@ SECTIONS /* might get freed after init */ . = ALIGN(4096); - __smp_alt_begin = .; - __smp_alt_instructions = .; .smp_altinstructions : AT(ADDR(.smp_altinstructions) - LOAD_OFFSET) { + __smp_alt_begin = .; + __smp_alt_instructions = .; *(.smp_altinstructions) + __smp_alt_instructions_end = .; } - __smp_alt_instructions_end = .; . = ALIGN(4); - __smp_locks = .; .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { + __smp_locks = .; *(.smp_locks) + __smp_locks_end = .; } - __smp_locks_end = .; .smp_altinstr_replacement : AT(ADDR(.smp_altinstr_replacement) - LOAD_OFFSET) { *(.smp_altinstr_replacement) + __smp_alt_end = .; } . = ALIGN(4096); - __smp_alt_end = .; /* will be freed after init */ . = ALIGN(4096); /* Init code and data */ - __init_begin = .; .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { + __init_begin = .; _sinittext = .; *(.init.text) _einittext = .; } .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { *(.init.data) } . = ALIGN(16); - __setup_start = .; - .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { *(.init.setup) } - __setup_end = .; - __initcall_start = .; + .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { + __setup_start = .; + *(.init.setup) + __setup_end = .; + } .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { + __initcall_start = .; INITCALLS + __initcall_end = .; } - __initcall_end = .; - __con_initcall_start = .; .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { + __con_initcall_start = .; *(.con_initcall.init) + __con_initcall_end = .; } - __con_initcall_end = .; SECURITY_INIT . = ALIGN(4); - __alt_instructions = .; .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { + __alt_instructions = .; *(.altinstructions) + __alt_instructions_end = .; } - __alt_instructions_end = .; .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { *(.altinstr_replacement) } @@ -150,32 +163,32 @@ SECTIONS .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) } .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) } . = ALIGN(4096); - __initramfs_start = .; - .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { *(.init.ramfs) } - __initramfs_end = .; + .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { + __initramfs_start = .; + *(.init.ramfs) + __initramfs_end = .; + } . = ALIGN(L1_CACHE_BYTES); - __per_cpu_start = .; - .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { *(.data.percpu) } - __per_cpu_end = .; + .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { + __per_cpu_start = .; + *(.data.percpu) + __per_cpu_end = .; + } . = ALIGN(4096); - __init_end = .; /* freed after init ends here */ - __bss_start = .; /* BSS */ - .bss.page_aligned : AT(ADDR(.bss.page_aligned) - LOAD_OFFSET) { - *(.bss.page_aligned) - } .bss : AT(ADDR(.bss) - LOAD_OFFSET) { + __init_end = .; + __bss_start = .; /* BSS */ + *(.bss.page_aligned) *(.bss) + . = ALIGN(4); + __bss_stop = .; + _end = . ; + /* This is where the kernel creates the early boot page tables */ + . = ALIGN(4096); + pg0 = . ; } - . = ALIGN(4); - __bss_stop = .; - - _end = . ; - - /* This is where the kernel creates the early boot page tables */ - . = ALIGN(4096); - pg0 = .; /* Sections to be discarded */ /DISCARD/ : { -- cgit v1.2.1 From 6ed018845f1172cdc94f8a20ad807df901c6b7eb Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 7 Dec 2006 02:14:03 +0100 Subject: [PATCH] i386: Add comment for align to vmlinux.lds Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen --- arch/i386/kernel/vmlinux.lds.S | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/i386') diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S index cbd24860fbb7..c217e18f1081 100644 --- a/arch/i386/kernel/vmlinux.lds.S +++ b/arch/i386/kernel/vmlinux.lds.S @@ -121,6 +121,12 @@ SECTIONS *(.smp_altinstr_replacement) __smp_alt_end = .; } + /* will be freed after init + * Following ALIGN() is required to make sure no other data falls on the + * same page where __smp_alt_end is pointing as that page might be freed + * after boot. Always make sure that ALIGN() directive is present after + * the section which contains __smp_alt_end. + */ . = ALIGN(4096); /* will be freed after init */ -- cgit v1.2.1 From 8621b81c744ff8880a1efe095a4dcd09763ddb5a Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 7 Dec 2006 02:14:03 +0100 Subject: [PATCH] i386: Reserve kernel memory starting from _text Currently when we are reserving the memory the kernel text resides in we start at __PHYSICAL_START which happens to be correct but not very obvious. In addition when we start relocating the kernel __PHYSICAL_START is the wrong value, as it is an absolute symbol that does not get relocated. By starting the reservation at __pa_symbol(_text) the code is clearer and will be correct when relocated. Signed-off-by: Eric W. Biederman Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen --- arch/i386/kernel/setup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index 141041dde74d..61539afbdf23 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -1118,8 +1118,8 @@ void __init setup_bootmem_allocator(void) * the (very unlikely) case of us accidentally initializing the * bootmem allocator with an invalid RAM area. */ - reserve_bootmem(__PHYSICAL_START, (PFN_PHYS(min_low_pfn) + - bootmap_size + PAGE_SIZE-1) - (__PHYSICAL_START)); + reserve_bootmem(__pa_symbol(_text), (PFN_PHYS(min_low_pfn) + + bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text)); /* * reserve physical page 0 - it's a special BIOS page on many boxes, -- cgit v1.2.1 From 2a43f3ede48ea3d5790b863b719a1e21c90a3697 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 7 Dec 2006 02:14:04 +0100 Subject: [PATCH] i386: CONFIG_PHYSICAL_START cleanup Defining __PHYSICAL_START and __KERNEL_START in asm-i386/page.h works but it triggers a full kernel rebuild for the silliest of reasons. This modifies the users to directly use CONFIG_PHYSICAL_START and linux/config.h which prevents the full rebuild problem, which makes the code much more maintainer and hopefully user friendly. Signed-off-by: Eric W. Biederman Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen --- arch/i386/boot/compressed/head.S | 7 +++---- arch/i386/boot/compressed/misc.c | 7 +++---- arch/i386/kernel/vmlinux.lds.S | 2 +- 3 files changed, 7 insertions(+), 9 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/boot/compressed/head.S b/arch/i386/boot/compressed/head.S index b5893e4ecd37..40a8de8270a9 100644 --- a/arch/i386/boot/compressed/head.S +++ b/arch/i386/boot/compressed/head.S @@ -25,7 +25,6 @@ #include #include -#include .globl startup_32 @@ -75,7 +74,7 @@ startup_32: popl %esi # discard address popl %esi # real mode pointer xorl %ebx,%ebx - ljmp $(__BOOT_CS), $__PHYSICAL_START + ljmp $(__BOOT_CS), $CONFIG_PHYSICAL_START /* * We come here, if we were loaded high. @@ -100,7 +99,7 @@ startup_32: popl %ecx # lcount popl %edx # high_buffer_start popl %eax # hcount - movl $__PHYSICAL_START,%edi + movl $CONFIG_PHYSICAL_START,%edi cli # make sure we don't get interrupted ljmp $(__BOOT_CS), $0x1000 # and jump to the move routine @@ -125,5 +124,5 @@ move_routine_start: movsl movl %ebx,%esi # Restore setup pointer xorl %ebx,%ebx - ljmp $(__BOOT_CS), $__PHYSICAL_START + ljmp $(__BOOT_CS), $CONFIG_PHYSICAL_START move_routine_end: diff --git a/arch/i386/boot/compressed/misc.c b/arch/i386/boot/compressed/misc.c index b2ccd543410d..20970ff44119 100644 --- a/arch/i386/boot/compressed/misc.c +++ b/arch/i386/boot/compressed/misc.c @@ -13,7 +13,6 @@ #include #include #include -#include /* * gzip declarations @@ -303,7 +302,7 @@ static void setup_normal_output_buffer(void) #else if ((RM_ALT_MEM_K > RM_EXT_MEM_K ? RM_ALT_MEM_K : RM_EXT_MEM_K) < 1024) error("Less than 2MB of memory"); #endif - output_data = (unsigned char *)__PHYSICAL_START; /* Normally Points to 1M */ + output_data = (unsigned char *)CONFIG_PHYSICAL_START; /* Normally Points to 1M */ free_mem_end_ptr = (long)real_mode; } @@ -326,8 +325,8 @@ static void setup_output_buffer_if_we_run_high(struct moveparams *mv) low_buffer_size = low_buffer_end - LOW_BUFFER_START; high_loaded = 1; free_mem_end_ptr = (long)high_buffer_start; - if ( (__PHYSICAL_START + low_buffer_size) > ((ulg)high_buffer_start)) { - high_buffer_start = (uch *)(__PHYSICAL_START + low_buffer_size); + if ( (CONFIG_PHYSICAL_START + low_buffer_size) > ((ulg)high_buffer_start)) { + high_buffer_start = (uch *)(CONFIG_PHYSICAL_START + low_buffer_size); mv->hcount = 0; /* say: we need not to move high_buffer */ } else mv->hcount = -1; diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S index c217e18f1081..f8d61ec4c8cb 100644 --- a/arch/i386/kernel/vmlinux.lds.S +++ b/arch/i386/kernel/vmlinux.lds.S @@ -27,7 +27,7 @@ PHDRS { } SECTIONS { - . = __KERNEL_START; + . = LOAD_OFFSET + CONFIG_PHYSICAL_START; phys_startup_32 = startup_32 - LOAD_OFFSET; /* read-only */ .text : AT(ADDR(.text) - LOAD_OFFSET) { -- cgit v1.2.1 From 968de4f02621db35b8ae5239c8cfc6664fb872d8 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 7 Dec 2006 02:14:04 +0100 Subject: [PATCH] i386: Relocatable kernel support This patch modifies the i386 kernel so that if CONFIG_RELOCATABLE is selected it will be able to be loaded at any 4K aligned address below 1G. The technique used is to compile the decompressor with -fPIC and modify it so the decompressor is fully relocatable. For the main kernel relocations are generated. Resulting in a kernel that is relocatable with no runtime overhead and no need to modify the source code. A reserved 32bit word in the parameters has been assigned to serve as a stack so we figure out where are running. Signed-off-by: Eric W. Biederman Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen --- arch/i386/Kconfig | 12 + arch/i386/Makefile | 4 +- arch/i386/boot/compressed/Makefile | 28 +- arch/i386/boot/compressed/head.S | 184 +++++++---- arch/i386/boot/compressed/misc.c | 261 ++++++++-------- arch/i386/boot/compressed/relocs.c | 563 ++++++++++++++++++++++++++++++++++ arch/i386/boot/compressed/vmlinux.lds | 43 +++ arch/i386/boot/compressed/vmlinux.scr | 3 +- arch/i386/boot/setup.S | 29 +- 9 files changed, 918 insertions(+), 209 deletions(-) create mode 100644 arch/i386/boot/compressed/relocs.c create mode 100644 arch/i386/boot/compressed/vmlinux.lds (limited to 'arch/i386') diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index 8ff1c6fb5aa1..d588ca874bb4 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -773,6 +773,18 @@ config CRASH_DUMP PHYSICAL_START. For more details see Documentation/kdump/kdump.txt +config RELOCATABLE + bool "Build a relocatable kernel" + help + This build a kernel image that retains relocation information + so it can be loaded someplace besides the default 1MB. + The relocations tend to the kernel binary about 10% larger, + but are discarded at runtime. + + One use is for the kexec on panic case where the recovery kernel + must live at a different physical address than the primary + kernel. + config PHYSICAL_START hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP) diff --git a/arch/i386/Makefile b/arch/i386/Makefile index 0677908dfa06..d1aca52bf690 100644 --- a/arch/i386/Makefile +++ b/arch/i386/Makefile @@ -26,7 +26,9 @@ endif LDFLAGS := -m elf_i386 OBJCOPYFLAGS := -O binary -R .note -R .comment -S -LDFLAGS_vmlinux := +ifdef CONFIG_RELOCATABLE +LDFLAGS_vmlinux := --emit-relocs +endif CHECKFLAGS += -D__i386__ CFLAGS += -pipe -msoft-float diff --git a/arch/i386/boot/compressed/Makefile b/arch/i386/boot/compressed/Makefile index 258ea95224f6..cc28da3a881e 100644 --- a/arch/i386/boot/compressed/Makefile +++ b/arch/i386/boot/compressed/Makefile @@ -4,22 +4,42 @@ # create a compressed vmlinux image from the original vmlinux # -targets := vmlinux vmlinux.bin vmlinux.bin.gz head.o misc.o piggy.o +targets := vmlinux vmlinux.bin vmlinux.bin.gz head.o misc.o piggy.o \ + vmlinux.bin.all vmlinux.relocs EXTRA_AFLAGS := -traditional -LDFLAGS_vmlinux := -Ttext $(IMAGE_OFFSET) -e startup_32 +LDFLAGS_vmlinux := -T +CFLAGS_misc.o += -fPIC +hostprogs-y := relocs -$(obj)/vmlinux: $(obj)/head.o $(obj)/misc.o $(obj)/piggy.o FORCE +$(obj)/vmlinux: $(src)/vmlinux.lds $(obj)/head.o $(obj)/misc.o $(obj)/piggy.o FORCE $(call if_changed,ld) @: $(obj)/vmlinux.bin: vmlinux FORCE $(call if_changed,objcopy) +quiet_cmd_relocs = RELOCS $@ + cmd_relocs = $(obj)/relocs $< > $@ +$(obj)/vmlinux.relocs: vmlinux $(obj)/relocs FORCE + $(call if_changed,relocs) + +vmlinux.bin.all-y := $(obj)/vmlinux.bin +vmlinux.bin.all-$(CONFIG_RELOCATABLE) += $(obj)/vmlinux.relocs +quiet_cmd_relocbin = BUILD $@ + cmd_relocbin = cat $(filter-out FORCE,$^) > $@ +$(obj)/vmlinux.bin.all: $(vmlinux.bin.all-y) FORCE + $(call if_changed,relocbin) + +ifdef CONFIG_RELOCATABLE +$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin.all FORCE + $(call if_changed,gzip) +else $(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE $(call if_changed,gzip) +endif LDFLAGS_piggy.o := -r --format binary --oformat elf32-i386 -T -$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.gz FORCE +$(obj)/piggy.o: $(src)/vmlinux.scr $(obj)/vmlinux.bin.gz FORCE $(call if_changed,ld) diff --git a/arch/i386/boot/compressed/head.S b/arch/i386/boot/compressed/head.S index 40a8de8270a9..e4dd7a6b9b0f 100644 --- a/arch/i386/boot/compressed/head.S +++ b/arch/i386/boot/compressed/head.S @@ -25,9 +25,11 @@ #include #include +#include +.section ".text.head" .globl startup_32 - + startup_32: cld cli @@ -36,93 +38,141 @@ startup_32: movl %eax,%es movl %eax,%fs movl %eax,%gs + movl %eax,%ss - lss stack_start,%esp - xorl %eax,%eax -1: incl %eax # check that A20 really IS enabled - movl %eax,0x000000 # loop forever if it isn't - cmpl %eax,0x100000 - je 1b +/* Calculate the delta between where we were compiled to run + * at and where we were actually loaded at. This can only be done + * with a short local call on x86. Nothing else will tell us what + * address we are running at. The reserved chunk of the real-mode + * data at 0x34-0x3f are used as the stack for this calculation. + * Only 4 bytes are needed. + */ + leal 0x40(%esi), %esp + call 1f +1: popl %ebp + subl $1b, %ebp + +/* Compute the delta between where we were compiled to run at + * and where the code will actually run at. + */ + /* Start with the delta to where the kernel will run at. If we are + * a relocatable kernel this is the delta to our load address otherwise + * this is the delta to CONFIG_PHYSICAL start. + */ +#ifdef CONFIG_RELOCATABLE + movl %ebp, %ebx +#else + movl $(CONFIG_PHYSICAL_START - startup_32), %ebx +#endif + + /* Replace the compressed data size with the uncompressed size */ + subl input_len(%ebp), %ebx + movl output_len(%ebp), %eax + addl %eax, %ebx + /* Add 8 bytes for every 32K input block */ + shrl $12, %eax + addl %eax, %ebx + /* Add 32K + 18 bytes of extra slack */ + addl $(32768 + 18), %ebx + /* Align on a 4K boundary */ + addl $4095, %ebx + andl $~4095, %ebx + +/* Copy the compressed kernel to the end of our buffer + * where decompression in place becomes safe. + */ + pushl %esi + leal _end(%ebp), %esi + leal _end(%ebx), %edi + movl $(_end - startup_32), %ecx + std + rep + movsb + cld + popl %esi + +/* Compute the kernel start address. + */ +#ifdef CONFIG_RELOCATABLE + leal startup_32(%ebp), %ebp +#else + movl $CONFIG_PHYSICAL_START, %ebp +#endif /* - * Initialize eflags. Some BIOS's leave bits like NT set. This would - * confuse the debugger if this code is traced. - * XXX - best to initialize before switching to protected mode. + * Jump to the relocated address. */ - pushl $0 - popfl + leal relocated(%ebx), %eax + jmp *%eax +.section ".text" +relocated: + /* * Clear BSS */ xorl %eax,%eax - movl $_edata,%edi - movl $_end,%ecx + leal _edata(%ebx),%edi + leal _end(%ebx), %ecx subl %edi,%ecx cld rep stosb + +/* + * Setup the stack for the decompressor + */ + leal stack_end(%ebx), %esp + /* * Do the decompression, and jump to the new kernel.. */ - subl $16,%esp # place for structure on the stack - movl %esp,%eax + movl output_len(%ebx), %eax + pushl %eax + pushl %ebp # output address + movl input_len(%ebx), %eax + pushl %eax # input_len + leal input_data(%ebx), %eax + pushl %eax # input_data + leal _end(%ebx), %eax + pushl %eax # end of the image as third argument pushl %esi # real mode pointer as second arg - pushl %eax # address of structure as first arg call decompress_kernel - orl %eax,%eax - jnz 3f - popl %esi # discard address - popl %esi # real mode pointer - xorl %ebx,%ebx - ljmp $(__BOOT_CS), $CONFIG_PHYSICAL_START + addl $20, %esp + popl %ecx + +#if CONFIG_RELOCATABLE +/* Find the address of the relocations. + */ + movl %ebp, %edi + addl %ecx, %edi + +/* Calculate the delta between where vmlinux was compiled to run + * and where it was actually loaded. + */ + movl %ebp, %ebx + subl $CONFIG_PHYSICAL_START, %ebx /* - * We come here, if we were loaded high. - * We need to move the move-in-place routine down to 0x1000 - * and then start it with the buffer addresses in registers, - * which we got from the stack. + * Process relocations. */ -3: - movl $move_routine_start,%esi - movl $0x1000,%edi - movl $move_routine_end,%ecx - subl %esi,%ecx - addl $3,%ecx - shrl $2,%ecx - cld - rep - movsl - - popl %esi # discard the address - popl %ebx # real mode pointer - popl %esi # low_buffer_start - popl %ecx # lcount - popl %edx # high_buffer_start - popl %eax # hcount - movl $CONFIG_PHYSICAL_START,%edi - cli # make sure we don't get interrupted - ljmp $(__BOOT_CS), $0x1000 # and jump to the move routine + +1: subl $4, %edi + movl 0(%edi), %ecx + testl %ecx, %ecx + jz 2f + addl %ebx, -__PAGE_OFFSET(%ebx, %ecx) + jmp 1b +2: +#endif /* - * Routine (template) for moving the decompressed kernel in place, - * if we were high loaded. This _must_ PIC-code ! + * Jump to the decompressed kernel. */ -move_routine_start: - movl %ecx,%ebp - shrl $2,%ecx - rep - movsl - movl %ebp,%ecx - andl $3,%ecx - rep - movsb - movl %edx,%esi - movl %eax,%ecx # NOTE: rep movsb won't move if %ecx == 0 - addl $3,%ecx - shrl $2,%ecx - rep - movsl - movl %ebx,%esi # Restore setup pointer xorl %ebx,%ebx - ljmp $(__BOOT_CS), $CONFIG_PHYSICAL_START -move_routine_end: + jmp *%ebp + +.bss +.balign 4 +stack: + .fill 4096, 1, 0 +stack_end: diff --git a/arch/i386/boot/compressed/misc.c b/arch/i386/boot/compressed/misc.c index 20970ff44119..4eac24e95a10 100644 --- a/arch/i386/boot/compressed/misc.c +++ b/arch/i386/boot/compressed/misc.c @@ -13,6 +13,88 @@ #include #include #include +#include + +/* WARNING!! + * This code is compiled with -fPIC and it is relocated dynamically + * at run time, but no relocation processing is performed. + * This means that it is not safe to place pointers in static structures. + */ + +/* + * Getting to provable safe in place decompression is hard. + * Worst case behaviours need to be analized. + * Background information: + * + * The file layout is: + * magic[2] + * method[1] + * flags[1] + * timestamp[4] + * extraflags[1] + * os[1] + * compressed data blocks[N] + * crc[4] orig_len[4] + * + * resulting in 18 bytes of non compressed data overhead. + * + * Files divided into blocks + * 1 bit (last block flag) + * 2 bits (block type) + * + * 1 block occurs every 32K -1 bytes or when there 50% compression has been achieved. + * The smallest block type encoding is always used. + * + * stored: + * 32 bits length in bytes. + * + * fixed: + * magic fixed tree. + * symbols. + * + * dynamic: + * dynamic tree encoding. + * symbols. + * + * + * The buffer for decompression in place is the length of the + * uncompressed data, plus a small amount extra to keep the algorithm safe. + * The compressed data is placed at the end of the buffer. The output + * pointer is placed at the start of the buffer and the input pointer + * is placed where the compressed data starts. Problems will occur + * when the output pointer overruns the input pointer. + * + * The output pointer can only overrun the input pointer if the input + * pointer is moving faster than the output pointer. A condition only + * triggered by data whose compressed form is larger than the uncompressed + * form. + * + * The worst case at the block level is a growth of the compressed data + * of 5 bytes per 32767 bytes. + * + * The worst case internal to a compressed block is very hard to figure. + * The worst case can at least be boundined by having one bit that represents + * 32764 bytes and then all of the rest of the bytes representing the very + * very last byte. + * + * All of which is enough to compute an amount of extra data that is required + * to be safe. To avoid problems at the block level allocating 5 extra bytes + * per 32767 bytes of data is sufficient. To avoind problems internal to a block + * adding an extra 32767 bytes (the worst case uncompressed block size) is + * sufficient, to ensure that in the worst case the decompressed data for + * block will stop the byte before the compressed data for a block begins. + * To avoid problems with the compressed data's meta information an extra 18 + * bytes are needed. Leading to the formula: + * + * extra_bytes = (uncompressed_size >> 12) + 32768 + 18 + decompressor_size. + * + * Adding 8 bytes per 32K is a bit excessive but much easier to calculate. + * Adding 32768 instead of 32767 just makes for round numbers. + * Adding the decompressor_size is necessary as it musht live after all + * of the data as well. Last I measured the decompressor is about 14K. + * 10K of actuall data and 4K of bss. + * + */ /* * gzip declarations @@ -29,15 +111,20 @@ typedef unsigned char uch; typedef unsigned short ush; typedef unsigned long ulg; -#define WSIZE 0x8000 /* Window size must be at least 32k, */ - /* and a power of two */ +#define WSIZE 0x80000000 /* Window size must be at least 32k, + * and a power of two + * We don't actually have a window just + * a huge output buffer so I report + * a 2G windows size, as that should + * always be larger than our output buffer. + */ -static uch *inbuf; /* input buffer */ -static uch window[WSIZE]; /* Sliding window buffer */ +static uch *inbuf; /* input buffer */ +static uch *window; /* Sliding window buffer, (and final output buffer) */ -static unsigned insize = 0; /* valid bytes in inbuf */ -static unsigned inptr = 0; /* index of next byte to be processed in inbuf */ -static unsigned outcnt = 0; /* bytes in output buffer */ +static unsigned insize; /* valid bytes in inbuf */ +static unsigned inptr; /* index of next byte to be processed in inbuf */ +static unsigned outcnt; /* bytes in output buffer */ /* gzip flag byte */ #define ASCII_FLAG 0x01 /* bit 0 set: file probably ASCII text */ @@ -88,8 +175,6 @@ extern unsigned char input_data[]; extern int input_len; static long bytes_out = 0; -static uch *output_data; -static unsigned long output_ptr = 0; static void *malloc(int size); static void free(void *where); @@ -99,17 +184,10 @@ static void *memcpy(void *dest, const void *src, unsigned n); static void putstr(const char *); -extern int end; -static long free_mem_ptr = (long)&end; -static long free_mem_end_ptr; +static unsigned long free_mem_ptr; +static unsigned long free_mem_end_ptr; -#define INPLACE_MOVE_ROUTINE 0x1000 -#define LOW_BUFFER_START 0x2000 -#define LOW_BUFFER_MAX 0x90000 #define HEAP_SIZE 0x3000 -static unsigned int low_buffer_end, low_buffer_size; -static int high_loaded =0; -static uch *high_buffer_start /* = (uch *)(((ulg)&end) + HEAP_SIZE)*/; static char *vidmem = (char *)0xb8000; static int vidport; @@ -150,7 +228,7 @@ static void gzip_mark(void **ptr) static void gzip_release(void **ptr) { - free_mem_ptr = (long) *ptr; + free_mem_ptr = (unsigned long) *ptr; } static void scroll(void) @@ -178,7 +256,7 @@ static void putstr(const char *s) y--; } } else { - vidmem [ ( x + cols * y ) * 2 ] = c; + vidmem [ ( x + cols * y ) * 2 ] = c; if ( ++x >= cols ) { x = 0; if ( ++y >= lines ) { @@ -223,58 +301,31 @@ static void* memcpy(void* dest, const void* src, unsigned n) */ static int fill_inbuf(void) { - if (insize != 0) { - error("ran out of input data"); - } - - inbuf = input_data; - insize = input_len; - inptr = 1; - return inbuf[0]; + error("ran out of input data"); + return 0; } /* =========================================================================== * Write the output window window[0..outcnt-1] and update crc and bytes_out. * (Used for the decompressed data only.) */ -static void flush_window_low(void) -{ - ulg c = crc; /* temporary variable */ - unsigned n; - uch *in, *out, ch; - - in = window; - out = &output_data[output_ptr]; - for (n = 0; n < outcnt; n++) { - ch = *out++ = *in++; - c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8); - } - crc = c; - bytes_out += (ulg)outcnt; - output_ptr += (ulg)outcnt; - outcnt = 0; -} - -static void flush_window_high(void) -{ - ulg c = crc; /* temporary variable */ - unsigned n; - uch *in, ch; - in = window; - for (n = 0; n < outcnt; n++) { - ch = *output_data++ = *in++; - if ((ulg)output_data == low_buffer_end) output_data=high_buffer_start; - c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8); - } - crc = c; - bytes_out += (ulg)outcnt; - outcnt = 0; -} - static void flush_window(void) { - if (high_loaded) flush_window_high(); - else flush_window_low(); + /* With my window equal to my output buffer + * I only need to compute the crc here. + */ + ulg c = crc; /* temporary variable */ + unsigned n; + uch *in, ch; + + in = window; + for (n = 0; n < outcnt; n++) { + ch = *in++; + c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8); + } + crc = c; + bytes_out += (ulg)outcnt; + outcnt = 0; } static void error(char *x) @@ -286,66 +337,8 @@ static void error(char *x) while(1); /* Halt */ } -#define STACK_SIZE (4096) - -long user_stack [STACK_SIZE]; - -struct { - long * a; - short b; - } stack_start = { & user_stack [STACK_SIZE] , __BOOT_DS }; - -static void setup_normal_output_buffer(void) -{ -#ifdef STANDARD_MEMORY_BIOS_CALL - if (RM_EXT_MEM_K < 1024) error("Less than 2MB of memory"); -#else - if ((RM_ALT_MEM_K > RM_EXT_MEM_K ? RM_ALT_MEM_K : RM_EXT_MEM_K) < 1024) error("Less than 2MB of memory"); -#endif - output_data = (unsigned char *)CONFIG_PHYSICAL_START; /* Normally Points to 1M */ - free_mem_end_ptr = (long)real_mode; -} - -struct moveparams { - uch *low_buffer_start; int lcount; - uch *high_buffer_start; int hcount; -}; - -static void setup_output_buffer_if_we_run_high(struct moveparams *mv) -{ - high_buffer_start = (uch *)(((ulg)&end) + HEAP_SIZE); -#ifdef STANDARD_MEMORY_BIOS_CALL - if (RM_EXT_MEM_K < (3*1024)) error("Less than 4MB of memory"); -#else - if ((RM_ALT_MEM_K > RM_EXT_MEM_K ? RM_ALT_MEM_K : RM_EXT_MEM_K) < (3*1024)) error("Less than 4MB of memory"); -#endif - mv->low_buffer_start = output_data = (unsigned char *)LOW_BUFFER_START; - low_buffer_end = ((unsigned int)real_mode > LOW_BUFFER_MAX - ? LOW_BUFFER_MAX : (unsigned int)real_mode) & ~0xfff; - low_buffer_size = low_buffer_end - LOW_BUFFER_START; - high_loaded = 1; - free_mem_end_ptr = (long)high_buffer_start; - if ( (CONFIG_PHYSICAL_START + low_buffer_size) > ((ulg)high_buffer_start)) { - high_buffer_start = (uch *)(CONFIG_PHYSICAL_START + low_buffer_size); - mv->hcount = 0; /* say: we need not to move high_buffer */ - } - else mv->hcount = -1; - mv->high_buffer_start = high_buffer_start; -} - -static void close_output_buffer_if_we_run_high(struct moveparams *mv) -{ - if (bytes_out > low_buffer_size) { - mv->lcount = low_buffer_size; - if (mv->hcount) - mv->hcount = bytes_out - low_buffer_size; - } else { - mv->lcount = bytes_out; - mv->hcount = 0; - } -} - -asmlinkage int decompress_kernel(struct moveparams *mv, void *rmode) +asmlinkage void decompress_kernel(void *rmode, unsigned long end, + uch *input_data, unsigned long input_len, uch *output) { real_mode = rmode; @@ -360,13 +353,25 @@ asmlinkage int decompress_kernel(struct moveparams *mv, void *rmode) lines = RM_SCREEN_INFO.orig_video_lines; cols = RM_SCREEN_INFO.orig_video_cols; - if (free_mem_ptr < 0x100000) setup_normal_output_buffer(); - else setup_output_buffer_if_we_run_high(mv); + window = output; /* Output buffer (Normally at 1M) */ + free_mem_ptr = end; /* Heap */ + free_mem_end_ptr = end + HEAP_SIZE; + inbuf = input_data; /* Input buffer */ + insize = input_len; + inptr = 0; + + if (((u32)output - CONFIG_PHYSICAL_START) & 0x3fffff) + error("Destination address not 4M aligned"); + if (end > ((-__PAGE_OFFSET-(512 <<20)-1) & 0x7fffffff)) + error("Destination address too large"); +#ifndef CONFIG_RELOCATABLE + if ((u32)output != CONFIG_PHYSICAL_START) + error("Wrong destination address"); +#endif makecrc(); putstr("Uncompressing Linux... "); gunzip(); putstr("Ok, booting the kernel.\n"); - if (high_loaded) close_output_buffer_if_we_run_high(mv); - return high_loaded; + return; } diff --git a/arch/i386/boot/compressed/relocs.c b/arch/i386/boot/compressed/relocs.c new file mode 100644 index 000000000000..0551ceb21bed --- /dev/null +++ b/arch/i386/boot/compressed/relocs.c @@ -0,0 +1,563 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#define USE_BSD +#include + +#define MAX_SHDRS 100 +static Elf32_Ehdr ehdr; +static Elf32_Shdr shdr[MAX_SHDRS]; +static Elf32_Sym *symtab[MAX_SHDRS]; +static Elf32_Rel *reltab[MAX_SHDRS]; +static char *strtab[MAX_SHDRS]; +static unsigned long reloc_count, reloc_idx; +static unsigned long *relocs; + +static void die(char *fmt, ...) +{ + va_list ap; + va_start(ap, fmt); + vfprintf(stderr, fmt, ap); + va_end(ap); + exit(1); +} + +static const char *sym_type(unsigned type) +{ + static const char *type_name[] = { +#define SYM_TYPE(X) [X] = #X + SYM_TYPE(STT_NOTYPE), + SYM_TYPE(STT_OBJECT), + SYM_TYPE(STT_FUNC), + SYM_TYPE(STT_SECTION), + SYM_TYPE(STT_FILE), + SYM_TYPE(STT_COMMON), + SYM_TYPE(STT_TLS), +#undef SYM_TYPE + }; + const char *name = "unknown sym type name"; + if (type < sizeof(type_name)/sizeof(type_name[0])) { + name = type_name[type]; + } + return name; +} + +static const char *sym_bind(unsigned bind) +{ + static const char *bind_name[] = { +#define SYM_BIND(X) [X] = #X + SYM_BIND(STB_LOCAL), + SYM_BIND(STB_GLOBAL), + SYM_BIND(STB_WEAK), +#undef SYM_BIND + }; + const char *name = "unknown sym bind name"; + if (bind < sizeof(bind_name)/sizeof(bind_name[0])) { + name = bind_name[bind]; + } + return name; +} + +static const char *sym_visibility(unsigned visibility) +{ + static const char *visibility_name[] = { +#define SYM_VISIBILITY(X) [X] = #X + SYM_VISIBILITY(STV_DEFAULT), + SYM_VISIBILITY(STV_INTERNAL), + SYM_VISIBILITY(STV_HIDDEN), + SYM_VISIBILITY(STV_PROTECTED), +#undef SYM_VISIBILITY + }; + const char *name = "unknown sym visibility name"; + if (visibility < sizeof(visibility_name)/sizeof(visibility_name[0])) { + name = visibility_name[visibility]; + } + return name; +} + +static const char *rel_type(unsigned type) +{ + static const char *type_name[] = { +#define REL_TYPE(X) [X] = #X + REL_TYPE(R_386_NONE), + REL_TYPE(R_386_32), + REL_TYPE(R_386_PC32), + REL_TYPE(R_386_GOT32), + REL_TYPE(R_386_PLT32), + REL_TYPE(R_386_COPY), + REL_TYPE(R_386_GLOB_DAT), + REL_TYPE(R_386_JMP_SLOT), + REL_TYPE(R_386_RELATIVE), + REL_TYPE(R_386_GOTOFF), + REL_TYPE(R_386_GOTPC), +#undef REL_TYPE + }; + const char *name = "unknown type rel type name"; + if (type < sizeof(type_name)/sizeof(type_name[0])) { + name = type_name[type]; + } + return name; +} + +static const char *sec_name(unsigned shndx) +{ + const char *sec_strtab; + const char *name; + sec_strtab = strtab[ehdr.e_shstrndx]; + name = ""; + if (shndx < ehdr.e_shnum) { + name = sec_strtab + shdr[shndx].sh_name; + } + else if (shndx == SHN_ABS) { + name = "ABSOLUTE"; + } + else if (shndx == SHN_COMMON) { + name = "COMMON"; + } + return name; +} + +static const char *sym_name(const char *sym_strtab, Elf32_Sym *sym) +{ + const char *name; + name = ""; + if (sym->st_name) { + name = sym_strtab + sym->st_name; + } + else { + name = sec_name(shdr[sym->st_shndx].sh_name); + } + return name; +} + + + +#if BYTE_ORDER == LITTLE_ENDIAN +#define le16_to_cpu(val) (val) +#define le32_to_cpu(val) (val) +#endif +#if BYTE_ORDER == BIG_ENDIAN +#define le16_to_cpu(val) bswap_16(val) +#define le32_to_cpu(val) bswap_32(val) +#endif + +static uint16_t elf16_to_cpu(uint16_t val) +{ + return le16_to_cpu(val); +} + +static uint32_t elf32_to_cpu(uint32_t val) +{ + return le32_to_cpu(val); +} + +static void read_ehdr(FILE *fp) +{ + if (fread(&ehdr, sizeof(ehdr), 1, fp) != 1) { + die("Cannot read ELF header: %s\n", + strerror(errno)); + } + if (memcmp(ehdr.e_ident, ELFMAG, 4) != 0) { + die("No ELF magic\n"); + } + if (ehdr.e_ident[EI_CLASS] != ELFCLASS32) { + die("Not a 32 bit executable\n"); + } + if (ehdr.e_ident[EI_DATA] != ELFDATA2LSB) { + die("Not a LSB ELF executable\n"); + } + if (ehdr.e_ident[EI_VERSION] != EV_CURRENT) { + die("Unknown ELF version\n"); + } + /* Convert the fields to native endian */ + ehdr.e_type = elf16_to_cpu(ehdr.e_type); + ehdr.e_machine = elf16_to_cpu(ehdr.e_machine); + ehdr.e_version = elf32_to_cpu(ehdr.e_version); + ehdr.e_entry = elf32_to_cpu(ehdr.e_entry); + ehdr.e_phoff = elf32_to_cpu(ehdr.e_phoff); + ehdr.e_shoff = elf32_to_cpu(ehdr.e_shoff); + ehdr.e_flags = elf32_to_cpu(ehdr.e_flags); + ehdr.e_ehsize = elf16_to_cpu(ehdr.e_ehsize); + ehdr.e_phentsize = elf16_to_cpu(ehdr.e_phentsize); + ehdr.e_phnum = elf16_to_cpu(ehdr.e_phnum); + ehdr.e_shentsize = elf16_to_cpu(ehdr.e_shentsize); + ehdr.e_shnum = elf16_to_cpu(ehdr.e_shnum); + ehdr.e_shstrndx = elf16_to_cpu(ehdr.e_shstrndx); + + if ((ehdr.e_type != ET_EXEC) && (ehdr.e_type != ET_DYN)) { + die("Unsupported ELF header type\n"); + } + if (ehdr.e_machine != EM_386) { + die("Not for x86\n"); + } + if (ehdr.e_version != EV_CURRENT) { + die("Unknown ELF version\n"); + } + if (ehdr.e_ehsize != sizeof(Elf32_Ehdr)) { + die("Bad Elf header size\n"); + } + if (ehdr.e_phentsize != sizeof(Elf32_Phdr)) { + die("Bad program header entry\n"); + } + if (ehdr.e_shentsize != sizeof(Elf32_Shdr)) { + die("Bad section header entry\n"); + } + if (ehdr.e_shstrndx >= ehdr.e_shnum) { + die("String table index out of bounds\n"); + } +} + +static void read_shdrs(FILE *fp) +{ + int i; + if (ehdr.e_shnum > MAX_SHDRS) { + die("%d section headers supported: %d\n", + ehdr.e_shnum, MAX_SHDRS); + } + if (fseek(fp, ehdr.e_shoff, SEEK_SET) < 0) { + die("Seek to %d failed: %s\n", + ehdr.e_shoff, strerror(errno)); + } + if (fread(&shdr, sizeof(shdr[0]), ehdr.e_shnum, fp) != ehdr.e_shnum) { + die("Cannot read ELF section headers: %s\n", + strerror(errno)); + } + for(i = 0; i < ehdr.e_shnum; i++) { + shdr[i].sh_name = elf32_to_cpu(shdr[i].sh_name); + shdr[i].sh_type = elf32_to_cpu(shdr[i].sh_type); + shdr[i].sh_flags = elf32_to_cpu(shdr[i].sh_flags); + shdr[i].sh_addr = elf32_to_cpu(shdr[i].sh_addr); + shdr[i].sh_offset = elf32_to_cpu(shdr[i].sh_offset); + shdr[i].sh_size = elf32_to_cpu(shdr[i].sh_size); + shdr[i].sh_link = elf32_to_cpu(shdr[i].sh_link); + shdr[i].sh_info = elf32_to_cpu(shdr[i].sh_info); + shdr[i].sh_addralign = elf32_to_cpu(shdr[i].sh_addralign); + shdr[i].sh_entsize = elf32_to_cpu(shdr[i].sh_entsize); + } + +} + +static void read_strtabs(FILE *fp) +{ + int i; + for(i = 0; i < ehdr.e_shnum; i++) { + if (shdr[i].sh_type != SHT_STRTAB) { + continue; + } + strtab[i] = malloc(shdr[i].sh_size); + if (!strtab[i]) { + die("malloc of %d bytes for strtab failed\n", + shdr[i].sh_size); + } + if (fseek(fp, shdr[i].sh_offset, SEEK_SET) < 0) { + die("Seek to %d failed: %s\n", + shdr[i].sh_offset, strerror(errno)); + } + if (fread(strtab[i], 1, shdr[i].sh_size, fp) != shdr[i].sh_size) { + die("Cannot read symbol table: %s\n", + strerror(errno)); + } + } +} + +static void read_symtabs(FILE *fp) +{ + int i,j; + for(i = 0; i < ehdr.e_shnum; i++) { + if (shdr[i].sh_type != SHT_SYMTAB) { + continue; + } + symtab[i] = malloc(shdr[i].sh_size); + if (!symtab[i]) { + die("malloc of %d bytes for symtab failed\n", + shdr[i].sh_size); + } + if (fseek(fp, shdr[i].sh_offset, SEEK_SET) < 0) { + die("Seek to %d failed: %s\n", + shdr[i].sh_offset, strerror(errno)); + } + if (fread(symtab[i], 1, shdr[i].sh_size, fp) != shdr[i].sh_size) { + die("Cannot read symbol table: %s\n", + strerror(errno)); + } + for(j = 0; j < shdr[i].sh_size/sizeof(symtab[i][0]); j++) { + symtab[i][j].st_name = elf32_to_cpu(symtab[i][j].st_name); + symtab[i][j].st_value = elf32_to_cpu(symtab[i][j].st_value); + symtab[i][j].st_size = elf32_to_cpu(symtab[i][j].st_size); + symtab[i][j].st_shndx = elf16_to_cpu(symtab[i][j].st_shndx); + } + } +} + + +static void read_relocs(FILE *fp) +{ + int i,j; + for(i = 0; i < ehdr.e_shnum; i++) { + if (shdr[i].sh_type != SHT_REL) { + continue; + } + reltab[i] = malloc(shdr[i].sh_size); + if (!reltab[i]) { + die("malloc of %d bytes for relocs failed\n", + shdr[i].sh_size); + } + if (fseek(fp, shdr[i].sh_offset, SEEK_SET) < 0) { + die("Seek to %d failed: %s\n", + shdr[i].sh_offset, strerror(errno)); + } + if (fread(reltab[i], 1, shdr[i].sh_size, fp) != shdr[i].sh_size) { + die("Cannot read symbol table: %s\n", + strerror(errno)); + } + for(j = 0; j < shdr[i].sh_size/sizeof(reltab[0][0]); j++) { + reltab[i][j].r_offset = elf32_to_cpu(reltab[i][j].r_offset); + reltab[i][j].r_info = elf32_to_cpu(reltab[i][j].r_info); + } + } +} + + +static void print_absolute_symbols(void) +{ + int i; + printf("Absolute symbols\n"); + printf(" Num: Value Size Type Bind Visibility Name\n"); + for(i = 0; i < ehdr.e_shnum; i++) { + char *sym_strtab; + Elf32_Sym *sh_symtab; + int j; + if (shdr[i].sh_type != SHT_SYMTAB) { + continue; + } + sh_symtab = symtab[i]; + sym_strtab = strtab[shdr[i].sh_link]; + for(j = 0; j < shdr[i].sh_size/sizeof(symtab[0][0]); j++) { + Elf32_Sym *sym; + const char *name; + sym = &symtab[i][j]; + name = sym_name(sym_strtab, sym); + if (sym->st_shndx != SHN_ABS) { + continue; + } + printf("%5d %08x %5d %10s %10s %12s %s\n", + j, sym->st_value, sym->st_size, + sym_type(ELF32_ST_TYPE(sym->st_info)), + sym_bind(ELF32_ST_BIND(sym->st_info)), + sym_visibility(ELF32_ST_VISIBILITY(sym->st_other)), + name); + } + } + printf("\n"); +} + +static void print_absolute_relocs(void) +{ + int i; + printf("Absolute relocations\n"); + printf("Offset Info Type Sym.Value Sym.Name\n"); + for(i = 0; i < ehdr.e_shnum; i++) { + char *sym_strtab; + Elf32_Sym *sh_symtab; + unsigned sec_applies, sec_symtab; + int j; + if (shdr[i].sh_type != SHT_REL) { + continue; + } + sec_symtab = shdr[i].sh_link; + sec_applies = shdr[i].sh_info; + if (!(shdr[sec_applies].sh_flags & SHF_ALLOC)) { + continue; + } + sh_symtab = symtab[sec_symtab]; + sym_strtab = strtab[shdr[sec_symtab].sh_link]; + for(j = 0; j < shdr[i].sh_size/sizeof(reltab[0][0]); j++) { + Elf32_Rel *rel; + Elf32_Sym *sym; + const char *name; + rel = &reltab[i][j]; + sym = &sh_symtab[ELF32_R_SYM(rel->r_info)]; + name = sym_name(sym_strtab, sym); + if (sym->st_shndx != SHN_ABS) { + continue; + } + printf("%08x %08x %10s %08x %s\n", + rel->r_offset, + rel->r_info, + rel_type(ELF32_R_TYPE(rel->r_info)), + sym->st_value, + name); + } + } + printf("\n"); +} + +static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym)) +{ + int i; + /* Walk through the relocations */ + for(i = 0; i < ehdr.e_shnum; i++) { + char *sym_strtab; + Elf32_Sym *sh_symtab; + unsigned sec_applies, sec_symtab; + int j; + if (shdr[i].sh_type != SHT_REL) { + continue; + } + sec_symtab = shdr[i].sh_link; + sec_applies = shdr[i].sh_info; + if (!(shdr[sec_applies].sh_flags & SHF_ALLOC)) { + continue; + } + sh_symtab = symtab[sec_symtab]; + sym_strtab = strtab[shdr[sec_symtab].sh_link]; + for(j = 0; j < shdr[i].sh_size/sizeof(reltab[0][0]); j++) { + Elf32_Rel *rel; + Elf32_Sym *sym; + unsigned r_type; + rel = &reltab[i][j]; + sym = &sh_symtab[ELF32_R_SYM(rel->r_info)]; + r_type = ELF32_R_TYPE(rel->r_info); + /* Don't visit relocations to absolute symbols */ + if (sym->st_shndx == SHN_ABS) { + continue; + } + if (r_type == R_386_PC32) { + /* PC relative relocations don't need to be adjusted */ + } + else if (r_type == R_386_32) { + /* Visit relocations that need to be adjusted */ + visit(rel, sym); + } + else { + die("Unsupported relocation type: %d\n", r_type); + } + } + } +} + +static void count_reloc(Elf32_Rel *rel, Elf32_Sym *sym) +{ + reloc_count += 1; +} + +static void collect_reloc(Elf32_Rel *rel, Elf32_Sym *sym) +{ + /* Remember the address that needs to be adjusted. */ + relocs[reloc_idx++] = rel->r_offset; +} + +static int cmp_relocs(const void *va, const void *vb) +{ + const unsigned long *a, *b; + a = va; b = vb; + return (*a == *b)? 0 : (*a > *b)? 1 : -1; +} + +static void emit_relocs(int as_text) +{ + int i; + /* Count how many relocations I have and allocate space for them. */ + reloc_count = 0; + walk_relocs(count_reloc); + relocs = malloc(reloc_count * sizeof(relocs[0])); + if (!relocs) { + die("malloc of %d entries for relocs failed\n", + reloc_count); + } + /* Collect up the relocations */ + reloc_idx = 0; + walk_relocs(collect_reloc); + + /* Order the relocations for more efficient processing */ + qsort(relocs, reloc_count, sizeof(relocs[0]), cmp_relocs); + + /* Print the relocations */ + if (as_text) { + /* Print the relocations in a form suitable that + * gas will like. + */ + printf(".section \".data.reloc\",\"a\"\n"); + printf(".balign 4\n"); + for(i = 0; i < reloc_count; i++) { + printf("\t .long 0x%08lx\n", relocs[i]); + } + printf("\n"); + } + else { + unsigned char buf[4]; + buf[0] = buf[1] = buf[2] = buf[3] = 0; + /* Print a stop */ + printf("%c%c%c%c", buf[0], buf[1], buf[2], buf[3]); + /* Now print each relocation */ + for(i = 0; i < reloc_count; i++) { + buf[0] = (relocs[i] >> 0) & 0xff; + buf[1] = (relocs[i] >> 8) & 0xff; + buf[2] = (relocs[i] >> 16) & 0xff; + buf[3] = (relocs[i] >> 24) & 0xff; + printf("%c%c%c%c", buf[0], buf[1], buf[2], buf[3]); + } + } +} + +static void usage(void) +{ + die("i386_reloc [--abs | --text] vmlinux\n"); +} + +int main(int argc, char **argv) +{ + int show_absolute; + int as_text; + const char *fname; + FILE *fp; + int i; + + show_absolute = 0; + as_text = 0; + fname = NULL; + for(i = 1; i < argc; i++) { + char *arg = argv[i]; + if (*arg == '-') { + if (strcmp(argv[1], "--abs") == 0) { + show_absolute = 1; + continue; + } + else if (strcmp(argv[1], "--text") == 0) { + as_text = 1; + continue; + } + } + else if (!fname) { + fname = arg; + continue; + } + usage(); + } + if (!fname) { + usage(); + } + fp = fopen(fname, "r"); + if (!fp) { + die("Cannot open %s: %s\n", + fname, strerror(errno)); + } + read_ehdr(fp); + read_shdrs(fp); + read_strtabs(fp); + read_symtabs(fp); + read_relocs(fp); + if (show_absolute) { + print_absolute_symbols(); + print_absolute_relocs(); + return 0; + } + emit_relocs(as_text); + return 0; +} diff --git a/arch/i386/boot/compressed/vmlinux.lds b/arch/i386/boot/compressed/vmlinux.lds new file mode 100644 index 000000000000..cc4854f6c6c1 --- /dev/null +++ b/arch/i386/boot/compressed/vmlinux.lds @@ -0,0 +1,43 @@ +OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386") +OUTPUT_ARCH(i386) +ENTRY(startup_32) +SECTIONS +{ + /* Be careful parts of head.S assume startup_32 is at + * address 0. + */ + . = 0 ; + .text.head : { + _head = . ; + *(.text.head) + _ehead = . ; + } + .data.compressed : { + *(.data.compressed) + } + .text : { + _text = .; /* Text */ + *(.text) + *(.text.*) + _etext = . ; + } + .rodata : { + _rodata = . ; + *(.rodata) /* read-only data */ + *(.rodata.*) + _erodata = . ; + } + .data : { + _data = . ; + *(.data) + *(.data.*) + _edata = . ; + } + .bss : { + _bss = . ; + *(.bss) + *(.bss.*) + *(COMMON) + _end = . ; + } +} diff --git a/arch/i386/boot/compressed/vmlinux.scr b/arch/i386/boot/compressed/vmlinux.scr index 1ed9d791f863..707a88f7f29e 100644 --- a/arch/i386/boot/compressed/vmlinux.scr +++ b/arch/i386/boot/compressed/vmlinux.scr @@ -1,9 +1,10 @@ SECTIONS { - .data : { + .data.compressed : { input_len = .; LONG(input_data_end - input_data) input_data = .; *(.data) + output_len = . - 4; input_data_end = .; } } diff --git a/arch/i386/boot/setup.S b/arch/i386/boot/setup.S index 3aec4538a113..9aa8b0518184 100644 --- a/arch/i386/boot/setup.S +++ b/arch/i386/boot/setup.S @@ -588,11 +588,6 @@ rmodeswtch_normal: call default_switch rmodeswtch_end: -# we get the code32 start address and modify the below 'jmpi' -# (loader may have changed it) - movl %cs:code32_start, %eax - movl %eax, %cs:code32 - # Now we move the system to its rightful place ... but we check if we have a # big-kernel. In that case we *must* not move it ... testb $LOADED_HIGH, %cs:loadflags @@ -788,11 +783,12 @@ a20_err_msg: a20_done: #endif /* CONFIG_X86_VOYAGER */ -# set up gdt and idt +# set up gdt and idt and 32bit start address lidt idt_48 # load idt with 0,0 xorl %eax, %eax # Compute gdt_base movw %ds, %ax # (Convert %ds:gdt to a linear ptr) shll $4, %eax + addl %eax, code32 addl $gdt, %eax movl %eax, (gdt_48+2) lgdt gdt_48 # load gdt with whatever is @@ -851,9 +847,26 @@ flush_instr: # Manual, Mixing 16-bit and 32-bit code, page 16-6) .byte 0x66, 0xea # prefix + jmpi-opcode -code32: .long 0x1000 # will be set to 0x100000 - # for big kernels +code32: .long startup_32 # will be set to %cs+startup_32 .word __BOOT_CS +.code32 +startup_32: + movl $(__BOOT_DS), %eax + movl %eax, %ds + movl %eax, %es + movl %eax, %fs + movl %eax, %gs + movl %eax, %ss + + xorl %eax, %eax +1: incl %eax # check that A20 really IS enabled + movl %eax, 0x00000000 # loop forever if it isn't + cmpl %eax, 0x00100000 + je 1b + + # Jump to the 32bit entry point + jmpl *(code32_start - start + (DELTA_INITSEG << 4))(%esi) +.code16 # Here's a bunch of information about your current kernel.. kernel_version: .ascii UTS_RELEASE -- cgit v1.2.1 From 6a044b3a0a1829ef19bb29548ffe553f48e8d80c Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 7 Dec 2006 02:14:04 +0100 Subject: [PATCH] i386: Warn upon absolute relocations being present o Relocations generated w.r.t absolute symbols are not processed as by definition, absolute symbols are not to be relocated. Explicitly warn user about absolutions relocations present at compile time. o These relocations get introduced either due to linker optimizations or some programming oversights. o Also create a list of symbols which have been audited to be safe and don't emit warnings for these. Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen --- arch/i386/boot/compressed/Makefile | 2 +- arch/i386/boot/compressed/relocs.c | 82 +++++++++++++++++++++++++++++++++----- 2 files changed, 73 insertions(+), 11 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/boot/compressed/Makefile b/arch/i386/boot/compressed/Makefile index cc28da3a881e..a661217f33ec 100644 --- a/arch/i386/boot/compressed/Makefile +++ b/arch/i386/boot/compressed/Makefile @@ -20,7 +20,7 @@ $(obj)/vmlinux.bin: vmlinux FORCE $(call if_changed,objcopy) quiet_cmd_relocs = RELOCS $@ - cmd_relocs = $(obj)/relocs $< > $@ + cmd_relocs = $(obj)/relocs $< > $@;$(obj)/relocs --abs-relocs $< $(obj)/vmlinux.relocs: vmlinux $(obj)/relocs FORCE $(call if_changed,relocs) diff --git a/arch/i386/boot/compressed/relocs.c b/arch/i386/boot/compressed/relocs.c index 0551ceb21bed..468da89153c4 100644 --- a/arch/i386/boot/compressed/relocs.c +++ b/arch/i386/boot/compressed/relocs.c @@ -19,6 +19,33 @@ static char *strtab[MAX_SHDRS]; static unsigned long reloc_count, reloc_idx; static unsigned long *relocs; +/* + * Following symbols have been audited. There values are constant and do + * not change if bzImage is loaded at a different physical address than + * the address for which it has been compiled. Don't warn user about + * absolute relocations present w.r.t these symbols. + */ +static const char* safe_abs_relocs[] = { + "__kernel_vsyscall", + "__kernel_rt_sigreturn", + "__kernel_sigreturn", + "SYSENTER_RETURN", +}; + +static int is_safe_abs_reloc(const char* sym_name) +{ + int i, array_size; + + array_size = sizeof(safe_abs_relocs)/sizeof(char*); + + for(i = 0; i < array_size; i++) { + if (!strcmp(sym_name, safe_abs_relocs[i])) + /* Match found */ + return 1; + } + return 0; +} + static void die(char *fmt, ...) { va_list ap; @@ -359,9 +386,8 @@ static void print_absolute_symbols(void) static void print_absolute_relocs(void) { - int i; - printf("Absolute relocations\n"); - printf("Offset Info Type Sym.Value Sym.Name\n"); + int i, printed = 0; + for(i = 0; i < ehdr.e_shnum; i++) { char *sym_strtab; Elf32_Sym *sh_symtab; @@ -387,6 +413,31 @@ static void print_absolute_relocs(void) if (sym->st_shndx != SHN_ABS) { continue; } + + /* Absolute symbols are not relocated if bzImage is + * loaded at a non-compiled address. Display a warning + * to user at compile time about the absolute + * relocations present. + * + * User need to audit the code to make sure + * some symbols which should have been section + * relative have not become absolute because of some + * linker optimization or wrong programming usage. + * + * Before warning check if this absolute symbol + * relocation is harmless. + */ + if (is_safe_abs_reloc(name)) + continue; + + if (!printed) { + printf("WARNING: Absolute relocations" + " present\n"); + printf("Offset Info Type Sym.Value " + "Sym.Name\n"); + printed = 1; + } + printf("%08x %08x %10s %08x %s\n", rel->r_offset, rel->r_info, @@ -395,7 +446,9 @@ static void print_absolute_relocs(void) name); } } - printf("\n"); + + if (printed) + printf("\n"); } static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym)) @@ -508,25 +561,31 @@ static void emit_relocs(int as_text) static void usage(void) { - die("i386_reloc [--abs | --text] vmlinux\n"); + die("relocs [--abs-syms |--abs-relocs | --text] vmlinux\n"); } int main(int argc, char **argv) { - int show_absolute; + int show_absolute_syms, show_absolute_relocs; int as_text; const char *fname; FILE *fp; int i; - show_absolute = 0; + show_absolute_syms = 0; + show_absolute_relocs = 0; as_text = 0; fname = NULL; for(i = 1; i < argc; i++) { char *arg = argv[i]; if (*arg == '-') { - if (strcmp(argv[1], "--abs") == 0) { - show_absolute = 1; + if (strcmp(argv[1], "--abs-syms") == 0) { + show_absolute_syms = 1; + continue; + } + + if (strcmp(argv[1], "--abs-relocs") == 0) { + show_absolute_relocs = 1; continue; } else if (strcmp(argv[1], "--text") == 0) { @@ -553,8 +612,11 @@ int main(int argc, char **argv) read_strtabs(fp); read_symtabs(fp); read_relocs(fp); - if (show_absolute) { + if (show_absolute_syms) { print_absolute_symbols(); + return 0; + } + if (show_absolute_relocs) { print_absolute_relocs(); return 0; } -- cgit v1.2.1 From e69f202d0a1419219198566e1c22218a5c71a9a6 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 7 Dec 2006 02:14:04 +0100 Subject: [PATCH] i386: Implement CONFIG_PHYSICAL_ALIGN o Now CONFIG_PHYSICAL_START is being replaced with CONFIG_PHYSICAL_ALIGN. Hardcoding the kernel physical start value creates a problem in relocatable kernel context due to boot loader limitations. For ex, if somebody compiles a relocatable kernel to be run from address 4MB, but this kernel will run from location 1MB as grub loads the kernel at physical address 1MB. Kernel thinks that I am a relocatable kernel and I should run from the address I have been loaded at. So somebody wanting to run kernel from 4MB alignment location (for improved performance regions) can't do that. o Hence, Eric proposed that probably CONFIG_PHYSICAL_ALIGN will make more sense in relocatable kernel context. At run time kernel will move itself to a physical addr location which meets user specified alignment restrictions. Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen --- arch/i386/Kconfig | 33 ++++++++++++++++++--------------- arch/i386/boot/compressed/head.S | 26 ++++++++++++++------------ arch/i386/boot/compressed/misc.c | 7 ++++--- arch/i386/kernel/vmlinux.lds.S | 3 ++- 4 files changed, 38 insertions(+), 31 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index d588ca874bb4..fd2fa7a7ec52 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -785,23 +785,26 @@ config RELOCATABLE must live at a different physical address than the primary kernel. -config PHYSICAL_START - hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP) - - default "0x1000000" if CRASH_DUMP +config PHYSICAL_ALIGN + hex "Alignment value to which kernel should be aligned" default "0x100000" + range 0x2000 0x400000 help - This gives the physical address where the kernel is loaded. Normally - for regular kernels this value is 0x100000 (1MB). But in the case - of kexec on panic the fail safe kernel needs to run at a different - address than the panic-ed kernel. This option is used to set the load - address for kernels used to capture crash dump on being kexec'ed - after panic. The default value for crash dump kernels is - 0x1000000 (16MB). This can also be set based on the "X" value as - specified in the "crashkernel=YM@XM" command line boot parameter - passed to the panic-ed kernel. Typically this parameter is set as - crashkernel=64M@16M. Please take a look at - Documentation/kdump/kdump.txt for more details about crash dumps. + This value puts the alignment restrictions on physical address + where kernel is loaded and run from. Kernel is compiled for an + address which meets above alignment restriction. + + If bootloader loads the kernel at a non-aligned address and + CONFIG_RELOCATABLE is set, kernel will move itself to nearest + address aligned to above value and run from there. + + If bootloader loads the kernel at a non-aligned address and + CONFIG_RELOCATABLE is not set, kernel will ignore the run time + load address and decompress itself to the address it has been + compiled for and run from there. The address for which kernel is + compiled already meets above alignment restrictions. Hence the + end result is that kernel runs from a physical address meeting + above alignment restrictions. Don't change this unless you know what you are doing. diff --git a/arch/i386/boot/compressed/head.S b/arch/i386/boot/compressed/head.S index e4dd7a6b9b0f..f395a4bb38bb 100644 --- a/arch/i386/boot/compressed/head.S +++ b/arch/i386/boot/compressed/head.S @@ -26,6 +26,7 @@ #include #include #include +#include .section ".text.head" .globl startup_32 @@ -52,17 +53,17 @@ startup_32: 1: popl %ebp subl $1b, %ebp -/* Compute the delta between where we were compiled to run at - * and where the code will actually run at. +/* %ebp contains the address we are loaded at by the boot loader and %ebx + * contains the address where we should move the kernel image temporarily + * for safe in-place decompression. */ - /* Start with the delta to where the kernel will run at. If we are - * a relocatable kernel this is the delta to our load address otherwise - * this is the delta to CONFIG_PHYSICAL start. - */ + #ifdef CONFIG_RELOCATABLE - movl %ebp, %ebx + movl %ebp, %ebx + addl $(CONFIG_PHYSICAL_ALIGN - 1), %ebx + andl $(~(CONFIG_PHYSICAL_ALIGN - 1)), %ebx #else - movl $(CONFIG_PHYSICAL_START - startup_32), %ebx + movl $LOAD_PHYSICAL_ADDR, %ebx #endif /* Replace the compressed data size with the uncompressed size */ @@ -94,9 +95,10 @@ startup_32: /* Compute the kernel start address. */ #ifdef CONFIG_RELOCATABLE - leal startup_32(%ebp), %ebp + addl $(CONFIG_PHYSICAL_ALIGN - 1), %ebp + andl $(~(CONFIG_PHYSICAL_ALIGN - 1)), %ebp #else - movl $CONFIG_PHYSICAL_START, %ebp + movl $LOAD_PHYSICAL_ADDR, %ebp #endif /* @@ -150,8 +152,8 @@ relocated: * and where it was actually loaded. */ movl %ebp, %ebx - subl $CONFIG_PHYSICAL_START, %ebx - + subl $LOAD_PHYSICAL_ADDR, %ebx + jz 2f /* Nothing to be done if loaded at compiled addr. */ /* * Process relocations. */ diff --git a/arch/i386/boot/compressed/misc.c b/arch/i386/boot/compressed/misc.c index 4eac24e95a10..dc1538931555 100644 --- a/arch/i386/boot/compressed/misc.c +++ b/arch/i386/boot/compressed/misc.c @@ -14,6 +14,7 @@ #include #include #include +#include /* WARNING!! * This code is compiled with -fPIC and it is relocated dynamically @@ -360,12 +361,12 @@ asmlinkage void decompress_kernel(void *rmode, unsigned long end, insize = input_len; inptr = 0; - if (((u32)output - CONFIG_PHYSICAL_START) & 0x3fffff) - error("Destination address not 4M aligned"); + if ((u32)output & (CONFIG_PHYSICAL_ALIGN -1)) + error("Destination address not CONFIG_PHYSICAL_ALIGN aligned"); if (end > ((-__PAGE_OFFSET-(512 <<20)-1) & 0x7fffffff)) error("Destination address too large"); #ifndef CONFIG_RELOCATABLE - if ((u32)output != CONFIG_PHYSICAL_START) + if ((u32)output != LOAD_PHYSICAL_ADDR) error("Wrong destination address"); #endif diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S index f8d61ec4c8cb..6860f20aa579 100644 --- a/arch/i386/kernel/vmlinux.lds.S +++ b/arch/i386/kernel/vmlinux.lds.S @@ -14,6 +14,7 @@ #include #include #include +#include OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386") OUTPUT_ARCH(i386) @@ -27,7 +28,7 @@ PHDRS { } SECTIONS { - . = LOAD_OFFSET + CONFIG_PHYSICAL_START; + . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR; phys_startup_32 = startup_32 - LOAD_OFFSET; /* read-only */ .text : AT(ADDR(.text) - LOAD_OFFSET) { -- cgit v1.2.1 From be274eeaf20b4c7155242645d5e2c48b023e609b Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 7 Dec 2006 02:14:04 +0100 Subject: [PATCH] i386: extend bzImage protocol for relocatable protected mode kernel Extend bzImage protocol to enable bootloaders to load a completely relocatable bzImage. Now protected mode component of kernel is also relocatable and a boot-loader can load the protected mode component at a differnt physical address than 1MB. (If kernel was built with CONFIG_RELOCATABLE) Kexec can make use of it to load this kernel at a different physical address to capture kernel crash dumps. Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/i386/boot/setup.S | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'arch/i386') diff --git a/arch/i386/boot/setup.S b/arch/i386/boot/setup.S index 9aa8b0518184..06edf1c66242 100644 --- a/arch/i386/boot/setup.S +++ b/arch/i386/boot/setup.S @@ -81,7 +81,7 @@ start: # This is the setup header, and it must start at %cs:2 (old 0x9020:2) .ascii "HdrS" # header signature - .word 0x0204 # header version number (>= 0x0105) + .word 0x0205 # header version number (>= 0x0105) # or else old loadlin-1.5 will fail) realmode_swtch: .word 0, 0 # default_switch, SETUPSEG start_sys_seg: .word SYSSEG @@ -160,6 +160,17 @@ ramdisk_max: .long (-__PAGE_OFFSET-(512 << 20)-1) & 0x7fffffff # The highest safe address for # the contents of an initrd +kernel_alignment: .long CONFIG_PHYSICAL_ALIGN #physical addr alignment + #required for protected mode + #kernel +#ifdef CONFIG_RELOCATABLE +relocatable_kernel: .byte 1 +#else +relocatable_kernel: .byte 0 +#endif +pad2: .byte 0 +pad3: .word 0 + trampoline: call start_of_setup .align 16 # The offset at this point is 0x240 -- cgit v1.2.1 From 4c7aa6c3b25ef96bc1b723238041195e3d8bf047 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 7 Dec 2006 02:14:04 +0100 Subject: [PATCH] i386: Mark CONFIG_RELOCATABLE EXPERIMENTAL Signed-off-by: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- arch/i386/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/i386') diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index fd2fa7a7ec52..1f0f7b60995e 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -774,7 +774,8 @@ config CRASH_DUMP For more details see Documentation/kdump/kdump.txt config RELOCATABLE - bool "Build a relocatable kernel" + bool "Build a relocatable kernel(EXPERIMENTAL)" + depends on EXPERIMENTAL help This build a kernel image that retains relocation information so it can be loaded someplace besides the default 1MB. -- cgit v1.2.1 From 74b47a7844501445d41d704fe7c626f4b1819508 Mon Sep 17 00:00:00 2001 From: Joe Korty Date: Thu, 7 Dec 2006 02:14:04 +0100 Subject: [PATCH] i386: Fix entry.S code with !CONFIG_VM86 The entry.S code at work_notifysig is surely wrong. It drops into unrelated code if the branch to work_notifysig_v86 is taken, and CONFIG_VM86=n. [PATCH] Make vm86 support optional tree 9b5daef5280800a0006343a17f63072658d91a1d pushed to git Jan 8, 2006, and first appears in 2.6.16 The 'fix' here is to also compile out the vm86 test & branch when CONFIG_VM86=n. Signed-off-by: Joe Korty Signed-off-by: Andi Kleen --- arch/i386/kernel/entry.S | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index d7423efaeea4..0220bc8cbb43 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -457,6 +457,7 @@ work_resched: work_notifysig: # deal with pending signals and # notify-resume requests +#ifdef CONFIG_VM86 testl $VM_MASK, PT_EFLAGS(%esp) movl %esp, %eax jne work_notifysig_v86 # returning to kernel-space or @@ -467,17 +468,18 @@ work_notifysig: # deal with pending signals and ALIGN work_notifysig_v86: -#ifdef CONFIG_VM86 pushl %ecx # save ti_flags for do_notify_resume CFI_ADJUST_CFA_OFFSET 4 call save_v86_state # %eax contains pt_regs pointer popl %ecx CFI_ADJUST_CFA_OFFSET -4 movl %eax, %esp +#else + movl %esp, %eax +#endif xorl %edx, %edx call do_notify_resume jmp resume_userspace_sig -#endif # perform syscall exit tracing ALIGN -- cgit v1.2.1 From 770d132f03ac15b12919f1bac481f4beda13e094 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 7 Dec 2006 02:14:05 +0100 Subject: [PATCH] i386: Retrieve CLFLUSH size from CPUID Also report it in /proc/cpuinfo similar to x86-64. Needed for followon patch Signed-off-by: Andi Kleen --- arch/i386/kernel/cpu/common.c | 3 +++ arch/i386/kernel/cpu/proc.c | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c index 6958ae5e2fa5..cda41aef79ad 100644 --- a/arch/i386/kernel/cpu/common.c +++ b/arch/i386/kernel/cpu/common.c @@ -309,6 +309,8 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 * c) #else c->apicid = (ebx >> 24) & 0xFF; #endif + if (c->x86_capability[0] & (1<<19)) + c->x86_clflush_size = ((ebx >> 8) & 0xff) * 8; } else { /* Have CPUID level 0 only - unheard of */ c->x86 = 4; @@ -373,6 +375,7 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c) c->x86_vendor_id[0] = '\0'; /* Unset */ c->x86_model_id[0] = '\0'; /* Unset */ c->x86_max_cores = 1; + c->x86_clflush_size = 32; memset(&c->x86_capability, 0, sizeof c->x86_capability); if (!have_cpuid_p()) { diff --git a/arch/i386/kernel/cpu/proc.c b/arch/i386/kernel/cpu/proc.c index 76aac088a323..6624d8583c42 100644 --- a/arch/i386/kernel/cpu/proc.c +++ b/arch/i386/kernel/cpu/proc.c @@ -152,9 +152,10 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_printf(m, " [%d]", i); } - seq_printf(m, "\nbogomips\t: %lu.%02lu\n\n", + seq_printf(m, "\nbogomips\t: %lu.%02lu\n", c->loops_per_jiffy/(500000/HZ), (c->loops_per_jiffy/(5000/HZ)) % 100); + seq_printf(m, "clflush size\t: %u\n\n", c->x86_clflush_size); return 0; } -- cgit v1.2.1 From 3760dd6efa75c98e223643da2eb7040406433053 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 7 Dec 2006 02:14:05 +0100 Subject: [PATCH] i386: Use CLFLUSH instead of WBINVD in change_page_attr CLFLUSH is a lot faster than WBINVD so try to use that. Signed-off-by: Andi Kleen --- arch/i386/mm/pageattr.c | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c index 8564b6ae17e3..ad91528bdc14 100644 --- a/arch/i386/mm/pageattr.c +++ b/arch/i386/mm/pageattr.c @@ -67,11 +67,17 @@ static struct page *split_large_page(unsigned long address, pgprot_t prot, return base; } -static void flush_kernel_map(void *dummy) +static void flush_kernel_map(void *arg) { - /* Could use CLFLUSH here if the CPU supports it (Hammer,P4) */ - if (boot_cpu_data.x86_model >= 4) + unsigned long adr = (unsigned long)arg; + + if (adr && cpu_has_clflush) { + int i; + for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size) + asm volatile("clflush (%0)" :: "r" (adr + i)); + } else if (boot_cpu_data.x86_model >= 4) wbinvd(); + /* Flush all to work around Errata in early athlons regarding * large page flushing. */ @@ -173,9 +179,9 @@ __change_page_attr(struct page *page, pgprot_t prot) return 0; } -static inline void flush_map(void) +static inline void flush_map(void *adr) { - on_each_cpu(flush_kernel_map, NULL, 1, 1); + on_each_cpu(flush_kernel_map, adr, 1, 1); } /* @@ -217,9 +223,13 @@ void global_flush_tlb(void) spin_lock_irq(&cpa_lock); list_replace_init(&df_list, &l); spin_unlock_irq(&cpa_lock); - flush_map(); - list_for_each_entry_safe(pg, next, &l, lru) + if (!cpu_has_clflush) + flush_map(0); + list_for_each_entry_safe(pg, next, &l, lru) { + if (cpu_has_clflush) + flush_map(page_address(pg)); __free_page(pg); + } } #ifdef CONFIG_DEBUG_PAGEALLOC -- cgit v1.2.1 From 11a4180c0b03e2ee0c948fd8430ee092dc1625b3 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 7 Dec 2006 02:14:06 +0100 Subject: [PATCH] i386: Use probe_kernel_address instead of __get_user in fault paths Makes the intention of the code cleaner to read and avoids a potential deadlock on mmap_sem. Also change the types of the arguments to not include __user because they're really not user addresses. Signed-off-by: Andi Kleen --- arch/i386/kernel/traps.c | 24 +++++++++++++----------- arch/i386/mm/fault.c | 12 ++++++------ 2 files changed, 19 insertions(+), 17 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index 237f4884a1e1..7b2f9f022089 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -380,7 +380,7 @@ void show_registers(struct pt_regs *regs) * time of the fault.. */ if (in_kernel) { - u8 __user *eip; + u8 *eip; int code_bytes = 64; unsigned char c; @@ -389,18 +389,20 @@ void show_registers(struct pt_regs *regs) printk(KERN_EMERG "Code: "); - eip = (u8 __user *)regs->eip - 43; - if (eip < (u8 __user *)PAGE_OFFSET || __get_user(c, eip)) { + eip = (u8 *)regs->eip - 43; + if (eip < (u8 *)PAGE_OFFSET || + probe_kernel_address(eip, c)) { /* try starting at EIP */ - eip = (u8 __user *)regs->eip; + eip = (u8 *)regs->eip; code_bytes = 32; } for (i = 0; i < code_bytes; i++, eip++) { - if (eip < (u8 __user *)PAGE_OFFSET || __get_user(c, eip)) { + if (eip < (u8 *)PAGE_OFFSET || + probe_kernel_address(eip, c)) { printk(" Bad EIP value."); break; } - if (eip == (u8 __user *)regs->eip) + if (eip == (u8 *)regs->eip) printk("<%02x> ", c); else printk("%02x ", c); @@ -416,7 +418,7 @@ static void handle_BUG(struct pt_regs *regs) if (eip < PAGE_OFFSET) return; - if (probe_kernel_address((unsigned short __user *)eip, ud2)) + if (probe_kernel_address((unsigned short *)eip, ud2)) return; if (ud2 != 0x0b0f) return; @@ -429,11 +431,11 @@ static void handle_BUG(struct pt_regs *regs) char *file; char c; - if (probe_kernel_address((unsigned short __user *)(eip + 2), - line)) + if (probe_kernel_address((unsigned short *)(eip + 2), line)) break; - if (__get_user(file, (char * __user *)(eip + 4)) || - (unsigned long)file < PAGE_OFFSET || __get_user(c, file)) + if (probe_kernel_address((char **)(eip + 4), file) || + (unsigned long)file < PAGE_OFFSET || + probe_kernel_address(file, c)) file = ""; printk(KERN_EMERG "kernel BUG at %s:%d!\n", file, line); diff --git a/arch/i386/mm/fault.c b/arch/i386/mm/fault.c index 2581575786c1..aaaa4d225f7e 100644 --- a/arch/i386/mm/fault.c +++ b/arch/i386/mm/fault.c @@ -22,9 +22,9 @@ #include #include #include +#include #include -#include #include #include #include @@ -167,7 +167,7 @@ static inline unsigned long get_segment_eip(struct pt_regs *regs, static int __is_prefetch(struct pt_regs *regs, unsigned long addr) { unsigned long limit; - unsigned long instr = get_segment_eip (regs, &limit); + unsigned char *instr = (unsigned char *)get_segment_eip (regs, &limit); int scan_more = 1; int prefetch = 0; int i; @@ -177,9 +177,9 @@ static int __is_prefetch(struct pt_regs *regs, unsigned long addr) unsigned char instr_hi; unsigned char instr_lo; - if (instr > limit) + if (instr > (unsigned char *)limit) break; - if (__get_user(opcode, (unsigned char __user *) instr)) + if (probe_kernel_address(instr, opcode)) break; instr_hi = opcode & 0xf0; @@ -204,9 +204,9 @@ static int __is_prefetch(struct pt_regs *regs, unsigned long addr) case 0x00: /* Prefetch instruction is 0x0F0D or 0x0F18 */ scan_more = 0; - if (instr > limit) + if (instr > (unsigned char *)limit) break; - if (__get_user(opcode, (unsigned char __user *) instr)) + if (probe_kernel_address(instr, opcode)) break; prefetch = (instr_lo == 0xF) && (opcode == 0x0D || opcode == 0x18); -- cgit v1.2.1 From 269c2d81ed66af7c09a1619ffe165f03e7470a5b Mon Sep 17 00:00:00 2001 From: "bibo,mao" Date: Thu, 7 Dec 2006 02:14:06 +0100 Subject: [PATCH] i386: i386 create e820.c to handle standard io/mem resources This patch creates new file named e820.c to hanle standard io/mem resources, moving request_standard_resources function from setup.c to e820.c. Also this patch modifies Makfile to compile file e820.c. Signed-off-by: bibo,mao Signed-off-by: Andi Kleen Makefile | 2 arch/i386/kernel/Makefile | 2 arch/i386/kernel/e820.c | 289 ++++++++++++++++++++++++++++++++++++++++++++++ arch/i386/kernel/setup.c | 276 ------------------------------------------- 3 files changed, 293 insertions(+), 274 deletions(-) --- arch/i386/kernel/Makefile | 2 +- arch/i386/kernel/e820.c | 289 ++++++++++++++++++++++++++++++++++++++++++++++ arch/i386/kernel/setup.c | 276 +------------------------------------------ 3 files changed, 293 insertions(+), 274 deletions(-) create mode 100644 arch/i386/kernel/e820.c (limited to 'arch/i386') diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile index 1a884b6e6e5c..f614854bd71f 100644 --- a/arch/i386/kernel/Makefile +++ b/arch/i386/kernel/Makefile @@ -6,7 +6,7 @@ extra-y := head.o init_task.o vmlinux.lds obj-y := process.o signal.o entry.o traps.o irq.o \ ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_i386.o \ - pci-dma.o i386_ksyms.o i387.o bootflag.o \ + pci-dma.o i386_ksyms.o i387.o bootflag.o e820.o\ quirks.o i8237.o topology.o alternative.o i8253.o tsc.o obj-$(CONFIG_STACKTRACE) += stacktrace.o diff --git a/arch/i386/kernel/e820.c b/arch/i386/kernel/e820.c new file mode 100644 index 000000000000..cce706049488 --- /dev/null +++ b/arch/i386/kernel/e820.c @@ -0,0 +1,289 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#ifdef CONFIG_EFI +int efi_enabled = 0; +EXPORT_SYMBOL(efi_enabled); +#endif + +struct e820map e820; +struct resource data_resource = { + .name = "Kernel data", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; + +struct resource code_resource = { + .name = "Kernel code", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; + +static struct resource system_rom_resource = { + .name = "System ROM", + .start = 0xf0000, + .end = 0xfffff, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}; + +static struct resource extension_rom_resource = { + .name = "Extension ROM", + .start = 0xe0000, + .end = 0xeffff, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}; + +static struct resource adapter_rom_resources[] = { { + .name = "Adapter ROM", + .start = 0xc8000, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +} }; + +static struct resource video_rom_resource = { + .name = "Video ROM", + .start = 0xc0000, + .end = 0xc7fff, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}; + +static struct resource video_ram_resource = { + .name = "Video RAM area", + .start = 0xa0000, + .end = 0xbffff, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; + +static struct resource standard_io_resources[] = { { + .name = "dma1", + .start = 0x0000, + .end = 0x001f, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "pic1", + .start = 0x0020, + .end = 0x0021, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "timer0", + .start = 0x0040, + .end = 0x0043, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "timer1", + .start = 0x0050, + .end = 0x0053, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "keyboard", + .start = 0x0060, + .end = 0x006f, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "dma page reg", + .start = 0x0080, + .end = 0x008f, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "pic2", + .start = 0x00a0, + .end = 0x00a1, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "dma2", + .start = 0x00c0, + .end = 0x00df, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "fpu", + .start = 0x00f0, + .end = 0x00ff, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +} }; + +#define romsignature(x) (*(unsigned short *)(x) == 0xaa55) + +static int __init romchecksum(unsigned char *rom, unsigned long length) +{ + unsigned char *p, sum = 0; + + for (p = rom; p < rom + length; p++) + sum += *p; + return sum == 0; +} + +static void __init probe_roms(void) +{ + unsigned long start, length, upper; + unsigned char *rom; + int i; + + /* video rom */ + upper = adapter_rom_resources[0].start; + for (start = video_rom_resource.start; start < upper; start += 2048) { + rom = isa_bus_to_virt(start); + if (!romsignature(rom)) + continue; + + video_rom_resource.start = start; + + /* 0 < length <= 0x7f * 512, historically */ + length = rom[2] * 512; + + /* if checksum okay, trust length byte */ + if (length && romchecksum(rom, length)) + video_rom_resource.end = start + length - 1; + + request_resource(&iomem_resource, &video_rom_resource); + break; + } + + start = (video_rom_resource.end + 1 + 2047) & ~2047UL; + if (start < upper) + start = upper; + + /* system rom */ + request_resource(&iomem_resource, &system_rom_resource); + upper = system_rom_resource.start; + + /* check for extension rom (ignore length byte!) */ + rom = isa_bus_to_virt(extension_rom_resource.start); + if (romsignature(rom)) { + length = extension_rom_resource.end - extension_rom_resource.start + 1; + if (romchecksum(rom, length)) { + request_resource(&iomem_resource, &extension_rom_resource); + upper = extension_rom_resource.start; + } + } + + /* check for adapter roms on 2k boundaries */ + for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) { + rom = isa_bus_to_virt(start); + if (!romsignature(rom)) + continue; + + /* 0 < length <= 0x7f * 512, historically */ + length = rom[2] * 512; + + /* but accept any length that fits if checksum okay */ + if (!length || start + length > upper || !romchecksum(rom, length)) + continue; + + adapter_rom_resources[i].start = start; + adapter_rom_resources[i].end = start + length - 1; + request_resource(&iomem_resource, &adapter_rom_resources[i]); + + start = adapter_rom_resources[i++].end & ~2047UL; + } +} + +/* + * Request address space for all standard RAM and ROM resources + * and also for regions reported as reserved by the e820. + */ +static void __init +legacy_init_iomem_resources(struct resource *code_resource, struct resource *data_resource) +{ + int i; + + probe_roms(); + for (i = 0; i < e820.nr_map; i++) { + struct resource *res; +#ifndef CONFIG_RESOURCES_64BIT + if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL) + continue; +#endif + res = kzalloc(sizeof(struct resource), GFP_ATOMIC); + switch (e820.map[i].type) { + case E820_RAM: res->name = "System RAM"; break; + case E820_ACPI: res->name = "ACPI Tables"; break; + case E820_NVS: res->name = "ACPI Non-volatile Storage"; break; + default: res->name = "reserved"; + } + res->start = e820.map[i].addr; + res->end = res->start + e820.map[i].size - 1; + res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; + if (request_resource(&iomem_resource, res)) { + kfree(res); + continue; + } + if (e820.map[i].type == E820_RAM) { + /* + * We don't know which RAM region contains kernel data, + * so we try it repeatedly and let the resource manager + * test it. + */ + request_resource(res, code_resource); + request_resource(res, data_resource); +#ifdef CONFIG_KEXEC + request_resource(res, &crashk_res); +#endif + } + } +} + +/* + * Request address space for all standard resources + * + * This is called just before pcibios_init(), which is also a + * subsys_initcall, but is linked in later (in arch/i386/pci/common.c). + */ +static int __init request_standard_resources(void) +{ + int i; + + printk("Setting up standard PCI resources\n"); + if (efi_enabled) + efi_initialize_iomem_resources(&code_resource, &data_resource); + else + legacy_init_iomem_resources(&code_resource, &data_resource); + + /* EFI systems may still have VGA */ + request_resource(&iomem_resource, &video_ram_resource); + + /* request I/O space for devices used on all i[345]86 PCs */ + for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++) + request_resource(&ioport_resource, &standard_io_resources[i]); + return 0; +} + +subsys_initcall(request_standard_resources); diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index 61539afbdf23..acd2d9392abb 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -76,11 +76,9 @@ int disable_pse __devinitdata = 0; /* * Machine setup.. */ - -#ifdef CONFIG_EFI -int efi_enabled = 0; -EXPORT_SYMBOL(efi_enabled); -#endif +extern struct e820map e820; +extern struct resource code_resource; +extern struct resource data_resource; /* cpu data as detected by the assembly code in head.S */ struct cpuinfo_x86 new_cpu_data __initdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 }; @@ -134,7 +132,6 @@ struct ist_info ist_info; defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE) EXPORT_SYMBOL(ist_info); #endif -struct e820map e820; extern void early_cpu_init(void); extern int root_mountflags; @@ -149,203 +146,6 @@ static char command_line[COMMAND_LINE_SIZE]; unsigned char __initdata boot_params[PARAM_SIZE]; -static struct resource data_resource = { - .name = "Kernel data", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM -}; - -static struct resource code_resource = { - .name = "Kernel code", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM -}; - -static struct resource system_rom_resource = { - .name = "System ROM", - .start = 0xf0000, - .end = 0xfffff, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}; - -static struct resource extension_rom_resource = { - .name = "Extension ROM", - .start = 0xe0000, - .end = 0xeffff, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}; - -static struct resource adapter_rom_resources[] = { { - .name = "Adapter ROM", - .start = 0xc8000, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}, { - .name = "Adapter ROM", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}, { - .name = "Adapter ROM", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}, { - .name = "Adapter ROM", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}, { - .name = "Adapter ROM", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}, { - .name = "Adapter ROM", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -} }; - -static struct resource video_rom_resource = { - .name = "Video ROM", - .start = 0xc0000, - .end = 0xc7fff, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}; - -static struct resource video_ram_resource = { - .name = "Video RAM area", - .start = 0xa0000, - .end = 0xbffff, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM -}; - -static struct resource standard_io_resources[] = { { - .name = "dma1", - .start = 0x0000, - .end = 0x001f, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "pic1", - .start = 0x0020, - .end = 0x0021, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "timer0", - .start = 0x0040, - .end = 0x0043, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "timer1", - .start = 0x0050, - .end = 0x0053, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "keyboard", - .start = 0x0060, - .end = 0x006f, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "dma page reg", - .start = 0x0080, - .end = 0x008f, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "pic2", - .start = 0x00a0, - .end = 0x00a1, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "dma2", - .start = 0x00c0, - .end = 0x00df, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "fpu", - .start = 0x00f0, - .end = 0x00ff, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -} }; - -#define romsignature(x) (*(unsigned short *)(x) == 0xaa55) - -static int __init romchecksum(unsigned char *rom, unsigned long length) -{ - unsigned char *p, sum = 0; - - for (p = rom; p < rom + length; p++) - sum += *p; - return sum == 0; -} - -static void __init probe_roms(void) -{ - unsigned long start, length, upper; - unsigned char *rom; - int i; - - /* video rom */ - upper = adapter_rom_resources[0].start; - for (start = video_rom_resource.start; start < upper; start += 2048) { - rom = isa_bus_to_virt(start); - if (!romsignature(rom)) - continue; - - video_rom_resource.start = start; - - /* 0 < length <= 0x7f * 512, historically */ - length = rom[2] * 512; - - /* if checksum okay, trust length byte */ - if (length && romchecksum(rom, length)) - video_rom_resource.end = start + length - 1; - - request_resource(&iomem_resource, &video_rom_resource); - break; - } - - start = (video_rom_resource.end + 1 + 2047) & ~2047UL; - if (start < upper) - start = upper; - - /* system rom */ - request_resource(&iomem_resource, &system_rom_resource); - upper = system_rom_resource.start; - - /* check for extension rom (ignore length byte!) */ - rom = isa_bus_to_virt(extension_rom_resource.start); - if (romsignature(rom)) { - length = extension_rom_resource.end - extension_rom_resource.start + 1; - if (romchecksum(rom, length)) { - request_resource(&iomem_resource, &extension_rom_resource); - upper = extension_rom_resource.start; - } - } - - /* check for adapter roms on 2k boundaries */ - for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) { - rom = isa_bus_to_virt(start); - if (!romsignature(rom)) - continue; - - /* 0 < length <= 0x7f * 512, historically */ - length = rom[2] * 512; - - /* but accept any length that fits if checksum okay */ - if (!length || start + length > upper || !romchecksum(rom, length)) - continue; - - adapter_rom_resources[i].start = start; - adapter_rom_resources[i].end = start + length - 1; - request_resource(&iomem_resource, &adapter_rom_resources[i]); - - start = adapter_rom_resources[i++].end & ~2047UL; - } -} - static void __init limit_regions(unsigned long long size) { unsigned long long current_addr = 0; @@ -1200,77 +1000,7 @@ void __init remapped_pgdat_init(void) } } -/* - * Request address space for all standard RAM and ROM resources - * and also for regions reported as reserved by the e820. - */ -static void __init -legacy_init_iomem_resources(struct resource *code_resource, struct resource *data_resource) -{ - int i; - - probe_roms(); - for (i = 0; i < e820.nr_map; i++) { - struct resource *res; -#ifndef CONFIG_RESOURCES_64BIT - if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL) - continue; -#endif - res = kzalloc(sizeof(struct resource), GFP_ATOMIC); - switch (e820.map[i].type) { - case E820_RAM: res->name = "System RAM"; break; - case E820_ACPI: res->name = "ACPI Tables"; break; - case E820_NVS: res->name = "ACPI Non-volatile Storage"; break; - default: res->name = "reserved"; - } - res->start = e820.map[i].addr; - res->end = res->start + e820.map[i].size - 1; - res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; - if (request_resource(&iomem_resource, res)) { - kfree(res); - continue; - } - if (e820.map[i].type == E820_RAM) { - /* - * We don't know which RAM region contains kernel data, - * so we try it repeatedly and let the resource manager - * test it. - */ - request_resource(res, code_resource); - request_resource(res, data_resource); -#ifdef CONFIG_KEXEC - request_resource(res, &crashk_res); -#endif - } - } -} - -/* - * Request address space for all standard resources - * - * This is called just before pcibios_init(), which is also a - * subsys_initcall, but is linked in later (in arch/i386/pci/common.c). - */ -static int __init request_standard_resources(void) -{ - int i; - - printk("Setting up standard PCI resources\n"); - if (efi_enabled) - efi_initialize_iomem_resources(&code_resource, &data_resource); - else - legacy_init_iomem_resources(&code_resource, &data_resource); - - /* EFI systems may still have VGA */ - request_resource(&iomem_resource, &video_ram_resource); - - /* request I/O space for devices used on all i[345]86 PCs */ - for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++) - request_resource(&ioport_resource, &standard_io_resources[i]); - return 0; -} -subsys_initcall(request_standard_resources); static void __init register_memory(void) { -- cgit v1.2.1 From 8e3342f736dd1c19ce7c28625dedd7d8730fc7ad Mon Sep 17 00:00:00 2001 From: "bibo,mao" Date: Thu, 7 Dec 2006 02:14:06 +0100 Subject: [PATCH] i386: create e820.c for e820 map sanitize and copy function This patch moves bios e820 map sanitize and copy function from setup.c to e820.c Signed-off-by: bibo,mao Signed-off-by: Andi Kleen arch/i386/kernel/e820.c | 252 +++++++++++++++++++++++++++++++++++++++++++++++ arch/i386/kernel/setup.c | 240 -------------------------------------------- 2 files changed, 252 insertions(+), 240 deletions(-) --- arch/i386/kernel/e820.c | 252 +++++++++++++++++++++++++++++++++++++++++++++++ arch/i386/kernel/setup.c | 240 -------------------------------------------- 2 files changed, 252 insertions(+), 240 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/e820.c b/arch/i386/kernel/e820.c index cce706049488..0db95760b073 100644 --- a/arch/i386/kernel/e820.c +++ b/arch/i386/kernel/e820.c @@ -19,6 +19,14 @@ EXPORT_SYMBOL(efi_enabled); #endif struct e820map e820; +struct change_member { + struct e820entry *pbios; /* pointer to original bios entry */ + unsigned long long addr; /* address for this change point */ +}; +static struct change_member change_point_list[2*E820MAX] __initdata; +static struct change_member *change_point[2*E820MAX] __initdata; +static struct e820entry *overlap_list[E820MAX] __initdata; +static struct e820entry new_bios[E820MAX] __initdata; struct resource data_resource = { .name = "Kernel data", .start = 0, @@ -287,3 +295,247 @@ static int __init request_standard_resources(void) } subsys_initcall(request_standard_resources); + +void __init add_memory_region(unsigned long long start, + unsigned long long size, int type) +{ + int x; + + if (!efi_enabled) { + x = e820.nr_map; + + if (x == E820MAX) { + printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); + return; + } + + e820.map[x].addr = start; + e820.map[x].size = size; + e820.map[x].type = type; + e820.nr_map++; + } +} /* add_memory_region */ + +/* + * Sanitize the BIOS e820 map. + * + * Some e820 responses include overlapping entries. The following + * replaces the original e820 map with a new one, removing overlaps. + * + */ +int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) +{ + struct change_member *change_tmp; + unsigned long current_type, last_type; + unsigned long long last_addr; + int chgidx, still_changing; + int overlap_entries; + int new_bios_entry; + int old_nr, new_nr, chg_nr; + int i; + + /* + Visually we're performing the following (1,2,3,4 = memory types)... + + Sample memory map (w/overlaps): + ____22__________________ + ______________________4_ + ____1111________________ + _44_____________________ + 11111111________________ + ____________________33__ + ___________44___________ + __________33333_________ + ______________22________ + ___________________2222_ + _________111111111______ + _____________________11_ + _________________4______ + + Sanitized equivalent (no overlap): + 1_______________________ + _44_____________________ + ___1____________________ + ____22__________________ + ______11________________ + _________1______________ + __________3_____________ + ___________44___________ + _____________33_________ + _______________2________ + ________________1_______ + _________________4______ + ___________________2____ + ____________________33__ + ______________________4_ + */ + printk("sanitize start\n"); + /* if there's only one memory region, don't bother */ + if (*pnr_map < 2) { + printk("sanitize bail 0\n"); + return -1; + } + + old_nr = *pnr_map; + + /* bail out if we find any unreasonable addresses in bios map */ + for (i=0; iaddr = biosmap[i].addr; + change_point[chgidx++]->pbios = &biosmap[i]; + change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size; + change_point[chgidx++]->pbios = &biosmap[i]; + } + } + chg_nr = chgidx; /* true number of change-points */ + + /* sort change-point list by memory addresses (low -> high) */ + still_changing = 1; + while (still_changing) { + still_changing = 0; + for (i=1; i < chg_nr; i++) { + /* if > , swap */ + /* or, if current= & last=, swap */ + if ((change_point[i]->addr < change_point[i-1]->addr) || + ((change_point[i]->addr == change_point[i-1]->addr) && + (change_point[i]->addr == change_point[i]->pbios->addr) && + (change_point[i-1]->addr != change_point[i-1]->pbios->addr)) + ) + { + change_tmp = change_point[i]; + change_point[i] = change_point[i-1]; + change_point[i-1] = change_tmp; + still_changing=1; + } + } + } + + /* create a new bios memory map, removing overlaps */ + overlap_entries=0; /* number of entries in the overlap table */ + new_bios_entry=0; /* index for creating new bios map entries */ + last_type = 0; /* start with undefined memory type */ + last_addr = 0; /* start with 0 as last starting address */ + /* loop through change-points, determining affect on the new bios map */ + for (chgidx=0; chgidx < chg_nr; chgidx++) + { + /* keep track of all overlapping bios entries */ + if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr) + { + /* add map entry to overlap list (> 1 entry implies an overlap) */ + overlap_list[overlap_entries++]=change_point[chgidx]->pbios; + } + else + { + /* remove entry from list (order independent, so swap with last) */ + for (i=0; ipbios) + overlap_list[i] = overlap_list[overlap_entries-1]; + } + overlap_entries--; + } + /* if there are overlapping entries, decide which "type" to use */ + /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */ + current_type = 0; + for (i=0; itype > current_type) + current_type = overlap_list[i]->type; + /* continue building up new bios map based on this information */ + if (current_type != last_type) { + if (last_type != 0) { + new_bios[new_bios_entry].size = + change_point[chgidx]->addr - last_addr; + /* move forward only if the new size was non-zero */ + if (new_bios[new_bios_entry].size != 0) + if (++new_bios_entry >= E820MAX) + break; /* no more space left for new bios entries */ + } + if (current_type != 0) { + new_bios[new_bios_entry].addr = change_point[chgidx]->addr; + new_bios[new_bios_entry].type = current_type; + last_addr=change_point[chgidx]->addr; + } + last_type = current_type; + } + } + new_nr = new_bios_entry; /* retain count for new bios entries */ + + /* copy new bios mapping into original location */ + memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry)); + *pnr_map = new_nr; + + printk("sanitize end\n"); + return 0; +} + +/* + * Copy the BIOS e820 map into a safe place. + * + * Sanity-check it while we're at it.. + * + * If we're lucky and live on a modern system, the setup code + * will have given us a memory map that we can use to properly + * set up memory. If we aren't, we'll fake a memory map. + * + * We check to see that the memory map contains at least 2 elements + * before we'll use it, because the detection code in setup.S may + * not be perfect and most every PC known to man has two memory + * regions: one from 0 to 640k, and one from 1mb up. (The IBM + * thinkpad 560x, for example, does not cooperate with the memory + * detection code.) + */ +int __init copy_e820_map(struct e820entry * biosmap, int nr_map) +{ + /* Only one memory region (or negative)? Ignore it */ + if (nr_map < 2) + return -1; + + do { + unsigned long long start = biosmap->addr; + unsigned long long size = biosmap->size; + unsigned long long end = start + size; + unsigned long type = biosmap->type; + printk("copy_e820_map() start: %016Lx size: %016Lx end: %016Lx type: %ld\n", start, size, end, type); + + /* Overflow in 64 bits? Ignore the memory map. */ + if (start > end) + return -1; + + /* + * Some BIOSes claim RAM in the 640k - 1M region. + * Not right. Fix it up. + */ + if (type == E820_RAM) { + printk("copy_e820_map() type is E820_RAM\n"); + if (start < 0x100000ULL && end > 0xA0000ULL) { + printk("copy_e820_map() lies in range...\n"); + if (start < 0xA0000ULL) { + printk("copy_e820_map() start < 0xA0000ULL\n"); + add_memory_region(start, 0xA0000ULL-start, type); + } + if (end <= 0x100000ULL) { + printk("copy_e820_map() end <= 0x100000ULL\n"); + continue; + } + start = 0x100000ULL; + size = end - start; + } + } + add_memory_region(start, size, type); + } while (biosmap++,--nr_map); + return 0; +} + diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index acd2d9392abb..b7509aec0eb1 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -191,26 +191,6 @@ static void __init limit_regions(unsigned long long size) } } -void __init add_memory_region(unsigned long long start, - unsigned long long size, int type) -{ - int x; - - if (!efi_enabled) { - x = e820.nr_map; - - if (x == E820MAX) { - printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); - return; - } - - e820.map[x].addr = start; - e820.map[x].size = size; - e820.map[x].type = type; - e820.nr_map++; - } -} /* add_memory_region */ - #define E820_DEBUG 1 static void __init print_memory_map(char *who) @@ -239,226 +219,6 @@ static void __init print_memory_map(char *who) } } -/* - * Sanitize the BIOS e820 map. - * - * Some e820 responses include overlapping entries. The following - * replaces the original e820 map with a new one, removing overlaps. - * - */ -struct change_member { - struct e820entry *pbios; /* pointer to original bios entry */ - unsigned long long addr; /* address for this change point */ -}; -static struct change_member change_point_list[2*E820MAX] __initdata; -static struct change_member *change_point[2*E820MAX] __initdata; -static struct e820entry *overlap_list[E820MAX] __initdata; -static struct e820entry new_bios[E820MAX] __initdata; - -int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) -{ - struct change_member *change_tmp; - unsigned long current_type, last_type; - unsigned long long last_addr; - int chgidx, still_changing; - int overlap_entries; - int new_bios_entry; - int old_nr, new_nr, chg_nr; - int i; - - /* - Visually we're performing the following (1,2,3,4 = memory types)... - - Sample memory map (w/overlaps): - ____22__________________ - ______________________4_ - ____1111________________ - _44_____________________ - 11111111________________ - ____________________33__ - ___________44___________ - __________33333_________ - ______________22________ - ___________________2222_ - _________111111111______ - _____________________11_ - _________________4______ - - Sanitized equivalent (no overlap): - 1_______________________ - _44_____________________ - ___1____________________ - ____22__________________ - ______11________________ - _________1______________ - __________3_____________ - ___________44___________ - _____________33_________ - _______________2________ - ________________1_______ - _________________4______ - ___________________2____ - ____________________33__ - ______________________4_ - */ - - /* if there's only one memory region, don't bother */ - if (*pnr_map < 2) - return -1; - - old_nr = *pnr_map; - - /* bail out if we find any unreasonable addresses in bios map */ - for (i=0; iaddr = biosmap[i].addr; - change_point[chgidx++]->pbios = &biosmap[i]; - change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size; - change_point[chgidx++]->pbios = &biosmap[i]; - } - } - chg_nr = chgidx; /* true number of change-points */ - - /* sort change-point list by memory addresses (low -> high) */ - still_changing = 1; - while (still_changing) { - still_changing = 0; - for (i=1; i < chg_nr; i++) { - /* if > , swap */ - /* or, if current= & last=, swap */ - if ((change_point[i]->addr < change_point[i-1]->addr) || - ((change_point[i]->addr == change_point[i-1]->addr) && - (change_point[i]->addr == change_point[i]->pbios->addr) && - (change_point[i-1]->addr != change_point[i-1]->pbios->addr)) - ) - { - change_tmp = change_point[i]; - change_point[i] = change_point[i-1]; - change_point[i-1] = change_tmp; - still_changing=1; - } - } - } - - /* create a new bios memory map, removing overlaps */ - overlap_entries=0; /* number of entries in the overlap table */ - new_bios_entry=0; /* index for creating new bios map entries */ - last_type = 0; /* start with undefined memory type */ - last_addr = 0; /* start with 0 as last starting address */ - /* loop through change-points, determining affect on the new bios map */ - for (chgidx=0; chgidx < chg_nr; chgidx++) - { - /* keep track of all overlapping bios entries */ - if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr) - { - /* add map entry to overlap list (> 1 entry implies an overlap) */ - overlap_list[overlap_entries++]=change_point[chgidx]->pbios; - } - else - { - /* remove entry from list (order independent, so swap with last) */ - for (i=0; ipbios) - overlap_list[i] = overlap_list[overlap_entries-1]; - } - overlap_entries--; - } - /* if there are overlapping entries, decide which "type" to use */ - /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */ - current_type = 0; - for (i=0; itype > current_type) - current_type = overlap_list[i]->type; - /* continue building up new bios map based on this information */ - if (current_type != last_type) { - if (last_type != 0) { - new_bios[new_bios_entry].size = - change_point[chgidx]->addr - last_addr; - /* move forward only if the new size was non-zero */ - if (new_bios[new_bios_entry].size != 0) - if (++new_bios_entry >= E820MAX) - break; /* no more space left for new bios entries */ - } - if (current_type != 0) { - new_bios[new_bios_entry].addr = change_point[chgidx]->addr; - new_bios[new_bios_entry].type = current_type; - last_addr=change_point[chgidx]->addr; - } - last_type = current_type; - } - } - new_nr = new_bios_entry; /* retain count for new bios entries */ - - /* copy new bios mapping into original location */ - memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry)); - *pnr_map = new_nr; - - return 0; -} - -/* - * Copy the BIOS e820 map into a safe place. - * - * Sanity-check it while we're at it.. - * - * If we're lucky and live on a modern system, the setup code - * will have given us a memory map that we can use to properly - * set up memory. If we aren't, we'll fake a memory map. - * - * We check to see that the memory map contains at least 2 elements - * before we'll use it, because the detection code in setup.S may - * not be perfect and most every PC known to man has two memory - * regions: one from 0 to 640k, and one from 1mb up. (The IBM - * thinkpad 560x, for example, does not cooperate with the memory - * detection code.) - */ -int __init copy_e820_map(struct e820entry * biosmap, int nr_map) -{ - /* Only one memory region (or negative)? Ignore it */ - if (nr_map < 2) - return -1; - - do { - unsigned long long start = biosmap->addr; - unsigned long long size = biosmap->size; - unsigned long long end = start + size; - unsigned long type = biosmap->type; - - /* Overflow in 64 bits? Ignore the memory map. */ - if (start > end) - return -1; - - /* - * Some BIOSes claim RAM in the 640k - 1M region. - * Not right. Fix it up. - */ - if (type == E820_RAM) { - if (start < 0x100000ULL && end > 0xA0000ULL) { - if (start < 0xA0000ULL) - add_memory_region(start, 0xA0000ULL-start, type); - if (end <= 0x100000ULL) - continue; - start = 0x100000ULL; - size = end - start; - } - } - add_memory_region(start, size, type); - } while (biosmap++,--nr_map); - return 0; -} - #if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) struct edd edd; #ifdef CONFIG_EDD_MODULE -- cgit v1.2.1 From b2dff6a88cbed59d787a8ca7367c76ba385e1187 Mon Sep 17 00:00:00 2001 From: "bibo,mao" Date: Thu, 7 Dec 2006 02:14:06 +0100 Subject: [PATCH] i386: Move find_max_pfn function to e820.c Move more code from setup.c into e820.c Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- arch/i386/kernel/e820.c | 52 +++++++++++++++++++++++++++++++++++++++++++++ arch/i386/kernel/setup.c | 55 ------------------------------------------------ 2 files changed, 52 insertions(+), 55 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/e820.c b/arch/i386/kernel/e820.c index 0db95760b073..be4934f6f85b 100644 --- a/arch/i386/kernel/e820.c +++ b/arch/i386/kernel/e820.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -539,3 +540,54 @@ int __init copy_e820_map(struct e820entry * biosmap, int nr_map) return 0; } +/* + * Callback for efi_memory_walk. + */ +static int __init +efi_find_max_pfn(unsigned long start, unsigned long end, void *arg) +{ + unsigned long *max_pfn = arg, pfn; + + if (start < end) { + pfn = PFN_UP(end -1); + if (pfn > *max_pfn) + *max_pfn = pfn; + } + return 0; +} + +static int __init +efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg) +{ + memory_present(0, PFN_UP(start), PFN_DOWN(end)); + return 0; +} + +/* + * Find the highest page frame number we have available + */ +void __init find_max_pfn(void) +{ + int i; + + max_pfn = 0; + if (efi_enabled) { + efi_memmap_walk(efi_find_max_pfn, &max_pfn); + efi_memmap_walk(efi_memory_present_wrapper, NULL); + return; + } + + for (i = 0; i < e820.nr_map; i++) { + unsigned long start, end; + /* RAM? */ + if (e820.map[i].type != E820_RAM) + continue; + start = PFN_UP(e820.map[i].addr); + end = PFN_DOWN(e820.map[i].addr + e820.map[i].size); + if (start >= end) + continue; + if (end > max_pfn) + max_pfn = end; + memory_present(0, start, end); + } +} diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index b7509aec0eb1..3d808054fdf7 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -63,9 +63,6 @@ #include #include -/* Forward Declaration. */ -void __init find_max_pfn(void); - /* This value is set up by the early boot code to point to the value immediately after the boot time page tables. It contains a *physical* address, and must not be in the .bss segment! */ @@ -387,29 +384,6 @@ static int __init parse_reservetop(char *arg) } early_param("reservetop", parse_reservetop); -/* - * Callback for efi_memory_walk. - */ -static int __init -efi_find_max_pfn(unsigned long start, unsigned long end, void *arg) -{ - unsigned long *max_pfn = arg, pfn; - - if (start < end) { - pfn = PFN_UP(end -1); - if (pfn > *max_pfn) - *max_pfn = pfn; - } - return 0; -} - -static int __init -efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg) -{ - memory_present(0, PFN_UP(start), PFN_DOWN(end)); - return 0; -} - /* * This function checks if the entire range is mapped with type. * @@ -442,35 +416,6 @@ e820_all_mapped(unsigned long s, unsigned long e, unsigned type) return 0; } -/* - * Find the highest page frame number we have available - */ -void __init find_max_pfn(void) -{ - int i; - - max_pfn = 0; - if (efi_enabled) { - efi_memmap_walk(efi_find_max_pfn, &max_pfn); - efi_memmap_walk(efi_memory_present_wrapper, NULL); - return; - } - - for (i = 0; i < e820.nr_map; i++) { - unsigned long start, end; - /* RAM? */ - if (e820.map[i].type != E820_RAM) - continue; - start = PFN_UP(e820.map[i].addr); - end = PFN_DOWN(e820.map[i].addr + e820.map[i].size); - if (start >= end) - continue; - if (end > max_pfn) - max_pfn = end; - memory_present(0, start, end); - } -} - /* * Determine low and high memory ranges: */ -- cgit v1.2.1 From b5b2405706005cc7765f6ecd00965d29e93f090a Mon Sep 17 00:00:00 2001 From: "bibo,mao" Date: Thu, 7 Dec 2006 02:14:06 +0100 Subject: [PATCH] i386: Move e820/efi memmap walking code to e820.c This patch moves e820/efi memmap table walking function from setup.c to e820.c, also this patch adds extern declaration in header file. Signed-off-by: bibo,mao Signed-off-by: Andi Kleen arch/i386/kernel/e820.c | 115 +++++++++++++++++++++++++++++++++ arch/i386/kernel/setup.c | 118 ----------------------------------- include/asm-i386/e820.h | 2 arch/i386/kernel/e820.c | 115 +++++++++++++++++++++++++++++++++++++++++++++ arch/i386/kernel/setup.c | 118 ----------------------------------------------- include/asm-i386/e820.h | 2 3 files changed, 117 insertions(+), 118 deletions(-) --- arch/i386/kernel/e820.c | 115 +++++++++++++++++++++++++++++++++++++++++++++ arch/i386/kernel/setup.c | 118 ----------------------------------------------- 2 files changed, 115 insertions(+), 118 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/e820.c b/arch/i386/kernel/e820.c index be4934f6f85b..47c495bf0cbc 100644 --- a/arch/i386/kernel/e820.c +++ b/arch/i386/kernel/e820.c @@ -28,6 +28,11 @@ static struct change_member change_point_list[2*E820MAX] __initdata; static struct change_member *change_point[2*E820MAX] __initdata; static struct e820entry *overlap_list[E820MAX] __initdata; static struct e820entry new_bios[E820MAX] __initdata; +/* For PCI or other memory-mapped resources */ +unsigned long pci_mem_start = 0x10000000; +#ifdef CONFIG_PCI +EXPORT_SYMBOL(pci_mem_start); +#endif struct resource data_resource = { .name = "Kernel data", .start = 0, @@ -591,3 +596,113 @@ void __init find_max_pfn(void) memory_present(0, start, end); } } + +/* + * Free all available memory for boot time allocation. Used + * as a callback function by efi_memory_walk() + */ + +static int __init +free_available_memory(unsigned long start, unsigned long end, void *arg) +{ + /* check max_low_pfn */ + if (start >= (max_low_pfn << PAGE_SHIFT)) + return 0; + if (end >= (max_low_pfn << PAGE_SHIFT)) + end = max_low_pfn << PAGE_SHIFT; + if (start < end) + free_bootmem(start, end - start); + + return 0; +} +/* + * Register fully available low RAM pages with the bootmem allocator. + */ +void __init register_bootmem_low_pages(unsigned long max_low_pfn) +{ + int i; + + if (efi_enabled) { + efi_memmap_walk(free_available_memory, NULL); + return; + } + for (i = 0; i < e820.nr_map; i++) { + unsigned long curr_pfn, last_pfn, size; + /* + * Reserve usable low memory + */ + if (e820.map[i].type != E820_RAM) + continue; + /* + * We are rounding up the start address of usable memory: + */ + curr_pfn = PFN_UP(e820.map[i].addr); + if (curr_pfn >= max_low_pfn) + continue; + /* + * ... and at the end of the usable range downwards: + */ + last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size); + + if (last_pfn > max_low_pfn) + last_pfn = max_low_pfn; + + /* + * .. finally, did all the rounding and playing + * around just make the area go away? + */ + if (last_pfn <= curr_pfn) + continue; + + size = last_pfn - curr_pfn; + free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size)); + } +} + +void __init register_memory(void) +{ + unsigned long gapstart, gapsize, round; + unsigned long long last; + int i; + + /* + * Search for the bigest gap in the low 32 bits of the e820 + * memory space. + */ + last = 0x100000000ull; + gapstart = 0x10000000; + gapsize = 0x400000; + i = e820.nr_map; + while (--i >= 0) { + unsigned long long start = e820.map[i].addr; + unsigned long long end = start + e820.map[i].size; + + /* + * Since "last" is at most 4GB, we know we'll + * fit in 32 bits if this condition is true + */ + if (last > end) { + unsigned long gap = last - end; + + if (gap > gapsize) { + gapsize = gap; + gapstart = end; + } + } + if (start < last) + last = start; + } + + /* + * See how much we want to round up: start off with + * rounding to the next 1MB area. + */ + round = 0x100000; + while ((gapsize >> 4) > round) + round += round; + /* Fun with two's complement */ + pci_mem_start = (gapstart + round) & -round; + + printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n", + pci_mem_start, gapstart, gapsize); +} diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index 3d808054fdf7..51ed015a1f35 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -94,12 +94,6 @@ unsigned int machine_submodel_id; unsigned int BIOS_revision; unsigned int mca_pentium_flag; -/* For PCI or other memory-mapped resources */ -unsigned long pci_mem_start = 0x10000000; -#ifdef CONFIG_PCI -EXPORT_SYMBOL(pci_mem_start); -#endif - /* Boot loader ID as an integer, for the benefit of proc_dointvec */ int bootloader_type; @@ -475,68 +469,6 @@ unsigned long __init find_max_low_pfn(void) return max_low_pfn; } -/* - * Free all available memory for boot time allocation. Used - * as a callback function by efi_memory_walk() - */ - -static int __init -free_available_memory(unsigned long start, unsigned long end, void *arg) -{ - /* check max_low_pfn */ - if (start >= (max_low_pfn << PAGE_SHIFT)) - return 0; - if (end >= (max_low_pfn << PAGE_SHIFT)) - end = max_low_pfn << PAGE_SHIFT; - if (start < end) - free_bootmem(start, end - start); - - return 0; -} -/* - * Register fully available low RAM pages with the bootmem allocator. - */ -static void __init register_bootmem_low_pages(unsigned long max_low_pfn) -{ - int i; - - if (efi_enabled) { - efi_memmap_walk(free_available_memory, NULL); - return; - } - for (i = 0; i < e820.nr_map; i++) { - unsigned long curr_pfn, last_pfn, size; - /* - * Reserve usable low memory - */ - if (e820.map[i].type != E820_RAM) - continue; - /* - * We are rounding up the start address of usable memory: - */ - curr_pfn = PFN_UP(e820.map[i].addr); - if (curr_pfn >= max_low_pfn) - continue; - /* - * ... and at the end of the usable range downwards: - */ - last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size); - - if (last_pfn > max_low_pfn) - last_pfn = max_low_pfn; - - /* - * .. finally, did all the rounding and playing - * around just make the area go away? - */ - if (last_pfn <= curr_pfn) - continue; - - size = last_pfn - curr_pfn; - free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size)); - } -} - /* * workaround for Dell systems that neglect to reserve EBDA */ @@ -705,56 +637,6 @@ void __init remapped_pgdat_init(void) } } - - -static void __init register_memory(void) -{ - unsigned long gapstart, gapsize, round; - unsigned long long last; - int i; - - /* - * Search for the bigest gap in the low 32 bits of the e820 - * memory space. - */ - last = 0x100000000ull; - gapstart = 0x10000000; - gapsize = 0x400000; - i = e820.nr_map; - while (--i >= 0) { - unsigned long long start = e820.map[i].addr; - unsigned long long end = start + e820.map[i].size; - - /* - * Since "last" is at most 4GB, we know we'll - * fit in 32 bits if this condition is true - */ - if (last > end) { - unsigned long gap = last - end; - - if (gap > gapsize) { - gapsize = gap; - gapstart = end; - } - } - if (start < last) - last = start; - } - - /* - * See how much we want to round up: start off with - * rounding to the next 1MB area. - */ - round = 0x100000; - while ((gapsize >> 4) > round) - round += round; - /* Fun with two's complement */ - pci_mem_start = (gapstart + round) & -round; - - printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n", - pci_mem_start, gapstart, gapsize); -} - #ifdef CONFIG_MCA static void set_mca_bus(int x) { -- cgit v1.2.1 From cef518e88b8ed94ea483c436ef5e5b151a3fabc6 Mon Sep 17 00:00:00 2001 From: "bibo,mao" Date: Thu, 7 Dec 2006 02:14:06 +0100 Subject: [PATCH] i386: Move memory map printing and other code to e820.c This patch moves e820 memory map print and memmap boot param parsing function from setup.c to e820.c, also adds limit_regions and print_memory_map declaration in header file. Signed-off-by: bibo,mao Signed-off-by: Andi Kleen arch/i386/kernel/e820.c | 152 +++++++++++++++++++++++++++ arch/i386/kernel/setup.c | 158 --------------------------------- include/asm-i386/e820.h | 2 arch/i386/kernel/e820.c | 152 ++++++++++++++++++++++++++++++++++++++++++++++ arch/i386/kernel/setup.c | 153 ----------------------------------------------- include/asm-i386/e820.h | 2 3 files changed, 155 insertions(+), 152 deletions(-) --- arch/i386/kernel/e820.c | 152 ++++++++++++++++++++++++++++++++++++++++++++++ arch/i386/kernel/setup.c | 153 +---------------------------------------------- 2 files changed, 153 insertions(+), 152 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/e820.c b/arch/i386/kernel/e820.c index 47c495bf0cbc..b755255f2721 100644 --- a/arch/i386/kernel/e820.c +++ b/arch/i386/kernel/e820.c @@ -33,6 +33,7 @@ unsigned long pci_mem_start = 0x10000000; #ifdef CONFIG_PCI EXPORT_SYMBOL(pci_mem_start); #endif +extern int user_defined_memmap; struct resource data_resource = { .name = "Kernel data", .start = 0, @@ -706,3 +707,154 @@ void __init register_memory(void) printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n", pci_mem_start, gapstart, gapsize); } + +void __init print_memory_map(char *who) +{ + int i; + + for (i = 0; i < e820.nr_map; i++) { + printk(" %s: %016Lx - %016Lx ", who, + e820.map[i].addr, + e820.map[i].addr + e820.map[i].size); + switch (e820.map[i].type) { + case E820_RAM: printk("(usable)\n"); + break; + case E820_RESERVED: + printk("(reserved)\n"); + break; + case E820_ACPI: + printk("(ACPI data)\n"); + break; + case E820_NVS: + printk("(ACPI NVS)\n"); + break; + default: printk("type %lu\n", e820.map[i].type); + break; + } + } +} + +void __init limit_regions(unsigned long long size) +{ + unsigned long long current_addr = 0; + int i; + + print_memory_map("limit_regions start"); + if (efi_enabled) { + efi_memory_desc_t *md; + void *p; + + for (p = memmap.map, i = 0; p < memmap.map_end; + p += memmap.desc_size, i++) { + md = p; + current_addr = md->phys_addr + (md->num_pages << 12); + if (md->type == EFI_CONVENTIONAL_MEMORY) { + if (current_addr >= size) { + md->num_pages -= + (((current_addr-size) + PAGE_SIZE-1) >> PAGE_SHIFT); + memmap.nr_map = i + 1; + return; + } + } + } + } + for (i = 0; i < e820.nr_map; i++) { + current_addr = e820.map[i].addr + e820.map[i].size; + if (current_addr < size) + continue; + + if (e820.map[i].type != E820_RAM) + continue; + + if (e820.map[i].addr >= size) { + /* + * This region starts past the end of the + * requested size, skip it completely. + */ + e820.nr_map = i; + } else { + e820.nr_map = i + 1; + e820.map[i].size -= current_addr - size; + } + print_memory_map("limit_regions endfor"); + return; + } + print_memory_map("limit_regions endfunc"); +} + + /* + * This function checks if the entire range is mapped with type. + * + * Note: this function only works correct if the e820 table is sorted and + * not-overlapping, which is the case + */ +int __init +e820_all_mapped(unsigned long s, unsigned long e, unsigned type) +{ + u64 start = s; + u64 end = e; + int i; + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + if (type && ei->type != type) + continue; + /* is the region (part) in overlap with the current region ?*/ + if (ei->addr >= end || ei->addr + ei->size <= start) + continue; + /* if the region is at the beginning of we move + * start to the end of the region since it's ok until there + */ + if (ei->addr <= start) + start = ei->addr + ei->size; + /* if start is now at or beyond end, we're done, full + * coverage */ + if (start >= end) + return 1; /* we're done */ + } + return 0; +} + +static int __init parse_memmap(char *arg) +{ + if (!arg) + return -EINVAL; + + if (strcmp(arg, "exactmap") == 0) { +#ifdef CONFIG_CRASH_DUMP + /* If we are doing a crash dump, we + * still need to know the real mem + * size before original memory map is + * reset. + */ + find_max_pfn(); + saved_max_pfn = max_pfn; +#endif + e820.nr_map = 0; + user_defined_memmap = 1; + } else { + /* If the user specifies memory size, we + * limit the BIOS-provided memory map to + * that size. exactmap can be used to specify + * the exact map. mem=number can be used to + * trim the existing memory map. + */ + unsigned long long start_at, mem_size; + + mem_size = memparse(arg, &arg); + if (*arg == '@') { + start_at = memparse(arg+1, &arg); + add_memory_region(start_at, mem_size, E820_RAM); + } else if (*arg == '#') { + start_at = memparse(arg+1, &arg); + add_memory_region(start_at, mem_size, E820_ACPI); + } else if (*arg == '$') { + start_at = memparse(arg+1, &arg); + add_memory_region(start_at, mem_size, E820_RESERVED); + } else { + limit_regions(mem_size); + user_defined_memmap = 1; + } + } + return 0; +} +early_param("memmap", parse_memmap); diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index 51ed015a1f35..e5bb87aa5a45 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -73,7 +73,6 @@ int disable_pse __devinitdata = 0; /* * Machine setup.. */ -extern struct e820map e820; extern struct resource code_resource; extern struct resource data_resource; @@ -137,79 +136,6 @@ static char command_line[COMMAND_LINE_SIZE]; unsigned char __initdata boot_params[PARAM_SIZE]; -static void __init limit_regions(unsigned long long size) -{ - unsigned long long current_addr = 0; - int i; - - if (efi_enabled) { - efi_memory_desc_t *md; - void *p; - - for (p = memmap.map, i = 0; p < memmap.map_end; - p += memmap.desc_size, i++) { - md = p; - current_addr = md->phys_addr + (md->num_pages << 12); - if (md->type == EFI_CONVENTIONAL_MEMORY) { - if (current_addr >= size) { - md->num_pages -= - (((current_addr-size) + PAGE_SIZE-1) >> PAGE_SHIFT); - memmap.nr_map = i + 1; - return; - } - } - } - } - for (i = 0; i < e820.nr_map; i++) { - current_addr = e820.map[i].addr + e820.map[i].size; - if (current_addr < size) - continue; - - if (e820.map[i].type != E820_RAM) - continue; - - if (e820.map[i].addr >= size) { - /* - * This region starts past the end of the - * requested size, skip it completely. - */ - e820.nr_map = i; - } else { - e820.nr_map = i + 1; - e820.map[i].size -= current_addr - size; - } - return; - } -} - -#define E820_DEBUG 1 - -static void __init print_memory_map(char *who) -{ - int i; - - for (i = 0; i < e820.nr_map; i++) { - printk(" %s: %016Lx - %016Lx ", who, - e820.map[i].addr, - e820.map[i].addr + e820.map[i].size); - switch (e820.map[i].type) { - case E820_RAM: printk("(usable)\n"); - break; - case E820_RESERVED: - printk("(reserved)\n"); - break; - case E820_ACPI: - printk("(ACPI data)\n"); - break; - case E820_NVS: - printk("(ACPI NVS)\n"); - break; - default: printk("type %lu\n", e820.map[i].type); - break; - } - } -} - #if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) struct edd edd; #ifdef CONFIG_EDD_MODULE @@ -233,7 +159,7 @@ static inline void copy_edd(void) } #endif -static int __initdata user_defined_memmap = 0; +int __initdata user_defined_memmap = 0; /* * "mem=nopentium" disables the 4MB page tables. @@ -270,51 +196,6 @@ static int __init parse_mem(char *arg) } early_param("mem", parse_mem); -static int __init parse_memmap(char *arg) -{ - if (!arg) - return -EINVAL; - - if (strcmp(arg, "exactmap") == 0) { -#ifdef CONFIG_CRASH_DUMP - /* If we are doing a crash dump, we - * still need to know the real mem - * size before original memory map is - * reset. - */ - find_max_pfn(); - saved_max_pfn = max_pfn; -#endif - e820.nr_map = 0; - user_defined_memmap = 1; - } else { - /* If the user specifies memory size, we - * limit the BIOS-provided memory map to - * that size. exactmap can be used to specify - * the exact map. mem=number can be used to - * trim the existing memory map. - */ - unsigned long long start_at, mem_size; - - mem_size = memparse(arg, &arg); - if (*arg == '@') { - start_at = memparse(arg+1, &arg); - add_memory_region(start_at, mem_size, E820_RAM); - } else if (*arg == '#') { - start_at = memparse(arg+1, &arg); - add_memory_region(start_at, mem_size, E820_ACPI); - } else if (*arg == '$') { - start_at = memparse(arg+1, &arg); - add_memory_region(start_at, mem_size, E820_RESERVED); - } else { - limit_regions(mem_size); - user_defined_memmap = 1; - } - } - return 0; -} -early_param("memmap", parse_memmap); - #ifdef CONFIG_PROC_VMCORE /* elfcorehdr= specifies the location of elf core header * stored by the crashed kernel. @@ -378,38 +259,6 @@ static int __init parse_reservetop(char *arg) } early_param("reservetop", parse_reservetop); - /* - * This function checks if the entire range is mapped with type. - * - * Note: this function only works correct if the e820 table is sorted and - * not-overlapping, which is the case - */ -int __init -e820_all_mapped(unsigned long s, unsigned long e, unsigned type) -{ - u64 start = s; - u64 end = e; - int i; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - if (type && ei->type != type) - continue; - /* is the region (part) in overlap with the current region ?*/ - if (ei->addr >= end || ei->addr + ei->size <= start) - continue; - /* if the region is at the beginning of we move - * start to the end of the region since it's ok until there - */ - if (ei->addr <= start) - start = ei->addr + ei->size; - /* if start is now at or beyond end, we're done, full - * coverage */ - if (start >= end) - return 1; /* we're done */ - } - return 0; -} - /* * Determine low and high memory ranges: */ -- cgit v1.2.1 From d15512f442ef1ea60f6195b0444fb27b3cf8d0e6 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 7 Dec 2006 02:14:07 +0100 Subject: [PATCH] i386: Fix race in IO-APIC routing entry setup. Interrupt could happen between setting the IO-APIC entry and setting its interrupt data. Pointed out by Linus. Signed-off-by: Andi Kleen --- arch/i386/kernel/io_apic.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c index 3b7a63e0ed1a..e33b7a845299 100644 --- a/arch/i386/kernel/io_apic.c +++ b/arch/i386/kernel/io_apic.c @@ -153,14 +153,20 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) * the interrupt, and we need to make sure the entry is fully populated * before that happens. */ -static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) +static void +__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) { - unsigned long flags; union entry_union eu; eu.entry = e; - spin_lock_irqsave(&ioapic_lock, flags); io_apic_write(apic, 0x11 + 2*pin, eu.w2); io_apic_write(apic, 0x10 + 2*pin, eu.w1); +} + +static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) +{ + unsigned long flags; + spin_lock_irqsave(&ioapic_lock, flags); + __ioapic_write_entry(apic, pin, e); spin_unlock_irqrestore(&ioapic_lock, flags); } @@ -1360,8 +1366,8 @@ static void __init setup_IO_APIC_irqs(void) if (!apic && (irq < 16)) disable_8259A_irq(irq); } - ioapic_write_entry(apic, pin, entry); spin_lock_irqsave(&ioapic_lock, flags); + __ioapic_write_entry(apic, pin, entry); set_native_irq_info(irq, TARGET_CPUS); spin_unlock_irqrestore(&ioapic_lock, flags); } @@ -2856,8 +2862,8 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a if (!ioapic && (irq < 16)) disable_8259A_irq(irq); - ioapic_write_entry(ioapic, pin, entry); spin_lock_irqsave(&ioapic_lock, flags); + __ioapic_write_entry(ioapic, pin, entry); set_native_irq_info(irq, TARGET_CPUS); spin_unlock_irqrestore(&ioapic_lock, flags); -- cgit v1.2.1 From 8c89812684de3b47066d800031dfd7098abbdc74 Mon Sep 17 00:00:00 2001 From: Chuck Ebbert <76306.1226@compuserve.com> Date: Thu, 7 Dec 2006 02:14:07 +0100 Subject: [PATCH] i386: remove IOPL check on task switch IOPL is implicitly saved and restored on task switch, so explicit check is no longer needed. Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com> Signed-off-by: Andi Kleen --- arch/i386/kernel/process.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index 8f42659ef9d2..99308510a17c 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -674,12 +674,6 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas write_pda(pcurrent, next_p); - /* - * Restore IOPL if needed. - */ - if (unlikely(prev->iopl != next->iopl)) - set_iopl_mask(next->iopl); - /* * Now maybe handle debug registers and/or IO bitmaps */ -- cgit v1.2.1 From 3529833f1ca8790df06ce218b5d9d438776696ed Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 7 Dec 2006 02:14:07 +0100 Subject: [PATCH] i386: substitute __va lookup with pfn_to_kaddr Substitutes allocate_pgdat virtual address lookup with pfn_to_kaddr macro. Signed-off-by: David Rientjes Signed-off-by: Andi Kleen Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/i386/mm/discontig.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/i386') diff --git a/arch/i386/mm/discontig.c b/arch/i386/mm/discontig.c index ddbdb0336f28..103b76e56a94 100644 --- a/arch/i386/mm/discontig.c +++ b/arch/i386/mm/discontig.c @@ -168,7 +168,7 @@ static void __init allocate_pgdat(int nid) if (nid && node_has_online_mem(nid)) NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid]; else { - NODE_DATA(nid) = (pg_data_t *)(__va(min_low_pfn << PAGE_SHIFT)); + NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(min_low_pfn)); min_low_pfn += PFN_UP(sizeof(pg_data_t)); } } -- cgit v1.2.1 From db91b882aabd0b3b55a87cbfb344f2798bb740b4 Mon Sep 17 00:00:00 2001 From: Nicolas Kaiser Date: Thu, 7 Dec 2006 02:14:07 +0100 Subject: [PATCH] i386: Fix double #includes in arch/i386 Fix double #includes in arch/i386 Signed-off-by: Nicolas Kaiser Signed-off-by: Andi Kleen --- arch/i386/kernel/cpuid.c | 1 - arch/i386/kernel/tsc.c | 1 - 2 files changed, 2 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/cpuid.c b/arch/i386/kernel/cpuid.c index ab0c327e79dc..5c5d4507ee7d 100644 --- a/arch/i386/kernel/cpuid.c +++ b/arch/i386/kernel/cpuid.c @@ -34,7 +34,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c index fbc95828cd74..7f22e03253e2 100644 --- a/arch/i386/kernel/tsc.c +++ b/arch/i386/kernel/tsc.c @@ -13,7 +13,6 @@ #include #include -#include #include #include "mach_timer.h" -- cgit v1.2.1 From d3561b7fa0fb0fc583bab0eeda32bec9e4c4056d Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 7 Dec 2006 02:14:07 +0100 Subject: [PATCH] paravirt: header and stubs for paravirtualisation Create a paravirt.h header for all the critical operations which need to be replaced with hypervisor calls, and include that instead of defining native operations, when CONFIG_PARAVIRT. This patch does the dumbest possible replacement of paravirtualized instructions: calls through a "paravirt_ops" structure. Currently these are function implementations of native hardware: hypervisors will override the ops structure with their own variants. All the pv-ops functions are declared "fastcall" so that a specific register-based ABI is used, to make inlining assember easier. And: +From: Andy Whitcroft The paravirt ops introduce a 'weak' attribute onto memory_setup(). Code ordering leads to the following warnings on x86: arch/i386/kernel/setup.c:651: warning: weak declaration of `memory_setup' after first use results in unspecified behavior Move memory_setup() to avoid this. Signed-off-by: Rusty Russell Signed-off-by: Chris Wright Signed-off-by: Andi Kleen Cc: Jeremy Fitzhardinge Cc: Zachary Amsden Signed-off-by: Andrew Morton Signed-off-by: Andy Whitcroft --- arch/i386/Kconfig | 11 ++ arch/i386/boot/compressed/misc.c | 1 + arch/i386/kernel/Makefile | 1 + arch/i386/kernel/asm-offsets.c | 10 + arch/i386/kernel/entry.S | 34 +++- arch/i386/kernel/i8259.c | 5 +- arch/i386/kernel/paravirt.c | 404 +++++++++++++++++++++++++++++++++++++++ arch/i386/kernel/setup.c | 8 +- arch/i386/kernel/smpboot.c | 5 + arch/i386/kernel/time.c | 15 +- arch/i386/power/cpu.c | 8 +- 11 files changed, 479 insertions(+), 23 deletions(-) create mode 100644 arch/i386/kernel/paravirt.c (limited to 'arch/i386') diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index 1f0f7b60995e..bb1fa061c6cf 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -182,6 +182,17 @@ config X86_ES7000 endchoice +config PARAVIRT + bool "Paravirtualization support (EXPERIMENTAL)" + depends on EXPERIMENTAL + help + Paravirtualization is a way of running multiple instances of + Linux on the same machine, under a hypervisor. This option + changes the kernel so it can modify itself when it is run + under a hypervisor, improving performance significantly. + However, when run without a hypervisor the kernel is + theoretically slower. If in doubt, say N. + config ACPI_SRAT bool default y diff --git a/arch/i386/boot/compressed/misc.c b/arch/i386/boot/compressed/misc.c index dc1538931555..c6798c75c67d 100644 --- a/arch/i386/boot/compressed/misc.c +++ b/arch/i386/boot/compressed/misc.c @@ -9,6 +9,7 @@ * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 */ +#undef CONFIG_PARAVIRT #include #include #include diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile index f614854bd71f..406612136049 100644 --- a/arch/i386/kernel/Makefile +++ b/arch/i386/kernel/Makefile @@ -39,6 +39,7 @@ obj-$(CONFIG_VM86) += vm86.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.o obj-$(CONFIG_HPET_TIMER) += hpet.o obj-$(CONFIG_K8_NB) += k8.o +obj-$(CONFIG_PARAVIRT) += paravirt.o EXTRA_AFLAGS := -traditional diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c index 0666eb0ed7bc..1b2f3cd33270 100644 --- a/arch/i386/kernel/asm-offsets.c +++ b/arch/i386/kernel/asm-offsets.c @@ -101,4 +101,14 @@ void foo(void) BLANK(); OFFSET(PDA_cpu, i386_pda, cpu_number); OFFSET(PDA_pcurrent, i386_pda, pcurrent); + +#ifdef CONFIG_PARAVIRT + BLANK(); + OFFSET(PARAVIRT_enabled, paravirt_ops, paravirt_enabled); + OFFSET(PARAVIRT_irq_disable, paravirt_ops, irq_disable); + OFFSET(PARAVIRT_irq_enable, paravirt_ops, irq_enable); + OFFSET(PARAVIRT_irq_enable_sysexit, paravirt_ops, irq_enable_sysexit); + OFFSET(PARAVIRT_iret, paravirt_ops, iret); + OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0); +#endif } diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index 0220bc8cbb43..d274612e05cd 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -62,13 +62,6 @@ DF_MASK = 0x00000400 NT_MASK = 0x00004000 VM_MASK = 0x00020000 -/* These are replaces for paravirtualization */ -#define DISABLE_INTERRUPTS cli -#define ENABLE_INTERRUPTS sti -#define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit -#define INTERRUPT_RETURN iret -#define GET_CR0_INTO_EAX movl %cr0, %eax - #ifdef CONFIG_PREEMPT #define preempt_stop DISABLE_INTERRUPTS; TRACE_IRQS_OFF #else @@ -416,6 +409,20 @@ ldt_ss: jnz restore_nocheck testl $0x00400000, %eax # returning to 32bit stack? jnz restore_nocheck # allright, normal return + +#ifdef CONFIG_PARAVIRT + /* + * The kernel can't run on a non-flat stack if paravirt mode + * is active. Rather than try to fixup the high bits of + * ESP, bypass this code entirely. This may break DOSemu + * and/or Wine support in a paravirt VM, although the option + * is still available to implement the setting of the high + * 16-bits in the INTERRUPT_RETURN paravirt-op. + */ + cmpl $0, paravirt_ops+PARAVIRT_enabled + jne restore_nocheck +#endif + /* If returning to userspace with 16bit stack, * try to fix the higher word of ESP, as the CPU * won't restore it. @@ -833,6 +840,19 @@ nmi_espfix_stack: .previous KPROBE_END(nmi) +#ifdef CONFIG_PARAVIRT +ENTRY(native_iret) +1: iret +.section __ex_table,"a" + .align 4 + .long 1b,iret_exc +.previous + +ENTRY(native_irq_enable_sysexit) + sti + sysexit +#endif + KPROBE_ENTRY(int3) RING0_INT_FRAME pushl $-1 # mark this as an int diff --git a/arch/i386/kernel/i8259.c b/arch/i386/kernel/i8259.c index 62996cd17084..c8d45821c788 100644 --- a/arch/i386/kernel/i8259.c +++ b/arch/i386/kernel/i8259.c @@ -381,7 +381,10 @@ void __init init_ISA_irqs (void) } } -void __init init_IRQ(void) +/* Overridden in paravirt.c */ +void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); + +void __init native_init_IRQ(void) { int i; diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c new file mode 100644 index 000000000000..478192cd4b90 --- /dev/null +++ b/arch/i386/kernel/paravirt.c @@ -0,0 +1,404 @@ +/* Paravirtualization interfaces + Copyright (C) 2006 Rusty Russell IBM Corporation + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +*/ +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +/* nop stub */ +static void native_nop(void) +{ +} + +static void __init default_banner(void) +{ + printk(KERN_INFO "Booting paravirtualized kernel on %s\n", + paravirt_ops.name); +} + +char *memory_setup(void) +{ + return paravirt_ops.memory_setup(); +} + +static fastcall unsigned long native_get_debugreg(int regno) +{ + unsigned long val = 0; /* Damn you, gcc! */ + + switch (regno) { + case 0: + asm("movl %%db0, %0" :"=r" (val)); break; + case 1: + asm("movl %%db1, %0" :"=r" (val)); break; + case 2: + asm("movl %%db2, %0" :"=r" (val)); break; + case 3: + asm("movl %%db3, %0" :"=r" (val)); break; + case 6: + asm("movl %%db6, %0" :"=r" (val)); break; + case 7: + asm("movl %%db7, %0" :"=r" (val)); break; + default: + BUG(); + } + return val; +} + +static fastcall void native_set_debugreg(int regno, unsigned long value) +{ + switch (regno) { + case 0: + asm("movl %0,%%db0" : /* no output */ :"r" (value)); + break; + case 1: + asm("movl %0,%%db1" : /* no output */ :"r" (value)); + break; + case 2: + asm("movl %0,%%db2" : /* no output */ :"r" (value)); + break; + case 3: + asm("movl %0,%%db3" : /* no output */ :"r" (value)); + break; + case 6: + asm("movl %0,%%db6" : /* no output */ :"r" (value)); + break; + case 7: + asm("movl %0,%%db7" : /* no output */ :"r" (value)); + break; + default: + BUG(); + } +} + +void init_IRQ(void) +{ + paravirt_ops.init_IRQ(); +} + +static fastcall void native_clts(void) +{ + asm volatile ("clts"); +} + +static fastcall unsigned long native_read_cr0(void) +{ + unsigned long val; + asm volatile("movl %%cr0,%0\n\t" :"=r" (val)); + return val; +} + +static fastcall void native_write_cr0(unsigned long val) +{ + asm volatile("movl %0,%%cr0": :"r" (val)); +} + +static fastcall unsigned long native_read_cr2(void) +{ + unsigned long val; + asm volatile("movl %%cr2,%0\n\t" :"=r" (val)); + return val; +} + +static fastcall void native_write_cr2(unsigned long val) +{ + asm volatile("movl %0,%%cr2": :"r" (val)); +} + +static fastcall unsigned long native_read_cr3(void) +{ + unsigned long val; + asm volatile("movl %%cr3,%0\n\t" :"=r" (val)); + return val; +} + +static fastcall void native_write_cr3(unsigned long val) +{ + asm volatile("movl %0,%%cr3": :"r" (val)); +} + +static fastcall unsigned long native_read_cr4(void) +{ + unsigned long val; + asm volatile("movl %%cr4,%0\n\t" :"=r" (val)); + return val; +} + +static fastcall unsigned long native_read_cr4_safe(void) +{ + unsigned long val; + /* This could fault if %cr4 does not exist */ + asm("1: movl %%cr4, %0 \n" + "2: \n" + ".section __ex_table,\"a\" \n" + ".long 1b,2b \n" + ".previous \n" + : "=r" (val): "0" (0)); + return val; +} + +static fastcall void native_write_cr4(unsigned long val) +{ + asm volatile("movl %0,%%cr4": :"r" (val)); +} + +static fastcall unsigned long native_save_fl(void) +{ + unsigned long f; + asm volatile("pushfl ; popl %0":"=g" (f): /* no input */); + return f; +} + +static fastcall void native_restore_fl(unsigned long f) +{ + asm volatile("pushl %0 ; popfl": /* no output */ + :"g" (f) + :"memory", "cc"); +} + +static fastcall void native_irq_disable(void) +{ + asm volatile("cli": : :"memory"); +} + +static fastcall void native_irq_enable(void) +{ + asm volatile("sti": : :"memory"); +} + +static fastcall void native_safe_halt(void) +{ + asm volatile("sti; hlt": : :"memory"); +} + +static fastcall void native_halt(void) +{ + asm volatile("hlt": : :"memory"); +} + +static fastcall void native_wbinvd(void) +{ + asm volatile("wbinvd": : :"memory"); +} + +static fastcall unsigned long long native_read_msr(unsigned int msr, int *err) +{ + unsigned long long val; + + asm volatile("2: rdmsr ; xorl %0,%0\n" + "1:\n\t" + ".section .fixup,\"ax\"\n\t" + "3: movl %3,%0 ; jmp 1b\n\t" + ".previous\n\t" + ".section __ex_table,\"a\"\n" + " .align 4\n\t" + " .long 2b,3b\n\t" + ".previous" + : "=r" (*err), "=A" (val) + : "c" (msr), "i" (-EFAULT)); + + return val; +} + +static fastcall int native_write_msr(unsigned int msr, unsigned long long val) +{ + int err; + asm volatile("2: wrmsr ; xorl %0,%0\n" + "1:\n\t" + ".section .fixup,\"ax\"\n\t" + "3: movl %4,%0 ; jmp 1b\n\t" + ".previous\n\t" + ".section __ex_table,\"a\"\n" + " .align 4\n\t" + " .long 2b,3b\n\t" + ".previous" + : "=a" (err) + : "c" (msr), "0" ((u32)val), "d" ((u32)(val>>32)), + "i" (-EFAULT)); + return err; +} + +static fastcall unsigned long long native_read_tsc(void) +{ + unsigned long long val; + asm volatile("rdtsc" : "=A" (val)); + return val; +} + +static fastcall unsigned long long native_read_pmc(void) +{ + unsigned long long val; + asm volatile("rdpmc" : "=A" (val)); + return val; +} + +static fastcall void native_load_tr_desc(void) +{ + asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8)); +} + +static fastcall void native_load_gdt(const struct Xgt_desc_struct *dtr) +{ + asm volatile("lgdt %0"::"m" (*dtr)); +} + +static fastcall void native_load_idt(const struct Xgt_desc_struct *dtr) +{ + asm volatile("lidt %0"::"m" (*dtr)); +} + +static fastcall void native_store_gdt(struct Xgt_desc_struct *dtr) +{ + asm ("sgdt %0":"=m" (*dtr)); +} + +static fastcall void native_store_idt(struct Xgt_desc_struct *dtr) +{ + asm ("sidt %0":"=m" (*dtr)); +} + +static fastcall unsigned long native_store_tr(void) +{ + unsigned long tr; + asm ("str %0":"=r" (tr)); + return tr; +} + +static fastcall void native_load_tls(struct thread_struct *t, unsigned int cpu) +{ +#define C(i) get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i] + C(0); C(1); C(2); +#undef C +} + +static inline void native_write_dt_entry(void *dt, int entry, u32 entry_low, u32 entry_high) +{ + u32 *lp = (u32 *)((char *)dt + entry*8); + lp[0] = entry_low; + lp[1] = entry_high; +} + +static fastcall void native_write_ldt_entry(void *dt, int entrynum, u32 low, u32 high) +{ + native_write_dt_entry(dt, entrynum, low, high); +} + +static fastcall void native_write_gdt_entry(void *dt, int entrynum, u32 low, u32 high) +{ + native_write_dt_entry(dt, entrynum, low, high); +} + +static fastcall void native_write_idt_entry(void *dt, int entrynum, u32 low, u32 high) +{ + native_write_dt_entry(dt, entrynum, low, high); +} + +static fastcall void native_load_esp0(struct tss_struct *tss, + struct thread_struct *thread) +{ + tss->esp0 = thread->esp0; + + /* This can only happen when SEP is enabled, no need to test "SEP"arately */ + if (unlikely(tss->ss1 != thread->sysenter_cs)) { + tss->ss1 = thread->sysenter_cs; + wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); + } +} + +static fastcall void native_io_delay(void) +{ + asm volatile("outb %al,$0x80"); +} + +/* These are in entry.S */ +extern fastcall void native_iret(void); +extern fastcall void native_irq_enable_sysexit(void); + +static int __init print_banner(void) +{ + paravirt_ops.banner(); + return 0; +} +core_initcall(print_banner); + +struct paravirt_ops paravirt_ops = { + .name = "bare hardware", + .paravirt_enabled = 0, + .kernel_rpl = 0, + + .banner = default_banner, + .arch_setup = native_nop, + .memory_setup = machine_specific_memory_setup, + .get_wallclock = native_get_wallclock, + .set_wallclock = native_set_wallclock, + .time_init = time_init_hook, + .init_IRQ = native_init_IRQ, + + .cpuid = native_cpuid, + .get_debugreg = native_get_debugreg, + .set_debugreg = native_set_debugreg, + .clts = native_clts, + .read_cr0 = native_read_cr0, + .write_cr0 = native_write_cr0, + .read_cr2 = native_read_cr2, + .write_cr2 = native_write_cr2, + .read_cr3 = native_read_cr3, + .write_cr3 = native_write_cr3, + .read_cr4 = native_read_cr4, + .read_cr4_safe = native_read_cr4_safe, + .write_cr4 = native_write_cr4, + .save_fl = native_save_fl, + .restore_fl = native_restore_fl, + .irq_disable = native_irq_disable, + .irq_enable = native_irq_enable, + .safe_halt = native_safe_halt, + .halt = native_halt, + .wbinvd = native_wbinvd, + .read_msr = native_read_msr, + .write_msr = native_write_msr, + .read_tsc = native_read_tsc, + .read_pmc = native_read_pmc, + .load_tr_desc = native_load_tr_desc, + .set_ldt = native_set_ldt, + .load_gdt = native_load_gdt, + .load_idt = native_load_idt, + .store_gdt = native_store_gdt, + .store_idt = native_store_idt, + .store_tr = native_store_tr, + .load_tls = native_load_tls, + .write_ldt_entry = native_write_ldt_entry, + .write_gdt_entry = native_write_gdt_entry, + .write_idt_entry = native_write_idt_entry, + .load_esp0 = native_load_esp0, + + .set_iopl_mask = native_set_iopl_mask, + .io_delay = native_io_delay, + .const_udelay = __const_udelay, + + .irq_enable_sysexit = native_irq_enable_sysexit, + .iret = native_iret, +}; +EXPORT_SYMBOL(paravirt_ops); diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index e5bb87aa5a45..695d53fd14de 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -495,6 +495,12 @@ static void set_mca_bus(int x) static void set_mca_bus(int x) { } #endif +/* Overridden in paravirt.c if CONFIG_PARAVIRT */ +char * __attribute__((weak)) memory_setup(void) +{ + return machine_specific_memory_setup(); +} + /* * Determine if we were loaded by an EFI loader. If so, then we have also been * passed the efi memmap, systab, etc., so we should use these data structures @@ -547,7 +553,7 @@ void __init setup_arch(char **cmdline_p) efi_init(); else { printk(KERN_INFO "BIOS-provided physical RAM map:\n"); - print_memory_map(machine_specific_memory_setup()); + print_memory_map(memory_setup()); } copy_edd(); diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index 095636620fa2..cd7de9c9654b 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -33,6 +33,11 @@ * Dave Jones : Report invalid combinations of Athlon CPUs. * Rusty Russell : Hacked into shape for new "hotplug" boot process. */ + +/* SMP boot always wants to use real time delay to allow sufficient time for + * the APs to come online */ +#define USE_REAL_TIME_DELAY + #include #include #include diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c index 78af572fd17c..c505b16c0990 100644 --- a/arch/i386/kernel/time.c +++ b/arch/i386/kernel/time.c @@ -56,6 +56,7 @@ #include #include #include +#include #include "mach_time.h" @@ -116,10 +117,7 @@ static int set_rtc_mmss(unsigned long nowtime) /* gets recalled with irq locally disabled */ /* XXX - does irqsave resolve this? -johnstul */ spin_lock_irqsave(&rtc_lock, flags); - if (efi_enabled) - retval = efi_set_rtc_mmss(nowtime); - else - retval = mach_set_rtc_mmss(nowtime); + retval = set_wallclock(nowtime); spin_unlock_irqrestore(&rtc_lock, flags); return retval; @@ -223,10 +221,7 @@ unsigned long get_cmos_time(void) spin_lock_irqsave(&rtc_lock, flags); - if (efi_enabled) - retval = efi_get_time(); - else - retval = mach_get_cmos_time(); + retval = get_wallclock(); spin_unlock_irqrestore(&rtc_lock, flags); @@ -370,7 +365,7 @@ static void __init hpet_time_init(void) printk("Using HPET for base-timer\n"); } - time_init_hook(); + do_time_init(); } #endif @@ -392,5 +387,5 @@ void __init time_init(void) do_settimeofday(&ts); - time_init_hook(); + do_time_init(); } diff --git a/arch/i386/power/cpu.c b/arch/i386/power/cpu.c index 5a1abeff033b..2c15500f8713 100644 --- a/arch/i386/power/cpu.c +++ b/arch/i386/power/cpu.c @@ -26,8 +26,8 @@ void __save_processor_state(struct saved_context *ctxt) /* * descriptor tables */ - store_gdt(&ctxt->gdt_limit); - store_idt(&ctxt->idt_limit); + store_gdt(&ctxt->gdt); + store_idt(&ctxt->idt); store_tr(ctxt->tr); /* @@ -99,8 +99,8 @@ void __restore_processor_state(struct saved_context *ctxt) * now restore the descriptor tables to their proper values * ltr is done i fix_processor_context(). */ - load_gdt(&ctxt->gdt_limit); - load_idt(&ctxt->idt_limit); + load_gdt(&ctxt->gdt); + load_idt(&ctxt->idt); /* * segment registers -- cgit v1.2.1 From 139ec7c416248b9ea227d21839235344edfee1e0 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 7 Dec 2006 02:14:08 +0100 Subject: [PATCH] paravirt: Patch inline replacements for paravirt intercepts It turns out that the most called ops, by several orders of magnitude, are the interrupt manipulation ops. These are obvious candidates for patching, so mark them up and create infrastructure for it. The method used is that the ops structure has a patch function, which is called for each place which needs to be patched: this returns a number of instructions (the rest are NOP-padded). Usually we can spare a register (%eax) for the binary patched code to use, but in a couple of critical places in entry.S we can't: we make the clobbers explicit at the call site, and manually clobber the allowed registers in debug mode as an extra check. And: Don't abuse CONFIG_DEBUG_KERNEL, add CONFIG_DEBUG_PARAVIRT. And: AK: Fix warnings in x86-64 alternative.c build And: AK: Fix compilation with defconfig And: ^From: Andrew Morton Some binutlises still like to emit references to __stop_parainstructions and __start_parainstructions. And: AK: Fix warnings about unused variables when PARAVIRT is disabled. Signed-off-by: Rusty Russell Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Chris Wright Signed-off-by: Zachary Amsden Signed-off-by: Andi Kleen Signed-off-by: Andrew Morton --- arch/i386/Kconfig.debug | 10 +++++++ arch/i386/kernel/alternative.c | 63 ++++++++++++++++++++++++++++++++++-------- arch/i386/kernel/entry.S | 39 +++++++++++++++++--------- arch/i386/kernel/module.c | 11 +++++++- arch/i386/kernel/paravirt.c | 44 +++++++++++++++++++++++++++++ arch/i386/kernel/vmlinux.lds.S | 6 ++++ 6 files changed, 148 insertions(+), 25 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/Kconfig.debug b/arch/i386/Kconfig.debug index b31c0802e1cc..f68cc6f215f8 100644 --- a/arch/i386/Kconfig.debug +++ b/arch/i386/Kconfig.debug @@ -85,4 +85,14 @@ config DOUBLEFAULT option saves about 4k and might cause you much additional grey hair. +config DEBUG_PARAVIRT + bool "Enable some paravirtualization debugging" + default y + depends on PARAVIRT && DEBUG_KERNEL + help + Currently deliberately clobbers regs which are allowed to be + clobbered in inlined paravirt hooks, even in native mode. + If turning this off solves a problem, then DISABLE_INTERRUPTS() or + ENABLE_INTERRUPTS() is lying about what registers can be clobbered. + endmenu diff --git a/arch/i386/kernel/alternative.c b/arch/i386/kernel/alternative.c index 535f9794fba1..9eca21b49f6b 100644 --- a/arch/i386/kernel/alternative.c +++ b/arch/i386/kernel/alternative.c @@ -124,6 +124,20 @@ static unsigned char** find_nop_table(void) #endif /* CONFIG_X86_64 */ +static void nop_out(void *insns, unsigned int len) +{ + unsigned char **noptable = find_nop_table(); + + while (len > 0) { + unsigned int noplen = len; + if (noplen > ASM_NOP_MAX) + noplen = ASM_NOP_MAX; + memcpy(insns, noptable[noplen], noplen); + insns += noplen; + len -= noplen; + } +} + extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; extern struct alt_instr __smp_alt_instructions[], __smp_alt_instructions_end[]; extern u8 *__smp_locks[], *__smp_locks_end[]; @@ -138,10 +152,9 @@ extern u8 __smp_alt_begin[], __smp_alt_end[]; void apply_alternatives(struct alt_instr *start, struct alt_instr *end) { - unsigned char **noptable = find_nop_table(); struct alt_instr *a; u8 *instr; - int diff, i, k; + int diff; DPRINTK("%s: alt table %p -> %p\n", __FUNCTION__, start, end); for (a = start; a < end; a++) { @@ -159,13 +172,7 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end) #endif memcpy(instr, a->replacement, a->replacementlen); diff = a->instrlen - a->replacementlen; - /* Pad the rest with nops */ - for (i = a->replacementlen; diff > 0; diff -= k, i += k) { - k = diff; - if (k > ASM_NOP_MAX) - k = ASM_NOP_MAX; - memcpy(a->instr + i, noptable[k], k); - } + nop_out(instr + a->replacementlen, diff); } } @@ -209,7 +216,6 @@ static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end) static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end) { - unsigned char **noptable = find_nop_table(); u8 **ptr; for (ptr = start; ptr < end; ptr++) { @@ -217,7 +223,7 @@ static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end continue; if (*ptr > text_end) continue; - **ptr = noptable[1][0]; + nop_out(*ptr, 1); }; } @@ -343,6 +349,40 @@ void alternatives_smp_switch(int smp) #endif +#ifdef CONFIG_PARAVIRT +void apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end) +{ + struct paravirt_patch *p; + + for (p = start; p < end; p++) { + unsigned int used; + + used = paravirt_ops.patch(p->instrtype, p->clobbers, p->instr, + p->len); +#ifdef CONFIG_DEBUG_PARAVIRT + { + int i; + /* Deliberately clobber regs using "not %reg" to find bugs. */ + for (i = 0; i < 3; i++) { + if (p->len - used >= 2 && (p->clobbers & (1 << i))) { + memcpy(p->instr + used, "\xf7\xd0", 2); + p->instr[used+1] |= i; + used += 2; + } + } + } +#endif + /* Pad the rest with nops */ + nop_out(p->instr + used, p->len - used); + } + + /* Sync to be conservative, in case we patched following instructions */ + sync_core(); +} +extern struct paravirt_patch __start_parainstructions[], + __stop_parainstructions[]; +#endif /* CONFIG_PARAVIRT */ + void __init alternative_instructions(void) { unsigned long flags; @@ -390,5 +430,6 @@ void __init alternative_instructions(void) alternatives_smp_switch(0); } #endif + apply_paravirt(__start_parainstructions, __stop_parainstructions); local_irq_restore(flags); } diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index d274612e05cd..de34b7fed3c1 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -53,6 +53,19 @@ #include #include "irq_vectors.h" +/* + * We use macros for low-level operations which need to be overridden + * for paravirtualization. The following will never clobber any registers: + * INTERRUPT_RETURN (aka. "iret") + * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax") + * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit"). + * + * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must + * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY). + * Allowing a register to be clobbered can shrink the paravirt replacement + * enough to patch inline, increasing performance. + */ + #define nr_syscalls ((syscall_table_size)/4) CF_MASK = 0x00000001 @@ -63,9 +76,9 @@ NT_MASK = 0x00004000 VM_MASK = 0x00020000 #ifdef CONFIG_PREEMPT -#define preempt_stop DISABLE_INTERRUPTS; TRACE_IRQS_OFF +#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF #else -#define preempt_stop +#define preempt_stop(clobbers) #define resume_kernel restore_nocheck #endif @@ -226,7 +239,7 @@ ENTRY(ret_from_fork) ALIGN RING0_PTREGS_FRAME ret_from_exception: - preempt_stop + preempt_stop(CLBR_ANY) ret_from_intr: GET_THREAD_INFO(%ebp) check_userspace: @@ -237,7 +250,7 @@ check_userspace: jb resume_kernel # not returning to v8086 or userspace ENTRY(resume_userspace) - DISABLE_INTERRUPTS # make sure we don't miss an interrupt + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret movl TI_flags(%ebp), %ecx @@ -248,7 +261,7 @@ ENTRY(resume_userspace) #ifdef CONFIG_PREEMPT ENTRY(resume_kernel) - DISABLE_INTERRUPTS + DISABLE_INTERRUPTS(CLBR_ANY) cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? jnz restore_nocheck need_resched: @@ -277,7 +290,7 @@ sysenter_past_esp: * No need to follow this irqs on/off section: the syscall * disabled irqs and here we enable it straight after entry: */ - ENABLE_INTERRUPTS + ENABLE_INTERRUPTS(CLBR_NONE) pushl $(__USER_DS) CFI_ADJUST_CFA_OFFSET 4 /*CFI_REL_OFFSET ss, 0*/ @@ -322,7 +335,7 @@ sysenter_past_esp: jae syscall_badsys call *sys_call_table(,%eax,4) movl %eax,PT_EAX(%esp) - DISABLE_INTERRUPTS + DISABLE_INTERRUPTS(CLBR_ECX|CLBR_EDX) TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx testw $_TIF_ALLWORK_MASK, %cx @@ -364,7 +377,7 @@ syscall_call: call *sys_call_table(,%eax,4) movl %eax,PT_EAX(%esp) # store the return value syscall_exit: - DISABLE_INTERRUPTS # make sure we don't miss an interrupt + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret TRACE_IRQS_OFF @@ -393,7 +406,7 @@ restore_nocheck_notrace: .section .fixup,"ax" iret_exc: TRACE_IRQS_ON - ENABLE_INTERRUPTS + ENABLE_INTERRUPTS(CLBR_NONE) pushl $0 # no error code pushl $do_iret_error jmp error_code @@ -436,7 +449,7 @@ ldt_ss: CFI_ADJUST_CFA_OFFSET 4 pushl %eax CFI_ADJUST_CFA_OFFSET 4 - DISABLE_INTERRUPTS + DISABLE_INTERRUPTS(CLBR_EAX) TRACE_IRQS_OFF lss (%esp), %esp CFI_ADJUST_CFA_OFFSET -8 @@ -451,7 +464,7 @@ work_pending: jz work_notifysig work_resched: call schedule - DISABLE_INTERRUPTS # make sure we don't miss an interrupt + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret TRACE_IRQS_OFF @@ -509,7 +522,7 @@ syscall_exit_work: testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl jz work_pending TRACE_IRQS_ON - ENABLE_INTERRUPTS # could let do_syscall_trace() call + ENABLE_INTERRUPTS(CLBR_ANY) # could let do_syscall_trace() call # schedule() instead movl %esp, %eax movl $1, %edx @@ -693,7 +706,7 @@ ENTRY(device_not_available) GET_CR0_INTO_EAX testl $0x4, %eax # EM (math emulation bit) jne device_not_available_emulate - preempt_stop + preempt_stop(CLBR_ANY) call math_state_restore jmp ret_from_exception device_not_available_emulate: diff --git a/arch/i386/kernel/module.c b/arch/i386/kernel/module.c index 470cf97e7cd3..d7d9c8b23f72 100644 --- a/arch/i386/kernel/module.c +++ b/arch/i386/kernel/module.c @@ -108,7 +108,8 @@ int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *me) { - const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL; + const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL, + *para = NULL; char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { @@ -118,6 +119,8 @@ int module_finalize(const Elf_Ehdr *hdr, alt = s; if (!strcmp(".smp_locks", secstrings + s->sh_name)) locks= s; + if (!strcmp(".parainstructions", secstrings + s->sh_name)) + para = s; } if (alt) { @@ -132,6 +135,12 @@ int module_finalize(const Elf_Ehdr *hdr, lseg, lseg + locks->sh_size, tseg, tseg + text->sh_size); } + + if (para) { + void *pseg = (void *)para->sh_addr; + apply_paravirt(pseg, pseg + para->sh_size); + } + return 0; } diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c index 478192cd4b90..d46460426446 100644 --- a/arch/i386/kernel/paravirt.c +++ b/arch/i386/kernel/paravirt.c @@ -45,6 +45,49 @@ char *memory_setup(void) return paravirt_ops.memory_setup(); } +/* Simple instruction patching code. */ +#define DEF_NATIVE(name, code) \ + extern const char start_##name[], end_##name[]; \ + asm("start_" #name ": " code "; end_" #name ":") +DEF_NATIVE(cli, "cli"); +DEF_NATIVE(sti, "sti"); +DEF_NATIVE(popf, "push %eax; popf"); +DEF_NATIVE(pushf, "pushf; pop %eax"); +DEF_NATIVE(pushf_cli, "pushf; pop %eax; cli"); +DEF_NATIVE(iret, "iret"); +DEF_NATIVE(sti_sysexit, "sti; sysexit"); + +static const struct native_insns +{ + const char *start, *end; +} native_insns[] = { + [PARAVIRT_IRQ_DISABLE] = { start_cli, end_cli }, + [PARAVIRT_IRQ_ENABLE] = { start_sti, end_sti }, + [PARAVIRT_RESTORE_FLAGS] = { start_popf, end_popf }, + [PARAVIRT_SAVE_FLAGS] = { start_pushf, end_pushf }, + [PARAVIRT_SAVE_FLAGS_IRQ_DISABLE] = { start_pushf_cli, end_pushf_cli }, + [PARAVIRT_INTERRUPT_RETURN] = { start_iret, end_iret }, + [PARAVIRT_STI_SYSEXIT] = { start_sti_sysexit, end_sti_sysexit }, +}; + +static unsigned native_patch(u8 type, u16 clobbers, void *insns, unsigned len) +{ + unsigned int insn_len; + + /* Don't touch it if we don't have a replacement */ + if (type >= ARRAY_SIZE(native_insns) || !native_insns[type].start) + return len; + + insn_len = native_insns[type].end - native_insns[type].start; + + /* Similarly if we can't fit replacement. */ + if (len < insn_len) + return len; + + memcpy(insns, native_insns[type].start, insn_len); + return insn_len; +} + static fastcall unsigned long native_get_debugreg(int regno) { unsigned long val = 0; /* Damn you, gcc! */ @@ -349,6 +392,7 @@ struct paravirt_ops paravirt_ops = { .paravirt_enabled = 0, .kernel_rpl = 0, + .patch = native_patch, .banner = default_banner, .arch_setup = native_nop, .memory_setup = machine_specific_memory_setup, diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S index 6860f20aa579..5c69cf0e5944 100644 --- a/arch/i386/kernel/vmlinux.lds.S +++ b/arch/i386/kernel/vmlinux.lds.S @@ -165,6 +165,12 @@ SECTIONS .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { *(.altinstr_replacement) } + . = ALIGN(4); + __start_parainstructions = .; + .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { + *(.parainstructions) + } + __stop_parainstructions = .; /* .exit.text is discard at runtime, not link time, to deal with references from .altinstructions and .eh_frame */ .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) } -- cgit v1.2.1 From d7cd56111f30259e1b532a12e06f59f8e0a20355 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 7 Dec 2006 02:14:08 +0100 Subject: [PATCH] i386: cpu_detect extraction Both lhype and Xen want to call the core of the x86 cpu detect code before calling start_kernel. (extracted from larger patch) AK: folded in start_kernel header patch Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Rusty Russell Signed-off-by: Andi Kleen Cc: Jeremy Fitzhardinge Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/i386/kernel/cpu/common.c | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c index cda41aef79ad..68bcb687019a 100644 --- a/arch/i386/kernel/cpu/common.c +++ b/arch/i386/kernel/cpu/common.c @@ -236,29 +236,14 @@ static int __cpuinit have_cpuid_p(void) return flag_is_changeable_p(X86_EFLAGS_ID); } -/* Do minimum CPU detection early. - Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment. - The others are not touched to avoid unwanted side effects. - - WARNING: this function is only called on the BP. Don't add code here - that is supposed to run on all CPUs. */ -static void __init early_cpu_detect(void) +void __init cpu_detect(struct cpuinfo_x86 *c) { - struct cpuinfo_x86 *c = &boot_cpu_data; - - c->x86_cache_alignment = 32; - - if (!have_cpuid_p()) - return; - /* Get vendor name */ cpuid(0x00000000, &c->cpuid_level, (int *)&c->x86_vendor_id[0], (int *)&c->x86_vendor_id[8], (int *)&c->x86_vendor_id[4]); - get_cpu_vendor(c, 1); - c->x86 = 4; if (c->cpuid_level >= 0x00000001) { u32 junk, tfms, cap0, misc; @@ -275,6 +260,26 @@ static void __init early_cpu_detect(void) } } +/* Do minimum CPU detection early. + Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment. + The others are not touched to avoid unwanted side effects. + + WARNING: this function is only called on the BP. Don't add code here + that is supposed to run on all CPUs. */ +static void __init early_cpu_detect(void) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + + c->x86_cache_alignment = 32; + + if (!have_cpuid_p()) + return; + + cpu_detect(c); + + get_cpu_vendor(c, 1); +} + static void __cpuinit generic_identify(struct cpuinfo_x86 * c) { u32 tfms, xlvl; -- cgit v1.2.1 From c9ccf30d77f04064fe5436027ab9d2230c7cdd94 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 7 Dec 2006 02:14:08 +0100 Subject: [PATCH] paravirt: Add startup infrastructure for paravirtualization 1) Each hypervisor writes a probe function to detect whether we are running under that hypervisor. paravirt_probe() registers this function. 2) If vmlinux is booted with ring != 0, we call all the probe functions (with registers except %esp intact) in link order: the winner will not return. Signed-off-by: Rusty Russell Signed-off-by: Chris Wright Signed-off-by: Andi Kleen Cc: Jeremy Fitzhardinge Cc: Zachary Amsden Signed-off-by: Andrew Morton --- arch/i386/kernel/Makefile | 2 ++ arch/i386/kernel/head.S | 33 +++++++++++++++++++++++++++++++++ arch/i386/kernel/paravirt.c | 4 ++++ arch/i386/kernel/vmlinux.lds.S | 6 ++++++ 4 files changed, 45 insertions(+) (limited to 'arch/i386') diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile index 406612136049..1e8988e558c5 100644 --- a/arch/i386/kernel/Makefile +++ b/arch/i386/kernel/Makefile @@ -39,6 +39,8 @@ obj-$(CONFIG_VM86) += vm86.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.o obj-$(CONFIG_HPET_TIMER) += hpet.o obj-$(CONFIG_K8_NB) += k8.o + +# Make sure this is linked after any other paravirt_ops structs: see head.S obj-$(CONFIG_PARAVIRT) += paravirt.o EXTRA_AFLAGS := -traditional diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S index 5b14e95ac8b9..edef5084ce17 100644 --- a/arch/i386/kernel/head.S +++ b/arch/i386/kernel/head.S @@ -55,6 +55,12 @@ */ ENTRY(startup_32) +#ifdef CONFIG_PARAVIRT + movl %cs, %eax + testl $0x3, %eax + jnz startup_paravirt +#endif + /* * Set segments to known values. */ @@ -486,6 +492,33 @@ ignore_int: #endif iret +#ifdef CONFIG_PARAVIRT +startup_paravirt: + cld + movl $(init_thread_union+THREAD_SIZE),%esp + + /* We take pains to preserve all the regs. */ + pushl %edx + pushl %ecx + pushl %eax + + /* paravirt.o is last in link, and that probe fn never returns */ + pushl $__start_paravirtprobe +1: + movl 0(%esp), %eax + pushl (%eax) + movl 8(%esp), %eax + call *(%esp) + popl %eax + + movl 4(%esp), %eax + movl 8(%esp), %ecx + movl 12(%esp), %edx + + addl $4, (%esp) + jmp 1b +#endif + /* * Real beginning of normal "text" segment */ diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c index d46460426446..5a9bd3250a1a 100644 --- a/arch/i386/kernel/paravirt.c +++ b/arch/i386/kernel/paravirt.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -387,6 +388,9 @@ static int __init print_banner(void) } core_initcall(print_banner); +/* We simply declare start_kernel to be the paravirt probe of last resort. */ +paravirt_probe(start_kernel); + struct paravirt_ops paravirt_ops = { .name = "bare hardware", .paravirt_enabled = 0, diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S index 5c69cf0e5944..877dc5cfe3a8 100644 --- a/arch/i386/kernel/vmlinux.lds.S +++ b/arch/i386/kernel/vmlinux.lds.S @@ -65,6 +65,12 @@ SECTIONS CONSTRUCTORS } :data + __start_paravirtprobe = .; + .paravirtprobe : AT(ADDR(.paravirtprobe) - LOAD_OFFSET) { + *(.paravirtprobe) + } + __stop_paravirtprobe = .; + . = ALIGN(4096); .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { __nosave_begin = .; -- cgit v1.2.1 From 4f205fd45a5c192907188d6f8f6d7e66db859248 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 7 Dec 2006 02:14:08 +0100 Subject: [PATCH] paravirt: Allow selected bug checks to be Allow selected bug checks to be skipped by paravirt kernels. The two most important are the F00F workaround (which is either done by the hypervisor, or not required), and the 'hlt' instruction check, which can break under some hypervisors. Signed-off-by: Zachary Amsden Signed-off-by: Chris Wright Signed-off-by: Andi Kleen Cc: Rusty Russell Cc: Jeremy Fitzhardinge Signed-off-by: Andrew Morton --- arch/i386/kernel/cpu/intel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/cpu/intel.c b/arch/i386/kernel/cpu/intel.c index 798c2f617e87..3ae795e9056d 100644 --- a/arch/i386/kernel/cpu/intel.c +++ b/arch/i386/kernel/cpu/intel.c @@ -107,7 +107,7 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) * Note that the workaround only should be initialized once... */ c->f00f_bug = 0; - if ( c->x86 == 5 ) { + if (!paravirt_enabled() && c->x86 == 5) { static int f00f_workaround_enabled = 0; c->f00f_bug = 1; -- cgit v1.2.1 From 3bbf54725467d604698721384d858b5983b87e8f Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 7 Dec 2006 02:14:08 +0100 Subject: [PATCH] paravirt: Disable vdso by default when CONFIG_PARAVIRT is enabled They don't work together and this way even glibc still works. Signed-off-by: Andi Kleen --- arch/i386/kernel/sysenter.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/i386') diff --git a/arch/i386/kernel/sysenter.c b/arch/i386/kernel/sysenter.c index 713ba39d32c6..92849c7def5a 100644 --- a/arch/i386/kernel/sysenter.c +++ b/arch/i386/kernel/sysenter.c @@ -27,7 +27,11 @@ * Should the kernel map a VDSO page into processes and pass its * address down to glibc upon exec()? */ +#ifdef CONFIG_PARAVIRT +unsigned int __read_mostly vdso_enabled = 0; +#else unsigned int __read_mostly vdso_enabled = 1; +#endif EXPORT_SYMBOL_GPL(vdso_enabled); -- cgit v1.2.1 From 6020c8f315709a508b027ef6749e85b125190947 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 7 Dec 2006 02:14:08 +0100 Subject: [PATCH] paravirt: Allow disable power management under hypervisor Two legacy power management modes are much easier to just explicitly disable when running in paravirtualized mode - neither APM nor PnP is still relevant. The status of ACPI is still debatable, and noacpi is still a common enough boot parameter that it is not necessary to explicitly disable ACPI. Signed-off-by: Zachary Amsden Signed-off-by: Chris Wright Signed-off-by: Andi Kleen Cc: Rusty Russell Cc: Jeremy Fitzhardinge Signed-off-by: Andrew Morton --- arch/i386/kernel/apm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c index a60358fe9a49..a97847da9ed5 100644 --- a/arch/i386/kernel/apm.c +++ b/arch/i386/kernel/apm.c @@ -231,6 +231,7 @@ #include #include #include +#include #include "io_ports.h" @@ -2235,7 +2236,7 @@ static int __init apm_init(void) dmi_check_system(apm_dmi_table); - if (apm_info.bios.version == 0) { + if (apm_info.bios.version == 0 || paravirt_enabled()) { printk(KERN_INFO "apm: BIOS not found.\n"); return -ENODEV; } -- cgit v1.2.1 From 13623d79309dd82e1964458fa017979d16f33fa8 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 7 Dec 2006 02:14:08 +0100 Subject: [PATCH] paravirt: Add APIC accessors to paravirt-ops. Add APIC accessors to paravirt-ops. Unfortunately, we need two write functions, as some older broken hardware requires workarounds for Pentium APIC errata - this is the purpose of apic_write_atomic. AK: replaced __inline with inline Signed-off-by: Zachary Amsden Signed-off-by: Chris Wright Signed-off-by: Andi Kleen Cc: Rusty Russell Cc: Jeremy Fitzhardinge Signed-off-by: Andrew Morton --- arch/i386/kernel/paravirt.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/i386') diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c index 5a9bd3250a1a..fe82eb3adf42 100644 --- a/arch/i386/kernel/paravirt.c +++ b/arch/i386/kernel/paravirt.c @@ -29,6 +29,8 @@ #include #include #include +#include +#include /* nop stub */ static void native_nop(void) @@ -446,6 +448,12 @@ struct paravirt_ops paravirt_ops = { .io_delay = native_io_delay, .const_udelay = __const_udelay, +#ifdef CONFIG_X86_LOCAL_APIC + .apic_write = native_apic_write, + .apic_write_atomic = native_apic_write_atomic, + .apic_read = native_apic_read, +#endif + .irq_enable_sysexit = native_irq_enable_sysexit, .iret = native_iret, }; -- cgit v1.2.1 From da181a8b3916aa7f2e3c5775d2bd2fe3454cf82d Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 7 Dec 2006 02:14:08 +0100 Subject: [PATCH] paravirt: Add MMU virtualization to paravirt_ops Add the three bare TLB accessor functions to paravirt-ops. Most amusingly, flush_tlb is redefined on SMP, so I can't call the paravirt op flush_tlb. Instead, I chose to indicate the actual flush type, kernel (global) vs. user (non-global). Global in this sense means using the global bit in the page table entry, which makes TLB entries persistent across CR3 reloads, not global as in the SMP sense of invoking remote shootdowns, so the term is confusingly overloaded. AK: folded in fix from Zach for PAE compilation Signed-off-by: Zachary Amsden Signed-off-by: Chris Wright Signed-off-by: Andi Kleen Cc: Rusty Russell Cc: Jeremy Fitzhardinge Signed-off-by: Andrew Morton --- arch/i386/kernel/paravirt.c | 109 ++++++++++++++++++++++++++++++++++++++++++++ arch/i386/mm/boot_ioremap.c | 1 + 2 files changed, 110 insertions(+) (limited to 'arch/i386') diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c index fe82eb3adf42..3dceab5828f1 100644 --- a/arch/i386/kernel/paravirt.c +++ b/arch/i386/kernel/paravirt.c @@ -31,6 +31,7 @@ #include #include #include +#include /* nop stub */ static void native_nop(void) @@ -379,6 +380,97 @@ static fastcall void native_io_delay(void) asm volatile("outb %al,$0x80"); } +static fastcall void native_flush_tlb(void) +{ + __native_flush_tlb(); +} + +/* + * Global pages have to be flushed a bit differently. Not a real + * performance problem because this does not happen often. + */ +static fastcall void native_flush_tlb_global(void) +{ + __native_flush_tlb_global(); +} + +static fastcall void native_flush_tlb_single(u32 addr) +{ + __native_flush_tlb_single(addr); +} + +#ifndef CONFIG_X86_PAE +static fastcall void native_set_pte(pte_t *ptep, pte_t pteval) +{ + *ptep = pteval; +} + +static fastcall void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval) +{ + *ptep = pteval; +} + +static fastcall void native_set_pmd(pmd_t *pmdp, pmd_t pmdval) +{ + *pmdp = pmdval; +} + +#else /* CONFIG_X86_PAE */ + +static fastcall void native_set_pte(pte_t *ptep, pte_t pte) +{ + ptep->pte_high = pte.pte_high; + smp_wmb(); + ptep->pte_low = pte.pte_low; +} + +static fastcall void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pte) +{ + ptep->pte_high = pte.pte_high; + smp_wmb(); + ptep->pte_low = pte.pte_low; +} + +static fastcall void native_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) +{ + ptep->pte_low = 0; + smp_wmb(); + ptep->pte_high = pte.pte_high; + smp_wmb(); + ptep->pte_low = pte.pte_low; +} + +static fastcall void native_set_pte_atomic(pte_t *ptep, pte_t pteval) +{ + set_64bit((unsigned long long *)ptep,pte_val(pteval)); +} + +static fastcall void native_set_pmd(pmd_t *pmdp, pmd_t pmdval) +{ + set_64bit((unsigned long long *)pmdp,pmd_val(pmdval)); +} + +static fastcall void native_set_pud(pud_t *pudp, pud_t pudval) +{ + *pudp = pudval; +} + +static fastcall void native_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +{ + ptep->pte_low = 0; + smp_wmb(); + ptep->pte_high = 0; +} + +static fastcall void native_pmd_clear(pmd_t *pmd) +{ + u32 *tmp = (u32 *)pmd; + *tmp = 0; + smp_wmb(); + *(tmp + 1) = 0; +} +#endif /* CONFIG_X86_PAE */ + /* These are in entry.S */ extern fastcall void native_iret(void); extern fastcall void native_irq_enable_sysexit(void); @@ -454,6 +546,23 @@ struct paravirt_ops paravirt_ops = { .apic_read = native_apic_read, #endif + .flush_tlb_user = native_flush_tlb, + .flush_tlb_kernel = native_flush_tlb_global, + .flush_tlb_single = native_flush_tlb_single, + + .set_pte = native_set_pte, + .set_pte_at = native_set_pte_at, + .set_pmd = native_set_pmd, + .pte_update = (void *)native_nop, + .pte_update_defer = (void *)native_nop, +#ifdef CONFIG_X86_PAE + .set_pte_atomic = native_set_pte_atomic, + .set_pte_present = native_set_pte_present, + .set_pud = native_set_pud, + .pte_clear = native_pte_clear, + .pmd_clear = native_pmd_clear, +#endif + .irq_enable_sysexit = native_irq_enable_sysexit, .iret = native_iret, }; diff --git a/arch/i386/mm/boot_ioremap.c b/arch/i386/mm/boot_ioremap.c index 4de11f508c3a..4de95a17a7d4 100644 --- a/arch/i386/mm/boot_ioremap.c +++ b/arch/i386/mm/boot_ioremap.c @@ -16,6 +16,7 @@ */ #undef CONFIG_X86_PAE +#undef CONFIG_PARAVIRT #include #include #include -- cgit v1.2.1 From bd472c794bbf6771c3fc1c58f188bc16c393d2fe Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 7 Dec 2006 02:14:08 +0100 Subject: [PATCH] paravirt: Be careful about touching BIOS address space BIOS ROM areas may not be mapped into the guest address space, so be careful when touching those addresses to make sure they appear to be mapped. [akpm@osdl.org: fix unused var warning] AK: Changed __get_user to probe_kernel_address Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Rusty Russell Signed-off-by: Andi Kleen Cc: Jeremy Fitzhardinge Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/i386/kernel/e820.c | 10 +++++++++- arch/i386/pci/pcbios.c | 11 +++++++++-- 2 files changed, 18 insertions(+), 3 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/e820.c b/arch/i386/kernel/e820.c index b755255f2721..b704790f7969 100644 --- a/arch/i386/kernel/e820.c +++ b/arch/i386/kernel/e820.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -155,7 +156,14 @@ static struct resource standard_io_resources[] = { { .flags = IORESOURCE_BUSY | IORESOURCE_IO } }; -#define romsignature(x) (*(unsigned short *)(x) == 0xaa55) +static int romsignature(const unsigned char *x) +{ + unsigned short sig; + int ret = 0; + if (probe_kernel_address((const unsigned short *)x, sig) == 0) + ret = (sig == 0xaa55); + return ret; +} static int __init romchecksum(unsigned char *rom, unsigned long length) { diff --git a/arch/i386/pci/pcbios.c b/arch/i386/pci/pcbios.c index ed1512a175ab..5f5193401bea 100644 --- a/arch/i386/pci/pcbios.c +++ b/arch/i386/pci/pcbios.c @@ -5,6 +5,7 @@ #include #include #include +#include #include "pci.h" #include "pci-functions.h" @@ -314,6 +315,10 @@ static struct pci_raw_ops * __devinit pci_find_bios(void) for (check = (union bios32 *) __va(0xe0000); check <= (union bios32 *) __va(0xffff0); ++check) { + long sig; + if (probe_kernel_address(&check->fields.signature, sig)) + continue; + if (check->fields.signature != BIOS32_SIGNATURE) continue; length = check->fields.length * 16; @@ -331,11 +336,13 @@ static struct pci_raw_ops * __devinit pci_find_bios(void) } DBG("PCI: BIOS32 Service Directory structure at 0x%p\n", check); if (check->fields.entry >= 0x100000) { - printk("PCI: BIOS32 entry (0x%p) in high memory, cannot use.\n", check); + printk("PCI: BIOS32 entry (0x%p) in high memory, " + "cannot use.\n", check); return NULL; } else { unsigned long bios32_entry = check->fields.entry; - DBG("PCI: BIOS32 Service Directory entry at 0x%lx\n", bios32_entry); + DBG("PCI: BIOS32 Service Directory entry at 0x%lx\n", + bios32_entry); bios32_indirect.address = bios32_entry + PAGE_OFFSET; if (check_pcibios()) return &pci_bios_access; -- cgit v1.2.1 From 8542b200cbe5609edd7aae0c304c091a1c290452 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Thu, 7 Dec 2006 02:14:09 +0100 Subject: [PATCH] paravirt: Add option to allow skipping the timer check Add a way to disable the timer IRQ routing check via a boot option. The VMI timer code uses this to avoid triggering the pester Mingo code, which probes for some very unusual and broken motherboard routings. It fires 100% of the time when using a paravirtual delay mechanism instead of using a realtime delay, since there is no elapsed real time, and the 4 timer IRQs have not yet been delivered. In addition, it is entirely possible, though improbable, that this bug could surface on real hardware which picks a particularly bad time to enter SMM mode, causing a long latency during one of the timer IRQs. While here, make check_timer be __init. Signed-off-by: Zachary Amsden Signed-off-by: Andi Kleen [chrisw: use no_timer_check to bring inline with x86_64 as per Andi's request] Signed-off-by: Chris Wright Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/i386/kernel/io_apic.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c index e33b7a845299..993150f206ed 100644 --- a/arch/i386/kernel/io_apic.c +++ b/arch/i386/kernel/io_apic.c @@ -1932,6 +1932,15 @@ static void __init setup_ioapic_ids_from_mpc(void) static void __init setup_ioapic_ids_from_mpc(void) { } #endif +static int no_timer_check __initdata; + +static int __init notimercheck(char *s) +{ + no_timer_check = 1; + return 1; +} +__setup("no_timer_check", notimercheck); + /* * There is a nasty bug in some older SMP boards, their mptable lies * about the timer IRQ. We do the following to work around the situation: @@ -1940,10 +1949,13 @@ static void __init setup_ioapic_ids_from_mpc(void) { } * - if this function detects that timer IRQs are defunct, then we fall * back to ISA timer IRQs */ -static int __init timer_irq_works(void) +int __init timer_irq_works(void) { unsigned long t1 = jiffies; + if (no_timer_check) + return 1; + local_irq_enable(); /* Let ten ticks pass... */ mdelay((10 * 1000) / HZ); @@ -2214,7 +2226,7 @@ int timer_uses_ioapic_pin_0; * is so screwy. Thanks to Brian Perkins for testing/hacking this beast * fanatically on his truly buggy board. */ -static inline void check_timer(void) +static inline void __init check_timer(void) { int apic1, pin1, apic2, pin2; int vector; -- cgit v1.2.1 From c55d92d141b9c40c67db249de91f5c224eb49859 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 7 Dec 2006 02:14:09 +0100 Subject: [PATCH] i386: Add support for compilation for Core2 gcc doesn't support -mtune=core2 yet, but will be soon. Use -mtune=generic or -mtune=i686 as fallback TBD need benchmarking for INTEL_USERCOPY etc. So far I used the same defaults as MPENTIUMM Signed-off-by: Andi Kleen --- arch/i386/Kconfig.cpu | 19 +++++++++++++------ arch/i386/Makefile.cpu | 1 + 2 files changed, 14 insertions(+), 6 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/Kconfig.cpu b/arch/i386/Kconfig.cpu index fc4f2abccf06..821fd269ca58 100644 --- a/arch/i386/Kconfig.cpu +++ b/arch/i386/Kconfig.cpu @@ -103,8 +103,15 @@ config MPENTIUMM Select this for Intel Pentium M (not Pentium-4 M) notebook chips. +config MCORE2 + bool "Core 2/newer Xeon" + help + Select this for Intel Core 2 and newer Core 2 Xeons (Xeon 51xx and 53xx) + CPUs. You can distingush newer from older Xeons by the CPU family + in /proc/cpuinfo. Newer ones have 6. + config MPENTIUM4 - bool "Pentium-4/Celeron(P4-based)/Pentium-4 M/Xeon" + bool "Pentium-4/Celeron(P4-based)/Pentium-4 M/older Xeon" help Select this for Intel Pentium 4 chips. This includes the Pentium 4, P4-based Celeron and Xeon, and Pentium-4 M @@ -229,7 +236,7 @@ config X86_L1_CACHE_SHIFT default "7" if MPENTIUM4 || X86_GENERIC default "4" if X86_ELAN || M486 || M386 || MGEODEGX1 default "5" if MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX - default "6" if MK7 || MK8 || MPENTIUMM + default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 config RWSEM_GENERIC_SPINLOCK bool @@ -287,17 +294,17 @@ config X86_ALIGNMENT_16 config X86_GOOD_APIC bool - depends on MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || MK8 || MEFFICEON + depends on MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || MK8 || MEFFICEON || MCORE2 default y config X86_INTEL_USERCOPY bool - depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON + depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2 default y config X86_USE_PPRO_CHECKSUM bool - depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON || MGEODE_LX + depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON || MGEODE_LX || MCORE2 default y config X86_USE_3DNOW @@ -312,5 +319,5 @@ config X86_OOSTORE config X86_TSC bool - depends on (MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MGEODEGX1 || MGEODE_LX) && !X86_NUMAQ + depends on (MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MGEODEGX1 || MGEODE_LX || MCORE2) && !X86_NUMAQ default y diff --git a/arch/i386/Makefile.cpu b/arch/i386/Makefile.cpu index a11befba26d5..a32c031c90d7 100644 --- a/arch/i386/Makefile.cpu +++ b/arch/i386/Makefile.cpu @@ -32,6 +32,7 @@ cflags-$(CONFIG_MWINCHIP2) += $(call cc-option,-march=winchip2,-march=i586) cflags-$(CONFIG_MWINCHIP3D) += $(call cc-option,-march=winchip2,-march=i586) cflags-$(CONFIG_MCYRIXIII) += $(call cc-option,-march=c3,-march=i486) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686) +cflags-$(CONFIG_MCORE2) += -march=i686 $(call cc-option,-mtune=core2,$(call cc-option,-mtune=generic,-mtune=i686)) # AMD Elan support cflags-$(CONFIG_X86_ELAN) += -march=i486 -- cgit v1.2.1 From b0bfece40b1988aa8e3d910938691dce7859d82d Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 7 Dec 2006 02:14:09 +0100 Subject: [PATCH] i386: clear_fixmap() should not use set_pte() While not strictly required with the current code (as the upper half of page table entries generated by __set_fixmap() cannot be non-zero due to the second parameter of this function being 'unsigned long'), the use of set_pte() in __set_fixmap() in the context of clear_fixmap() is still improper with CONFIG_X86_PAE (see the respective comment in include/asm-i386/pgtable-3level.h) and would turn into a bug if that second parameter ever gets changed to a 64-bit type. Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen --- arch/i386/mm/pgtable.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c index 10126e3f8174..65b5c0959033 100644 --- a/arch/i386/mm/pgtable.c +++ b/arch/i386/mm/pgtable.c @@ -95,8 +95,11 @@ static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags) return; } pte = pte_offset_kernel(pmd, vaddr); - /* stored as-is, to permit clearing entries */ - set_pte(pte, pfn_pte(pfn, flags)); + if (pgprot_val(flags)) + /* stored as-is, to permit clearing entries */ + set_pte(pte, pfn_pte(pfn, flags)); + else + pte_clear(&init_mm, vaddr, pte); /* * It's enough to flush this one mapping. -- cgit v1.2.1 From c6ea396de6836bdeb2d2433368130642bf0f6e15 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 7 Dec 2006 02:14:09 +0100 Subject: [PATCH] i386: Don't touch per cpu memory of offline CPUs in touch_nmi_watchdog Just like on x86-64, don't touch foreign CPUs' memory if the watchdog isn't enabled at all. Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen --- arch/i386/kernel/nmi.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c index 171194ccb7bc..f5bc7e1be801 100644 --- a/arch/i386/kernel/nmi.c +++ b/arch/i386/kernel/nmi.c @@ -870,14 +870,16 @@ static unsigned int void touch_nmi_watchdog (void) { - int i; + if (nmi_watchdog > 0) { + unsigned cpu; - /* - * Just reset the alert counters, (other CPUs might be - * spinning on locks we hold): - */ - for_each_possible_cpu(i) - alert_counter[i] = 0; + /* + * Just reset the alert counters, (other CPUs might be + * spinning on locks we hold): + */ + for_each_present_cpu (cpu) + alert_counter[cpu] = 0; + } /* * Tickle the softlockup detector too: -- cgit v1.2.1 From 475850c86b908ae026d5a4be02a1b1e9c408c75a Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 7 Dec 2006 02:14:09 +0100 Subject: [PATCH] i386: conditionalize inclusion of some MTRR flavors Avoid inclusion of code that's dead for x86-64. Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen --- arch/i386/kernel/cpu/mtrr/Makefile | 4 +--- arch/i386/kernel/cpu/mtrr/main.c | 6 ++++++ 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/cpu/mtrr/Makefile b/arch/i386/kernel/cpu/mtrr/Makefile index a25b701ab84e..191fc0533649 100644 --- a/arch/i386/kernel/cpu/mtrr/Makefile +++ b/arch/i386/kernel/cpu/mtrr/Makefile @@ -1,5 +1,3 @@ obj-y := main.o if.o generic.o state.o -obj-y += amd.o -obj-y += cyrix.o -obj-y += centaur.o +obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o diff --git a/arch/i386/kernel/cpu/mtrr/main.c b/arch/i386/kernel/cpu/mtrr/main.c index 2b8b0b361ccb..a4de30b9d3d3 100644 --- a/arch/i386/kernel/cpu/mtrr/main.c +++ b/arch/i386/kernel/cpu/mtrr/main.c @@ -59,7 +59,11 @@ struct mtrr_ops * mtrr_if = NULL; static void set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type); +#ifndef CONFIG_X86_64 extern int arr3_protected; +#else +#define arr3_protected 0 +#endif void set_mtrr_ops(struct mtrr_ops * ops) { @@ -544,9 +548,11 @@ extern void centaur_init_mtrr(void); static void __init init_ifs(void) { +#ifndef CONFIG_X86_64 amd_init_mtrr(); cyrix_init_mtrr(); centaur_init_mtrr(); +#endif } /* The suspend/resume methods are only for CPU without MTRR. CPU using generic -- cgit v1.2.1 From 365bff806e9faba000fb4956c7486fbf3a746d96 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 7 Dec 2006 02:14:09 +0100 Subject: [PATCH] i386: fix MTRR code Until not so long ago, there were system log messages pointing to inconsistent MTRR setup of the video frame buffer caused by the way vesafb and X worked. While vesafb was fixed meanwhile, I believe fixing it there only hides a shortcoming in the MTRR code itself, in that that code is not symmetric with respect to the ordering of attempts to set up two (or more) regions where one contains the other. In the current shape, it permits only setting up sub-regions of pre-exisiting ones. The patch below makes this symmetric. While working on that I noticed a few more inconsistencies in that code, namely - use of 'unsigned int' for sizes in many, but not all places (the patch is converting this to use 'unsigned long' everywhere, which specifically might be necessary for x86-64 once a processor supporting more than 44 physical address bits would become available) - the code to correct inconsistent settings during secondary processor startup tried (if necessary) to correct, among other things, the value in IA32_MTRR_DEF_TYPE, however the newly computed value would never get used (i.e. stored in the respective MSR) - the generic range validation code checked that the end of the to-be-added range would be above 1MB; the value checked should have been the start of the range - when contained regions are detected, previously this was allowed only when the old region was uncacheable; this can be symmetric (i.e. the new region can also be uncacheable) and even further as per Intel's documentation write-trough and write-back for either region is also compatible with the respective opposite in the other Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen --- arch/i386/kernel/cpu/mtrr/amd.c | 2 +- arch/i386/kernel/cpu/mtrr/centaur.c | 9 +++-- arch/i386/kernel/cpu/mtrr/cyrix.c | 25 +++++++++--- arch/i386/kernel/cpu/mtrr/generic.c | 78 +++++++++++++++++++++++++++++++------ arch/i386/kernel/cpu/mtrr/if.c | 28 +++++++------ arch/i386/kernel/cpu/mtrr/main.c | 55 +++++++++++++++++++------- arch/i386/kernel/cpu/mtrr/mtrr.h | 25 ++++++------ 7 files changed, 162 insertions(+), 60 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/cpu/mtrr/amd.c b/arch/i386/kernel/cpu/mtrr/amd.c index 1a1e04b6fd00..0949cdbf848a 100644 --- a/arch/i386/kernel/cpu/mtrr/amd.c +++ b/arch/i386/kernel/cpu/mtrr/amd.c @@ -7,7 +7,7 @@ static void amd_get_mtrr(unsigned int reg, unsigned long *base, - unsigned int *size, mtrr_type * type) + unsigned long *size, mtrr_type * type) { unsigned long low, high; diff --git a/arch/i386/kernel/cpu/mtrr/centaur.c b/arch/i386/kernel/cpu/mtrr/centaur.c index 33f00ac314ef..cb9aa3a7a7ab 100644 --- a/arch/i386/kernel/cpu/mtrr/centaur.c +++ b/arch/i386/kernel/cpu/mtrr/centaur.c @@ -17,7 +17,7 @@ static u8 centaur_mcr_type; /* 0 for winchip, 1 for winchip2 */ */ static int -centaur_get_free_region(unsigned long base, unsigned long size) +centaur_get_free_region(unsigned long base, unsigned long size, int replace_reg) /* [SUMMARY] Get a free MTRR. The starting (base) address of the region. The size (in bytes) of the region. @@ -26,10 +26,11 @@ centaur_get_free_region(unsigned long base, unsigned long size) { int i, max; mtrr_type ltype; - unsigned long lbase; - unsigned int lsize; + unsigned long lbase, lsize; max = num_var_ranges; + if (replace_reg >= 0 && replace_reg < max) + return replace_reg; for (i = 0; i < max; ++i) { if (centaur_mcr_reserved & (1 << i)) continue; @@ -49,7 +50,7 @@ mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi) static void centaur_get_mcr(unsigned int reg, unsigned long *base, - unsigned int *size, mtrr_type * type) + unsigned long *size, mtrr_type * type) { *base = centaur_mcr[reg].high >> PAGE_SHIFT; *size = -(centaur_mcr[reg].low & 0xfffff000) >> PAGE_SHIFT; diff --git a/arch/i386/kernel/cpu/mtrr/cyrix.c b/arch/i386/kernel/cpu/mtrr/cyrix.c index 9027a987006b..0737a596db43 100644 --- a/arch/i386/kernel/cpu/mtrr/cyrix.c +++ b/arch/i386/kernel/cpu/mtrr/cyrix.c @@ -9,7 +9,7 @@ int arr3_protected; static void cyrix_get_arr(unsigned int reg, unsigned long *base, - unsigned int *size, mtrr_type * type) + unsigned long *size, mtrr_type * type) { unsigned long flags; unsigned char arr, ccr3, rcr, shift; @@ -77,7 +77,7 @@ cyrix_get_arr(unsigned int reg, unsigned long *base, } static int -cyrix_get_free_region(unsigned long base, unsigned long size) +cyrix_get_free_region(unsigned long base, unsigned long size, int replace_reg) /* [SUMMARY] Get a free ARR. The starting (base) address of the region. The size (in bytes) of the region. @@ -86,9 +86,24 @@ cyrix_get_free_region(unsigned long base, unsigned long size) { int i; mtrr_type ltype; - unsigned long lbase; - unsigned int lsize; + unsigned long lbase, lsize; + switch (replace_reg) { + case 7: + if (size < 0x40) + break; + case 6: + case 5: + case 4: + return replace_reg; + case 3: + if (arr3_protected) + break; + case 2: + case 1: + case 0: + return replace_reg; + } /* If we are to set up a region >32M then look at ARR7 immediately */ if (size > 0x2000) { cyrix_get_arr(7, &lbase, &lsize, <ype); @@ -214,7 +229,7 @@ static void cyrix_set_arr(unsigned int reg, unsigned long base, typedef struct { unsigned long base; - unsigned int size; + unsigned long size; mtrr_type type; } arr_state_t; diff --git a/arch/i386/kernel/cpu/mtrr/generic.c b/arch/i386/kernel/cpu/mtrr/generic.c index ee8dc6753953..f77fc53db654 100644 --- a/arch/i386/kernel/cpu/mtrr/generic.c +++ b/arch/i386/kernel/cpu/mtrr/generic.c @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -15,12 +16,19 @@ struct mtrr_state { struct mtrr_var_range *var_ranges; mtrr_type fixed_ranges[NUM_FIXED_RANGES]; unsigned char enabled; + unsigned char have_fixed; mtrr_type def_type; }; static unsigned long smp_changes_mask; static struct mtrr_state mtrr_state = {}; +#undef MODULE_PARAM_PREFIX +#define MODULE_PARAM_PREFIX "mtrr." + +static __initdata int mtrr_show; +module_param_named(show, mtrr_show, bool, 0); + /* Get the MSR pair relating to a var range */ static void __init get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr) @@ -43,6 +51,14 @@ get_fixed_ranges(mtrr_type * frs) rdmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2], p[7 + i * 2]); } +static void __init print_fixed(unsigned base, unsigned step, const mtrr_type*types) +{ + unsigned i; + + for (i = 0; i < 8; ++i, ++types, base += step) + printk(KERN_INFO "MTRR %05X-%05X %s\n", base, base + step - 1, mtrr_attrib_to_str(*types)); +} + /* Grab all of the MTRR state for this CPU into *state */ void __init get_mtrr_state(void) { @@ -58,13 +74,49 @@ void __init get_mtrr_state(void) } vrs = mtrr_state.var_ranges; + rdmsr(MTRRcap_MSR, lo, dummy); + mtrr_state.have_fixed = (lo >> 8) & 1; + for (i = 0; i < num_var_ranges; i++) get_mtrr_var_range(i, &vrs[i]); - get_fixed_ranges(mtrr_state.fixed_ranges); + if (mtrr_state.have_fixed) + get_fixed_ranges(mtrr_state.fixed_ranges); rdmsr(MTRRdefType_MSR, lo, dummy); mtrr_state.def_type = (lo & 0xff); mtrr_state.enabled = (lo & 0xc00) >> 10; + + if (mtrr_show) { + int high_width; + + printk(KERN_INFO "MTRR default type: %s\n", mtrr_attrib_to_str(mtrr_state.def_type)); + if (mtrr_state.have_fixed) { + printk(KERN_INFO "MTRR fixed ranges %sabled:\n", + mtrr_state.enabled & 1 ? "en" : "dis"); + print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0); + for (i = 0; i < 2; ++i) + print_fixed(0x80000 + i * 0x20000, 0x04000, mtrr_state.fixed_ranges + (i + 1) * 8); + for (i = 0; i < 8; ++i) + print_fixed(0xC0000 + i * 0x08000, 0x01000, mtrr_state.fixed_ranges + (i + 3) * 8); + } + printk(KERN_INFO "MTRR variable ranges %sabled:\n", + mtrr_state.enabled & 2 ? "en" : "dis"); + high_width = ((size_or_mask ? ffs(size_or_mask) - 1 : 32) - (32 - PAGE_SHIFT) + 3) / 4; + for (i = 0; i < num_var_ranges; ++i) { + if (mtrr_state.var_ranges[i].mask_lo & (1 << 11)) + printk(KERN_INFO "MTRR %u base %0*X%05X000 mask %0*X%05X000 %s\n", + i, + high_width, + mtrr_state.var_ranges[i].base_hi, + mtrr_state.var_ranges[i].base_lo >> 12, + high_width, + mtrr_state.var_ranges[i].mask_hi, + mtrr_state.var_ranges[i].mask_lo >> 12, + mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff)); + else + printk(KERN_INFO "MTRR %u disabled\n", i); + } + } } /* Some BIOS's are fucked and don't set all MTRRs the same! */ @@ -95,7 +147,7 @@ void mtrr_wrmsr(unsigned msr, unsigned a, unsigned b) smp_processor_id(), msr, a, b); } -int generic_get_free_region(unsigned long base, unsigned long size) +int generic_get_free_region(unsigned long base, unsigned long size, int replace_reg) /* [SUMMARY] Get a free MTRR. The starting (base) address of the region. The size (in bytes) of the region. @@ -104,10 +156,11 @@ int generic_get_free_region(unsigned long base, unsigned long size) { int i, max; mtrr_type ltype; - unsigned long lbase; - unsigned lsize; + unsigned long lbase, lsize; max = num_var_ranges; + if (replace_reg >= 0 && replace_reg < max) + return replace_reg; for (i = 0; i < max; ++i) { mtrr_if->get(i, &lbase, &lsize, <ype); if (lsize == 0) @@ -117,7 +170,7 @@ int generic_get_free_region(unsigned long base, unsigned long size) } static void generic_get_mtrr(unsigned int reg, unsigned long *base, - unsigned int *size, mtrr_type * type) + unsigned long *size, mtrr_type *type) { unsigned int mask_lo, mask_hi, base_lo, base_hi; @@ -202,7 +255,9 @@ static int set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr) return changed; } -static unsigned long set_mtrr_state(u32 deftype_lo, u32 deftype_hi) +static u32 deftype_lo, deftype_hi; + +static unsigned long set_mtrr_state(void) /* [SUMMARY] Set the MTRR state for this CPU. The MTRR state information to read. Some relevant CPU context. @@ -217,14 +272,14 @@ static unsigned long set_mtrr_state(u32 deftype_lo, u32 deftype_hi) if (set_mtrr_var_ranges(i, &mtrr_state.var_ranges[i])) change_mask |= MTRR_CHANGE_MASK_VARIABLE; - if (set_fixed_ranges(mtrr_state.fixed_ranges)) + if (mtrr_state.have_fixed && set_fixed_ranges(mtrr_state.fixed_ranges)) change_mask |= MTRR_CHANGE_MASK_FIXED; /* Set_mtrr_restore restores the old value of MTRRdefType, so to set it we fiddle with the saved value */ if ((deftype_lo & 0xff) != mtrr_state.def_type || ((deftype_lo & 0xc00) >> 10) != mtrr_state.enabled) { - deftype_lo |= (mtrr_state.def_type | mtrr_state.enabled << 10); + deftype_lo = (deftype_lo & ~0xcff) | mtrr_state.def_type | (mtrr_state.enabled << 10); change_mask |= MTRR_CHANGE_MASK_DEFTYPE; } @@ -233,7 +288,6 @@ static unsigned long set_mtrr_state(u32 deftype_lo, u32 deftype_hi) static unsigned long cr4 = 0; -static u32 deftype_lo, deftype_hi; static DEFINE_SPINLOCK(set_atomicity_lock); /* @@ -271,7 +325,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock) rdmsr(MTRRdefType_MSR, deftype_lo, deftype_hi); /* Disable MTRRs, and set the default type to uncached */ - mtrr_wrmsr(MTRRdefType_MSR, deftype_lo & 0xf300UL, deftype_hi); + mtrr_wrmsr(MTRRdefType_MSR, deftype_lo & ~0xcff, deftype_hi); } static void post_set(void) __releases(set_atomicity_lock) @@ -300,7 +354,7 @@ static void generic_set_all(void) prepare_set(); /* Actually set the state */ - mask = set_mtrr_state(deftype_lo,deftype_hi); + mask = set_mtrr_state(); post_set(); local_irq_restore(flags); @@ -374,7 +428,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size, unsigned i } } - if (base < 0x100) { + if (base + size < 0x100) { printk(KERN_WARNING "mtrr: cannot set region below 1 MiB (0x%lx000,0x%lx000)\n", base, size); return -EINVAL; diff --git a/arch/i386/kernel/cpu/mtrr/if.c b/arch/i386/kernel/cpu/mtrr/if.c index 5ac051bb9d55..9753bc6a1f3f 100644 --- a/arch/i386/kernel/cpu/mtrr/if.c +++ b/arch/i386/kernel/cpu/mtrr/if.c @@ -17,7 +17,7 @@ extern unsigned int *usage_table; #define FILE_FCOUNT(f) (((struct seq_file *)((f)->private_data))->private) -static char *mtrr_strings[MTRR_NUM_TYPES] = +static const char *const mtrr_strings[MTRR_NUM_TYPES] = { "uncachable", /* 0 */ "write-combining", /* 1 */ @@ -28,7 +28,7 @@ static char *mtrr_strings[MTRR_NUM_TYPES] = "write-back", /* 6 */ }; -char *mtrr_attrib_to_str(int x) +const char *mtrr_attrib_to_str(int x) { return (x <= 6) ? mtrr_strings[x] : "?"; } @@ -155,6 +155,7 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) { int err = 0; mtrr_type type; + unsigned long size; struct mtrr_sentry sentry; struct mtrr_gentry gentry; void __user *arg = (void __user *) __arg; @@ -235,15 +236,15 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) case MTRRIOC_GET_ENTRY: if (gentry.regnum >= num_var_ranges) return -EINVAL; - mtrr_if->get(gentry.regnum, &gentry.base, &gentry.size, &type); + mtrr_if->get(gentry.regnum, &gentry.base, &size, &type); /* Hide entries that go above 4GB */ - if (gentry.base + gentry.size > 0x100000 - || gentry.size == 0x100000) + if (gentry.base + size - 1 >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT)) + || size >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT))) gentry.base = gentry.size = gentry.type = 0; else { gentry.base <<= PAGE_SHIFT; - gentry.size <<= PAGE_SHIFT; + gentry.size = size << PAGE_SHIFT; gentry.type = type; } @@ -273,8 +274,14 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) case MTRRIOC_GET_PAGE_ENTRY: if (gentry.regnum >= num_var_ranges) return -EINVAL; - mtrr_if->get(gentry.regnum, &gentry.base, &gentry.size, &type); - gentry.type = type; + mtrr_if->get(gentry.regnum, &gentry.base, &size, &type); + /* Hide entries that would overflow */ + if (size != (__typeof__(gentry.size))size) + gentry.base = gentry.size = gentry.type = 0; + else { + gentry.size = size; + gentry.type = type; + } break; } @@ -353,8 +360,7 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset) char factor; int i, max, len; mtrr_type type; - unsigned long base; - unsigned int size; + unsigned long base, size; len = 0; max = num_var_ranges; @@ -373,7 +379,7 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset) } /* RED-PEN: base can be > 32bit */ len += seq_printf(seq, - "reg%02i: base=0x%05lx000 (%4liMB), size=%4i%cB: %s, count=%d\n", + "reg%02i: base=0x%05lx000 (%4luMB), size=%4lu%cB: %s, count=%d\n", i, base, base >> (20 - PAGE_SHIFT), size, factor, mtrr_attrib_to_str(type), usage_table[i]); } diff --git a/arch/i386/kernel/cpu/mtrr/main.c b/arch/i386/kernel/cpu/mtrr/main.c index a4de30b9d3d3..aeea23e8a050 100644 --- a/arch/i386/kernel/cpu/mtrr/main.c +++ b/arch/i386/kernel/cpu/mtrr/main.c @@ -172,6 +172,13 @@ static void ipi_handler(void *info) #endif +static inline int types_compatible(mtrr_type type1, mtrr_type type2) { + return type1 == MTRR_TYPE_UNCACHABLE || + type2 == MTRR_TYPE_UNCACHABLE || + (type1 == MTRR_TYPE_WRTHROUGH && type2 == MTRR_TYPE_WRBACK) || + (type1 == MTRR_TYPE_WRBACK && type2 == MTRR_TYPE_WRTHROUGH); +} + /** * set_mtrr - update mtrrs on all processors * @reg: mtrr in question @@ -304,11 +311,9 @@ static void set_mtrr(unsigned int reg, unsigned long base, int mtrr_add_page(unsigned long base, unsigned long size, unsigned int type, char increment) { - int i; + int i, replace, error; mtrr_type ltype; - unsigned long lbase; - unsigned int lsize; - int error; + unsigned long lbase, lsize; if (!mtrr_if) return -ENXIO; @@ -328,12 +333,18 @@ int mtrr_add_page(unsigned long base, unsigned long size, return -ENOSYS; } + if (!size) { + printk(KERN_WARNING "mtrr: zero sized request\n"); + return -EINVAL; + } + if (base & size_or_mask || size & size_or_mask) { printk(KERN_WARNING "mtrr: base or size exceeds the MTRR width\n"); return -EINVAL; } error = -EINVAL; + replace = -1; /* No CPU hotplug when we change MTRR entries */ lock_cpu_hotplug(); @@ -341,21 +352,28 @@ int mtrr_add_page(unsigned long base, unsigned long size, mutex_lock(&mtrr_mutex); for (i = 0; i < num_var_ranges; ++i) { mtrr_if->get(i, &lbase, &lsize, <ype); - if (base >= lbase + lsize) - continue; - if ((base < lbase) && (base + size <= lbase)) + if (!lsize || base > lbase + lsize - 1 || base + size - 1 < lbase) continue; /* At this point we know there is some kind of overlap/enclosure */ - if ((base < lbase) || (base + size > lbase + lsize)) { + if (base < lbase || base + size - 1 > lbase + lsize - 1) { + if (base <= lbase && base + size - 1 >= lbase + lsize - 1) { + /* New region encloses an existing region */ + if (type == ltype) { + replace = replace == -1 ? i : -2; + continue; + } + else if (types_compatible(type, ltype)) + continue; + } printk(KERN_WARNING "mtrr: 0x%lx000,0x%lx000 overlaps existing" - " 0x%lx000,0x%x000\n", base, size, lbase, + " 0x%lx000,0x%lx000\n", base, size, lbase, lsize); goto out; } /* New region is enclosed by an existing region */ if (ltype != type) { - if (type == MTRR_TYPE_UNCACHABLE) + if (types_compatible(type, ltype)) continue; printk (KERN_WARNING "mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n", base, size, mtrr_attrib_to_str(ltype), @@ -368,10 +386,18 @@ int mtrr_add_page(unsigned long base, unsigned long size, goto out; } /* Search for an empty MTRR */ - i = mtrr_if->get_free_region(base, size); + i = mtrr_if->get_free_region(base, size, replace); if (i >= 0) { set_mtrr(i, base, size, type); - usage_table[i] = 1; + if (likely(replace < 0)) + usage_table[i] = 1; + else { + usage_table[i] = usage_table[replace] + !!increment; + if (unlikely(replace != i)) { + set_mtrr(replace, 0, 0, 0); + usage_table[replace] = 0; + } + } } else printk(KERN_INFO "mtrr: no more MTRRs available\n"); error = i; @@ -459,8 +485,7 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size) { int i, max; mtrr_type ltype; - unsigned long lbase; - unsigned int lsize; + unsigned long lbase, lsize; int error = -EINVAL; if (!mtrr_if) @@ -561,7 +586,7 @@ static void __init init_ifs(void) struct mtrr_value { mtrr_type ltype; unsigned long lbase; - unsigned int lsize; + unsigned long lsize; }; static struct mtrr_value * mtrr_state; diff --git a/arch/i386/kernel/cpu/mtrr/mtrr.h b/arch/i386/kernel/cpu/mtrr/mtrr.h index 99c9f2682041..d61ea9db6cfe 100644 --- a/arch/i386/kernel/cpu/mtrr/mtrr.h +++ b/arch/i386/kernel/cpu/mtrr/mtrr.h @@ -43,15 +43,16 @@ struct mtrr_ops { void (*set_all)(void); void (*get)(unsigned int reg, unsigned long *base, - unsigned int *size, mtrr_type * type); - int (*get_free_region) (unsigned long base, unsigned long size); - + unsigned long *size, mtrr_type * type); + int (*get_free_region)(unsigned long base, unsigned long size, + int replace_reg); int (*validate_add_page)(unsigned long base, unsigned long size, unsigned int type); int (*have_wrcomb)(void); }; -extern int generic_get_free_region(unsigned long base, unsigned long size); +extern int generic_get_free_region(unsigned long base, unsigned long size, + int replace_reg); extern int generic_validate_add_page(unsigned long base, unsigned long size, unsigned int type); @@ -62,17 +63,17 @@ extern int positive_have_wrcomb(void); /* library functions for processor-specific routines */ struct set_mtrr_context { unsigned long flags; - unsigned long deftype_lo; - unsigned long deftype_hi; unsigned long cr4val; - unsigned long ccr3; + u32 deftype_lo; + u32 deftype_hi; + u32 ccr3; }; struct mtrr_var_range { - unsigned long base_lo; - unsigned long base_hi; - unsigned long mask_lo; - unsigned long mask_hi; + u32 base_lo; + u32 base_hi; + u32 mask_lo; + u32 mask_hi; }; void set_mtrr_done(struct set_mtrr_context *ctxt); @@ -92,6 +93,6 @@ extern struct mtrr_ops * mtrr_if; extern unsigned int num_var_ranges; void mtrr_state_warn(void); -char *mtrr_attrib_to_str(int x); +const char *mtrr_attrib_to_str(int x); void mtrr_wrmsr(unsigned, unsigned, unsigned); -- cgit v1.2.1 From ba10650a880c2df23bd1db6c0570ddb66f389641 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Thu, 7 Dec 2006 02:14:10 +0100 Subject: [PATCH] i386: alloc_gdt() static Make the needlessly global alloc_gdt() static. (against) pda-percpu-init Signed-off-by: Adrian Bunk Signed-off-by: Andi Kleen Cc: Andi Kleen Cc: Jeremy Fitzhardinge Signed-off-by: Andrew Morton --- arch/i386/kernel/cpu/common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c index 68bcb687019a..1b34c56f8123 100644 --- a/arch/i386/kernel/cpu/common.c +++ b/arch/i386/kernel/cpu/common.c @@ -609,7 +609,7 @@ struct pt_regs * __devinit idle_regs(struct pt_regs *regs) return regs; } -__cpuinit int alloc_gdt(int cpu) +static __cpuinit int alloc_gdt(int cpu) { struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); struct desc_struct *gdt; -- cgit v1.2.1 From 79929fd1c1887d2a057cbb80d487a2e2f1c01a02 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 7 Dec 2006 02:14:10 +0100 Subject: [PATCH] i386: Convert more absolute symbols to section relative o Convert more absolute symbols to section relative to keep the theme in vmlinux.lds.S file and to avoid problem if kernel is relocated. o Also put a message so that in future people can be aware of it and avoid introducing absolute symbols. Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/i386/kernel/vmlinux.lds.S | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S index 877dc5cfe3a8..25581e87c60d 100644 --- a/arch/i386/kernel/vmlinux.lds.S +++ b/arch/i386/kernel/vmlinux.lds.S @@ -8,6 +8,12 @@ * put it inside the section definition. */ +/* Don't define absolute symbols until and unless you know that symbol + * value is should remain constant even if kernel image is relocated + * at run time. Absolute symbols are not relocated. If symbol value should + * change if kernel is relocated, make the symbol section relative and + * put it inside the section definition. + */ #define LOAD_OFFSET __PAGE_OFFSET #include @@ -65,11 +71,11 @@ SECTIONS CONSTRUCTORS } :data - __start_paravirtprobe = .; .paravirtprobe : AT(ADDR(.paravirtprobe) - LOAD_OFFSET) { + __start_paravirtprobe = .; *(.paravirtprobe) + __stop_paravirtprobe = .; } - __stop_paravirtprobe = .; . = ALIGN(4096); .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { @@ -172,11 +178,11 @@ SECTIONS *(.altinstr_replacement) } . = ALIGN(4); - __start_parainstructions = .; .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { + __start_parainstructions = .; *(.parainstructions) + __stop_parainstructions = .; } - __stop_parainstructions = .; /* .exit.text is discard at runtime, not link time, to deal with references from .altinstructions and .eh_frame */ .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) } -- cgit v1.2.1 From 274e1bbdeeaf16e71418f11f5f305ab26061f2c2 Mon Sep 17 00:00:00 2001 From: "Siddha, Suresh B" Date: Thu, 7 Dec 2006 02:14:10 +0100 Subject: [PATCH] x86: add write_pci_config_byte() to direct PCI access routines Mechanism of selecting physical mode in genapic when cpu hotplug is enabled on x86_64, broke the quirk(quirk_intel_irqbalance()) introduced for working around the transposing interrupt message errata in E7520/E7320/E7525 (revision ID 0x9 and below. errata #23 in http://download.intel.com/design/chipsets/specupdt/30304203.pdf). This errata requires the mode to be in logical flat, so that interrupts can be directed to more than one cpu(and thus use hardware IRQ balancing enabled by BIOS on these platforms). Following four patches fixes this by moving the quirk to early quirk and forcing the x86_64 genapic selection to logical flat on these platforms. Thanks to Shaohua for pointing out the breakage. This patch: Add write_pci_config_byte() to direct PCI access routines Signed-off-by: Suresh Siddha Signed-off-by: Andi Kleen Cc: Andi Kleen Cc: "Li, Shaohua" Signed-off-by: Andrew Morton --- arch/i386/pci/early.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/i386') diff --git a/arch/i386/pci/early.c b/arch/i386/pci/early.c index 713d6c866cae..42df4b6606df 100644 --- a/arch/i386/pci/early.c +++ b/arch/i386/pci/early.c @@ -45,6 +45,13 @@ void write_pci_config(u8 bus, u8 slot, u8 func, u8 offset, outl(val, 0xcfc); } +void write_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset, u8 val) +{ + PDprintk("%x writing to %x: %x\n", slot, offset, val); + outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); + outb(val, 0xcfc); +} + int early_pci_allowed(void) { return (pci_probe & (PCI_PROBE_CONF1|PCI_PROBE_NOEARLY)) == -- cgit v1.2.1 From fd6d7d26897dec834d0b9fbdc59819b0332a1257 Mon Sep 17 00:00:00 2001 From: "Siddha, Suresh B" Date: Thu, 7 Dec 2006 02:14:10 +0100 Subject: [PATCH] i386: introduce the mechanism of disabling cpu hotplug control Add 'enable_cpu_hotplug' flag and when cleared, the hotplug control file ("online") will not be added under /sys/devices/system/cpu/cpuX/ Next patch doing PCI quirks will use this. Signed-off-by: Suresh Siddha Signed-off-by: Andi Kleen Cc: Andi Kleen Cc: "Li, Shaohua" Signed-off-by: Andrew Morton --- arch/i386/kernel/topology.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/topology.c b/arch/i386/kernel/topology.c index 07d6da36a825..844c08fdb225 100644 --- a/arch/i386/kernel/topology.c +++ b/arch/i386/kernel/topology.c @@ -40,14 +40,18 @@ int arch_register_cpu(int num) * restrictions and assumptions in kernel. This basically * doesnt add a control file, one cannot attempt to offline * BSP. + * + * Also certain PCI quirks require not to enable hotplug control + * for all CPU's. */ - if (!num) + if (!num || !enable_cpu_hotplug) cpu_devices[num].cpu.no_control = 1; return register_cpu(&cpu_devices[num].cpu, num); } #ifdef CONFIG_HOTPLUG_CPU +int enable_cpu_hotplug = 1; void arch_unregister_cpu(int num) { return unregister_cpu(&cpu_devices[num].cpu); -- cgit v1.2.1 From 72486f1f8f0a2bc828b9d30cf4690cf2dd6807fc Mon Sep 17 00:00:00 2001 From: "Siddha, Suresh B" Date: Thu, 7 Dec 2006 02:14:10 +0100 Subject: [PATCH] i386: change the 'no_control' field to 'hotpluggable' in the struct cpu Change the 'no_control' field in the cpu struct to a more positive and better term 'hotpluggable'. And change(/cleanup) the logic accordingly. Signed-off-by: Suresh Siddha Signed-off-by: Andi Kleen Cc: Andi Kleen Cc: "Li, Shaohua" Signed-off-by: Andrew Morton --- arch/i386/kernel/topology.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/topology.c b/arch/i386/kernel/topology.c index 844c08fdb225..79cf608e14ca 100644 --- a/arch/i386/kernel/topology.c +++ b/arch/i386/kernel/topology.c @@ -44,8 +44,8 @@ int arch_register_cpu(int num) * Also certain PCI quirks require not to enable hotplug control * for all CPU's. */ - if (!num || !enable_cpu_hotplug) - cpu_devices[num].cpu.no_control = 1; + if (num && enable_cpu_hotplug) + cpu_devices[num].cpu.hotpluggable = 1; return register_cpu(&cpu_devices[num].cpu, num); } -- cgit v1.2.1 From b0d0a4ba45760b10ecee9035ed45b442c1a6cc84 Mon Sep 17 00:00:00 2001 From: "Siddha, Suresh B" Date: Thu, 7 Dec 2006 02:14:10 +0100 Subject: [PATCH] x86: fix the irqbalance quirk for E7320/E7520/E7525 Move the irqbalance quirks for E7320/E7520/E7525(Errata 23 in http://download.intel.com/design/chipsets/specupdt/30304203.pdf) to early quirks. And add a PCI quirk for these platforms to check(which happens very late during the boot) if the APIC routing is indeed set to default flat mode. This fixes the breakage(in x86_64) of this quirk due to cpu hotplug which selects physical mode instead of the logical flat(as needed for this errata workaround). Signed-off-by: Suresh Siddha Signed-off-by: Andi Kleen Cc: Andi Kleen Cc: "Li, Shaohua" Signed-off-by: Andrew Morton --- arch/i386/kernel/acpi/earlyquirk.c | 21 +++++++++++++++++ arch/i386/kernel/quirks.c | 46 +++++++++++++++++++++++++++++--------- arch/i386/kernel/smpboot.c | 7 ++++++ 3 files changed, 63 insertions(+), 11 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/acpi/earlyquirk.c b/arch/i386/kernel/acpi/earlyquirk.c index c9841692bb7c..4b60af7f91dd 100644 --- a/arch/i386/kernel/acpi/earlyquirk.c +++ b/arch/i386/kernel/acpi/earlyquirk.c @@ -10,6 +10,7 @@ #include #include #include +#include #ifdef CONFIG_ACPI @@ -49,6 +50,24 @@ static int __init check_bridge(int vendor, int device) return 0; } +static void check_intel(void) +{ + u16 vendor, device; + + vendor = read_pci_config_16(0, 0, 0, PCI_VENDOR_ID); + + if (vendor != PCI_VENDOR_ID_INTEL) + return; + + device = read_pci_config_16(0, 0, 0, PCI_DEVICE_ID); +#ifdef CONFIG_SMP + if (device == PCI_DEVICE_ID_INTEL_E7320_MCH || + device == PCI_DEVICE_ID_INTEL_E7520_MCH || + device == PCI_DEVICE_ID_INTEL_E7525_MCH) + quirk_intel_irqbalance(); +#endif +} + void __init check_acpi_pci(void) { int num, slot, func; @@ -60,6 +79,8 @@ void __init check_acpi_pci(void) if (!early_pci_allowed()) return; + check_intel(); + /* Poor man's PCI discovery */ for (num = 0; num < 32; num++) { for (slot = 0; slot < 32; slot++) { diff --git a/arch/i386/kernel/quirks.c b/arch/i386/kernel/quirks.c index 9f6ab1789bb0..a01320a7b636 100644 --- a/arch/i386/kernel/quirks.c +++ b/arch/i386/kernel/quirks.c @@ -3,10 +3,23 @@ */ #include #include +#include +#include +#include #if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI) +static void __devinit verify_quirk_intel_irqbalance(struct pci_dev *dev) +{ +#ifdef CONFIG_X86_64 + if (genapic != &apic_flat) + panic("APIC mode must be flat on this system\n"); +#elif defined(CONFIG_X86_GENERICARCH) + if (genapic != &apic_default) + panic("APIC mode must be default(flat) on this system. Use apic=default\n"); +#endif +} -static void __devinit quirk_intel_irqbalance(struct pci_dev *dev) +void __init quirk_intel_irqbalance(void) { u8 config, rev; u32 word; @@ -16,18 +29,18 @@ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev) * based platforms. * Disable SW irqbalance/affinity on those platforms. */ - pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev); + rev = read_pci_config_byte(0, 0, 0, PCI_CLASS_REVISION); if (rev > 0x9) return; printk(KERN_INFO "Intel E7520/7320/7525 detected."); - /* enable access to config space*/ - pci_read_config_byte(dev, 0xf4, &config); - pci_write_config_byte(dev, 0xf4, config|0x2); + /* enable access to config space */ + config = read_pci_config_byte(0, 0, 0, 0xf4); + write_pci_config_byte(0, 0, 0, 0xf4, config|0x2); /* read xTPR register */ - raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word); + word = read_pci_config_16(0, 0, 0x40, 0x4c); if (!(word & (1 << 13))) { printk(KERN_INFO "Disabling irq balancing and affinity\n"); @@ -37,14 +50,25 @@ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev) noirqdebug_setup(""); #ifdef CONFIG_PROC_FS no_irq_affinity = 1; +#endif +#ifdef CONFIG_HOTPLUG_CPU + printk(KERN_INFO "Disabling cpu hotplug control\n"); + enable_cpu_hotplug = 0; +#endif +#ifdef CONFIG_X86_64 + /* force the genapic selection to flat mode so that + * interrupts can be redirected to more than one CPU. + */ + genapic_force = &apic_flat; #endif } - /* put back the original value for config space*/ + /* put back the original value for config space */ if (!(config & 0x2)) - pci_write_config_byte(dev, 0xf4, config); + write_pci_config_byte(0, 0, 0, 0xf4, config); } -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, quirk_intel_irqbalance); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, quirk_intel_irqbalance); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, quirk_intel_irqbalance); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, verify_quirk_intel_irqbalance); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, verify_quirk_intel_irqbalance); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, verify_quirk_intel_irqbalance); + #endif diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index cd7de9c9654b..346f27f4c79f 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -58,6 +58,7 @@ #include #include #include +#include #include #include @@ -1482,6 +1483,12 @@ int __devinit __cpu_up(unsigned int cpu) cpu_set(cpu, smp_commenced_mask); while (!cpu_isset(cpu, cpu_online_map)) cpu_relax(); + +#ifdef CONFIG_X86_GENERICARCH + if (num_online_cpus() > 8 && genapic == &apic_default) + panic("Default flat APIC routing can't be used with > 8 cpus\n"); +#endif + return 0; } -- cgit v1.2.1 From e1cccf48b182dd743c3c83a4fdf8dc570a43b393 Mon Sep 17 00:00:00 2001 From: Artiom Myaskouvskey Date: Thu, 7 Dec 2006 02:14:11 +0100 Subject: [PATCH] i386: call efi_get_time during suspend Function efi_get_time called not only during init kernel phase but also during suspend (from get_cmos_time). When it is called from get_cmos_time the corresponding runtime service should be called in virtual and not in physical mode. Signed-off-by: Artiom Myaskouvskey Signed-off-by: Andi Kleen Cc: "Narayanan, Chandramouli" Cc: "Jiossy, Rami" Cc: "Satt, Shai" Cc: Andi Kleen Cc: Matt Domsch Signed-off-by: Andrew Morton --- arch/i386/kernel/efi.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/efi.c b/arch/i386/kernel/efi.c index 8b40648d0ef0..b92c7f0a358a 100644 --- a/arch/i386/kernel/efi.c +++ b/arch/i386/kernel/efi.c @@ -194,17 +194,24 @@ inline int efi_set_rtc_mmss(unsigned long nowtime) return 0; } /* - * This should only be used during kernel init and before runtime - * services have been remapped, therefore, we'll need to call in physical - * mode. Note, this call isn't used later, so mark it __init. + * This is used during kernel init before runtime + * services have been remapped and also during suspend, therefore, + * we'll need to call both in physical and virtual modes. */ -inline unsigned long __init efi_get_time(void) +inline unsigned long efi_get_time(void) { efi_status_t status; efi_time_t eft; efi_time_cap_t cap; - status = phys_efi_get_time(&eft, &cap); + if (efi.get_time) { + /* if we are in virtual mode use remapped function */ + status = efi.get_time(&eft, &cap); + } else { + /* we are in physical mode */ + status = phys_efi_get_time(&eft, &cap); + } + if (status != EFI_SUCCESS) printk("Oops: efitime: can't read time status: 0x%lx\n",status); -- cgit v1.2.1 From 956fb53197f82257974f1f9835485aeeef4510b3 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Thu, 7 Dec 2006 02:14:11 +0100 Subject: [PATCH] i386: handle a negative return value The Coverity checker noted that bad things might happen if find_isa_irq_apic() returned -1. [akpm@osdl.org: add debugging checks] Signed-off-by: Adrian Bunk Signed-off-by: Andi Kleen Cc: Andi Kleen Acked-by: Ingo Molnar Signed-off-by: Andrew Morton --- arch/i386/kernel/io_apic.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c index 993150f206ed..7bfd6c3ec87d 100644 --- a/arch/i386/kernel/io_apic.c +++ b/arch/i386/kernel/io_apic.c @@ -2179,9 +2179,15 @@ static inline void unlock_ExtINT_logic(void) unsigned char save_control, save_freq_select; pin = find_isa_irq_pin(8, mp_INT); + if (pin == -1) { + WARN_ON_ONCE(1); + return; + } apic = find_isa_irq_apic(8, mp_INT); - if (pin == -1) + if (apic == -1) { + WARN_ON_ONCE(1); return; + } entry0 = ioapic_read_entry(apic, pin); clear_IO_APIC_pin(apic, pin); -- cgit v1.2.1 From 7e95b593a1aeb6fe1d3904e799d23a45261f2c19 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Thu, 7 Dec 2006 02:14:11 +0100 Subject: [PATCH] i386: Make irq_vector static irq_vector[] can now become static. Signed-off-by: Adrian Bunk Signed-off-by: Andi Kleen Acked-by: Eric W. Biederman Acked-by: Ingo Molnar Signed-off-by: Andrew Morton --- arch/i386/kernel/io_apic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c index 7bfd6c3ec87d..56f571c9fc0d 100644 --- a/arch/i386/kernel/io_apic.c +++ b/arch/i386/kernel/io_apic.c @@ -1241,7 +1241,7 @@ static inline int IO_APIC_irq_trigger(int irq) } /* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */ -u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 }; +static u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 }; static int __assign_irq_vector(int irq) { -- cgit v1.2.1 From 538f188e03c821c93b355c9fc346806cdd34e286 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 7 Dec 2006 02:14:11 +0100 Subject: [PATCH] i386: i386 add Intel BTS cpufeature bit and detection (take 2) Here is a small patch for i386 which adds a cpufeature flag and detection code for Intel's Branch Trace Store (BTS) feature. This feature can be found on Intel P4 and Core 2 processors among others. It can also be used by perfmon. changelog: - add CPU_FEATURE_BTS - add Branch Trace Store detection signed-off-by: stephane eranian Signed-off-by: Andi Kleen --- arch/i386/kernel/cpu/intel.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/i386') diff --git a/arch/i386/kernel/cpu/intel.c b/arch/i386/kernel/cpu/intel.c index 3ae795e9056d..56fe26584957 100644 --- a/arch/i386/kernel/cpu/intel.c +++ b/arch/i386/kernel/cpu/intel.c @@ -199,6 +199,8 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) if (cpu_has_ds) { unsigned int l1; rdmsr(MSR_IA32_MISC_ENABLE, l1, l2); + if (!(l1 & (1<<11))) + set_bit(X86_FEATURE_BTS, c->x86_capability); if (!(l1 & (1<<12))) set_bit(X86_FEATURE_PEBS, c->x86_capability); } -- cgit v1.2.1 From 9a8cb626a08f2c8251291f3c0a049b29665895d2 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 7 Dec 2006 02:14:11 +0100 Subject: [PATCH] i386: Avoid boot warning with apic=debug There are two consumers of apic=: the apic debug level and the low level generic architecture code. early_param would warn when the low level code rejected "debug". Avoid this. Signed-off-by: Andi Kleen --- arch/i386/mach-generic/probe.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/i386') diff --git a/arch/i386/mach-generic/probe.c b/arch/i386/mach-generic/probe.c index 94b1fd9cbe3c..a7b3999bb37a 100644 --- a/arch/i386/mach-generic/probe.c +++ b/arch/i386/mach-generic/probe.c @@ -45,7 +45,9 @@ static int __init parse_apic(char *arg) return 0; } } - return -ENOENT; + + /* Parsed again by __setup for debug/verbose */ + return 0; } early_param("apic", parse_apic); -- cgit v1.2.1 From f990fff427d68af3e4e1d16fe799c106abc0bf53 Mon Sep 17 00:00:00 2001 From: Karsten Wiese Date: Thu, 7 Dec 2006 02:14:11 +0100 Subject: [PATCH] x86: Regard MSRs in lapic_suspend()/lapic_resume() Read/Write APIC_LVTPC and APIC_LVTTHMR only, if get_maxlvt() returns certain values. This is done like everywhere else in i386/kernel/apic.c, so I guess its correct. Suspends/Resumes to disk fine and eleminates an smp_error_interrupt() here on a K8. AK: ported to x86-64 too Signed-off-by: Karsten Wiese Signed-off-by: Andi Kleen --- arch/i386/kernel/apic.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c index 2fd4b7d927c2..776d9be26af9 100644 --- a/arch/i386/kernel/apic.c +++ b/arch/i386/kernel/apic.c @@ -647,23 +647,30 @@ static struct { static int lapic_suspend(struct sys_device *dev, pm_message_t state) { unsigned long flags; + int maxlvt; if (!apic_pm_state.active) return 0; + maxlvt = get_maxlvt(); + apic_pm_state.apic_id = apic_read(APIC_ID); apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI); apic_pm_state.apic_ldr = apic_read(APIC_LDR); apic_pm_state.apic_dfr = apic_read(APIC_DFR); apic_pm_state.apic_spiv = apic_read(APIC_SPIV); apic_pm_state.apic_lvtt = apic_read(APIC_LVTT); - apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC); + if (maxlvt >= 4) + apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC); apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0); apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1); apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); apic_pm_state.apic_tmict = apic_read(APIC_TMICT); apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); - apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); +#ifdef CONFIG_X86_MCE_P4THERMAL + if (maxlvt >= 5) + apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); +#endif local_irq_save(flags); disable_local_APIC(); @@ -675,10 +682,13 @@ static int lapic_resume(struct sys_device *dev) { unsigned int l, h; unsigned long flags; + int maxlvt; if (!apic_pm_state.active) return 0; + maxlvt = get_maxlvt(); + local_irq_save(flags); /* @@ -700,8 +710,12 @@ static int lapic_resume(struct sys_device *dev) apic_write(APIC_SPIV, apic_pm_state.apic_spiv); apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); - apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); - apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc); +#ifdef CONFIG_X86_MCE_P4THERMAL + if (maxlvt >= 5) + apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); +#endif + if (maxlvt >= 4) + apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc); apic_write(APIC_LVTT, apic_pm_state.apic_lvtt); apic_write(APIC_TDCR, apic_pm_state.apic_tdcr); apic_write(APIC_TMICT, apic_pm_state.apic_tmict); -- cgit v1.2.1 From bf7e6a196318316e921f357557fca9d11d15f486 Mon Sep 17 00:00:00 2001 From: Artiom Myaskouvskey Date: Thu, 7 Dec 2006 02:14:11 +0100 Subject: [PATCH] i386: Preserve EFI run time regions with memmap parameter When using memmap kernel parameter in EFI boot we should also add to memory map memory regions of runtime services to enable their mapping later. AK: merged and cleaned up the patch Signed-off-by: Artiom Myaskouvskey Signed-off-by: Andi Kleen --- arch/i386/kernel/e820.c | 60 +++++++++++++++++++++++++++++++++++-------------- arch/i386/mm/init.c | 2 -- 2 files changed, 43 insertions(+), 19 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/e820.c b/arch/i386/kernel/e820.c index b704790f7969..2f7d0a92fd7c 100644 --- a/arch/i386/kernel/e820.c +++ b/arch/i386/kernel/e820.c @@ -742,29 +742,55 @@ void __init print_memory_map(char *who) } } -void __init limit_regions(unsigned long long size) +static __init __always_inline void efi_limit_regions(unsigned long long size) { unsigned long long current_addr = 0; + efi_memory_desc_t *md, *next_md; + void *p, *p1; + int i, j; + + j = 0; + p1 = memmap.map; + for (p = p1, i = 0; p < memmap.map_end; p += memmap.desc_size, i++) { + md = p; + next_md = p1; + current_addr = md->phys_addr + + PFN_PHYS(md->num_pages); + if (is_available_memory(md)) { + if (md->phys_addr >= size) continue; + memcpy(next_md, md, memmap.desc_size); + if (current_addr >= size) { + next_md->num_pages -= + PFN_UP(current_addr-size); + } + p1 += memmap.desc_size; + next_md = p1; + j++; + } else if ((md->attribute & EFI_MEMORY_RUNTIME) == + EFI_MEMORY_RUNTIME) { + /* In order to make runtime services + * available we have to include runtime + * memory regions in memory map */ + memcpy(next_md, md, memmap.desc_size); + p1 += memmap.desc_size; + next_md = p1; + j++; + } + } + memmap.nr_map = j; + memmap.map_end = memmap.map + + (memmap.nr_map * memmap.desc_size); +} + +void __init limit_regions(unsigned long long size) +{ + unsigned long long current_addr; int i; print_memory_map("limit_regions start"); if (efi_enabled) { - efi_memory_desc_t *md; - void *p; - - for (p = memmap.map, i = 0; p < memmap.map_end; - p += memmap.desc_size, i++) { - md = p; - current_addr = md->phys_addr + (md->num_pages << 12); - if (md->type == EFI_CONVENTIONAL_MEMORY) { - if (current_addr >= size) { - md->num_pages -= - (((current_addr-size) + PAGE_SIZE-1) >> PAGE_SHIFT); - memmap.nr_map = i + 1; - return; - } - } - } + efi_limit_regions(size); + return; } for (i = 0; i < e820.nr_map; i++) { current_addr = e820.map[i].addr + e820.map[i].size; diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c index 167416155ee4..f4dd048187ff 100644 --- a/arch/i386/mm/init.c +++ b/arch/i386/mm/init.c @@ -192,8 +192,6 @@ static inline int page_kills_ppro(unsigned long pagenr) return 0; } -extern int is_available_memory(efi_memory_desc_t *); - int page_is_ram(unsigned long pagenr) { int i; -- cgit v1.2.1 From 6df0532eef0187c293d3ab1d4c158f92e8f24f8a Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Thu, 7 Dec 2006 02:14:11 +0100 Subject: [PATCH] i386: remove duplicate printk We do the exact same printk about a dozen lines above with no intermediate printk's. Signed-off-by: Dave Jones Signed-off-by: Andi Kleen --- arch/i386/kernel/cpu/amd.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/cpu/amd.c b/arch/i386/kernel/cpu/amd.c index e4758095d87a..41cfea57232b 100644 --- a/arch/i386/kernel/cpu/amd.c +++ b/arch/i386/kernel/cpu/amd.c @@ -104,10 +104,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) f_vide(); rdtscl(d2); d = d2-d; - - /* Knock these two lines out if it debugs out ok */ - printk(KERN_INFO "AMD K6 stepping B detected - "); - /* -- cut here -- */ + if (d > 20*K6_BUG_LOOP) printk("system stability may be impaired when more than 32 MB are used.\n"); else -- cgit v1.2.1 From 0741f4d207a644482d7a040f05cd264c98cf7ee8 Mon Sep 17 00:00:00 2001 From: Chuck Ebbert <76306.1226@compuserve.com> Date: Thu, 7 Dec 2006 02:14:11 +0100 Subject: [PATCH] x86: add sysctl for kstack_depth_to_print Add sysctl for kstack_depth_to_print. This lets users change the amount of raw stack data printed in dump_stack() without having to reboot. Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com> Signed-off-by: Andi Kleen --- arch/i386/kernel/traps.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index 7b2f9f022089..1d48a75fa338 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -91,7 +91,7 @@ asmlinkage void alignment_check(void); asmlinkage void spurious_interrupt_bug(void); asmlinkage void machine_check(void); -static int kstack_depth_to_print = 24; +int kstack_depth_to_print = 24; #ifdef CONFIG_STACK_UNWIND static int call_trace = 1; #else -- cgit v1.2.1 From a36df98ab1cdd8a9e7daa4c1b5c48ffa2ad6ea09 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Thu, 7 Dec 2006 02:14:12 +0100 Subject: [PATCH] i386: touch softlockup during backtracing Sometimes the soft watchdog fires after we're done oopsing. See http://projects.info-pull.com/mokb/MOKB-25-11-2006.html for an example. AK: changed to touch_nmi_watchdog() Signed-off-by: Dave Jones Signed-off-by: Andi Kleen --- arch/i386/kernel/traps.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/i386') diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index 1d48a75fa338..86d8476be4fe 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -29,6 +29,7 @@ #include #include #include +#include #ifdef CONFIG_EISA #include @@ -248,6 +249,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, stack = (unsigned long*)context->previous_esp; if (!stack) break; + touch_nmi_watchdog(); } } EXPORT_SYMBOL(dump_trace); -- cgit v1.2.1 From a1a70c25bed75ed36ed48bbe18b9029428d2452d Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Thu, 7 Dec 2006 02:14:12 +0100 Subject: [PATCH] i386: always enable regparm -mregparm=3 has been enabled by default for some time on i386, and AFAIK there aren't any problems with it left. This patch removes the REGPARM config option and sets -mregparm=3 unconditionally. Signed-off-by: Adrian Bunk Signed-off-by: Andi Kleen --- arch/i386/Kconfig | 14 -------------- arch/i386/Makefile | 4 +--- 2 files changed, 1 insertion(+), 17 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index bb1fa061c6cf..b6b2df40ca78 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -721,20 +721,6 @@ config BOOT_IOREMAP depends on (((X86_SUMMIT || X86_GENERICARCH) && NUMA) || (X86 && EFI)) default y -config REGPARM - bool "Use register arguments" - default y - help - Compile the kernel with -mregparm=3. This instructs gcc to use - a more efficient function call ABI which passes the first three - arguments of a function call via registers, which results in denser - and faster code. - - If this option is disabled, then the default ABI of passing - arguments via the stack is used. - - If unsure, say Y. - config SECCOMP bool "Enable seccomp to safely compute untrusted bytecode" depends on PROC_FS diff --git a/arch/i386/Makefile b/arch/i386/Makefile index d1aca52bf690..f7ac1aea1d8a 100644 --- a/arch/i386/Makefile +++ b/arch/i386/Makefile @@ -31,7 +31,7 @@ LDFLAGS_vmlinux := --emit-relocs endif CHECKFLAGS += -D__i386__ -CFLAGS += -pipe -msoft-float +CFLAGS += -pipe -msoft-float -mregparm=3 # prevent gcc from keeping the stack 16 byte aligned CFLAGS += $(call cc-option,-mpreferred-stack-boundary=2) @@ -39,8 +39,6 @@ CFLAGS += $(call cc-option,-mpreferred-stack-boundary=2) # CPU-specific tuning. Anything which can be shared with UML should go here. include $(srctree)/arch/i386/Makefile.cpu -cflags-$(CONFIG_REGPARM) += -mregparm=3 - # temporary until string.h is fixed cflags-y += -ffreestanding -- cgit v1.2.1 From 359ad0d4015a9ab39243f2ebc4eb07915bd618b2 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 7 Dec 2006 02:14:13 +0100 Subject: [PATCH] unwinder: more sanity checks in Dwarf2 unwinder Tighten the requirements on both input to and output from the Dwarf2 unwinder. Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen --- arch/i386/kernel/traps.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/i386') diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index 86d8476be4fe..c447807e2a45 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -161,12 +161,19 @@ dump_trace_unwind(struct unwind_frame_info *info, void *data) { struct ops_and_data *oad = (struct ops_and_data *)data; int n = 0; + unsigned long sp = UNW_SP(info); + if (arch_unw_user_mode(info)) + return -1; while (unwind(info) == 0 && UNW_PC(info)) { n++; oad->ops->address(oad->data, UNW_PC(info)); if (arch_unw_user_mode(info)) break; + if ((sp & ~(PAGE_SIZE - 1)) == (UNW_SP(info) & ~(PAGE_SIZE - 1)) + && sp > UNW_SP(info)) + break; + sp = UNW_SP(info); } return n; } -- cgit v1.2.1 From d5d2448d896fbb9a427ee12eb8e5f6309f2473f7 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 7 Dec 2006 02:14:13 +0100 Subject: [PATCH] x86-64: Fix numaq build error CC arch/i386/boot/compressed/misc.o arch/i386/boot/compressed/misc.c:120: error: static declaration of 'xquad_portio' follows non-static declaration include/asm/io.h:275: error: previous declaration of 'xquad_portio' was here make[2]: *** [arch/i386/boot/compressed/misc.o] Error 1 Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- arch/i386/boot/compressed/misc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/i386') diff --git a/arch/i386/boot/compressed/misc.c b/arch/i386/boot/compressed/misc.c index c6798c75c67d..1ce7017fd627 100644 --- a/arch/i386/boot/compressed/misc.c +++ b/arch/i386/boot/compressed/misc.c @@ -196,7 +196,7 @@ static int vidport; static int lines, cols; #ifdef CONFIG_X86_NUMAQ -static void * xquad_portio = NULL; +void *xquad_portio; #endif #include "../../../../lib/inflate.c" -- cgit v1.2.1 From 9cfa5b5dfafcfe64c1a48906f243cdd302f82471 Mon Sep 17 00:00:00 2001 From: Burman Yan Date: Thu, 7 Dec 2006 02:14:13 +0100 Subject: [PATCH] x86-64: replace kmalloc+memset with kzalloc in MTRR code Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- arch/i386/kernel/cpu/mtrr/if.c | 3 +-- arch/i386/kernel/cpu/mtrr/main.c | 6 ++---- 2 files changed, 3 insertions(+), 6 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/cpu/mtrr/if.c b/arch/i386/kernel/cpu/mtrr/if.c index 9753bc6a1f3f..5ae1705eafa6 100644 --- a/arch/i386/kernel/cpu/mtrr/if.c +++ b/arch/i386/kernel/cpu/mtrr/if.c @@ -44,10 +44,9 @@ mtrr_file_add(unsigned long base, unsigned long size, max = num_var_ranges; if (fcount == NULL) { - fcount = kmalloc(max * sizeof *fcount, GFP_KERNEL); + fcount = kzalloc(max * sizeof *fcount, GFP_KERNEL); if (!fcount) return -ENOMEM; - memset(fcount, 0, max * sizeof *fcount); FILE_FCOUNT(file) = fcount; } if (!page) { diff --git a/arch/i386/kernel/cpu/mtrr/main.c b/arch/i386/kernel/cpu/mtrr/main.c index aeea23e8a050..16bb7ea87145 100644 --- a/arch/i386/kernel/cpu/mtrr/main.c +++ b/arch/i386/kernel/cpu/mtrr/main.c @@ -596,10 +596,8 @@ static int mtrr_save(struct sys_device * sysdev, pm_message_t state) int i; int size = num_var_ranges * sizeof(struct mtrr_value); - mtrr_state = kmalloc(size,GFP_ATOMIC); - if (mtrr_state) - memset(mtrr_state,0,size); - else + mtrr_state = kzalloc(size,GFP_ATOMIC); + if (!mtrr_state) return -ENOMEM; for (i = 0; i < num_var_ranges; i++) { -- cgit v1.2.1 From f475ff352c5e05d473c462b97c3a13a5b803af5a Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 7 Dec 2006 02:14:13 +0100 Subject: [PATCH] x86-64: remove unused variable Remove unused variable in msr_write(). Reported by D Binderman . Cc: H. Peter Anvin Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- arch/i386/kernel/msr.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/msr.c b/arch/i386/kernel/msr.c index a773f776c9ea..fd45059c9084 100644 --- a/arch/i386/kernel/msr.c +++ b/arch/i386/kernel/msr.c @@ -195,7 +195,6 @@ static ssize_t msr_write(struct file *file, const char __user *buf, { const u32 __user *tmp = (const u32 __user *)buf; u32 data[2]; - size_t rv; u32 reg = *ppos; int cpu = iminor(file->f_dentry->d_inode); int err; @@ -203,7 +202,7 @@ static ssize_t msr_write(struct file *file, const char __user *buf, if (count % 8) return -EINVAL; /* Invalid chunk size */ - for (rv = 0; count; count -= 8) { + for (; count; count -= 8) { if (copy_from_user(&data, tmp, 8)) return -EFAULT; err = do_wrmsr(cpu, reg, data[0], data[1]); -- cgit v1.2.1 From d7fb02712818643bab79a6b3cb8270a747d0227b Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Thu, 7 Dec 2006 02:14:19 +0100 Subject: [PATCH] x86-64: remove remaining pc98 code Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- arch/i386/kernel/io_apic.c | 23 ++--------------------- arch/i386/kernel/mpparse.c | 2 -- 2 files changed, 2 insertions(+), 23 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c index 56f571c9fc0d..7f015a71ab55 100644 --- a/arch/i386/kernel/io_apic.c +++ b/arch/i386/kernel/io_apic.c @@ -842,8 +842,7 @@ static int __init find_isa_irq_pin(int irq, int type) if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || mp_bus_id_to_type[lbus] == MP_BUS_EISA || - mp_bus_id_to_type[lbus] == MP_BUS_MCA || - mp_bus_id_to_type[lbus] == MP_BUS_NEC98 + mp_bus_id_to_type[lbus] == MP_BUS_MCA ) && (mp_irqs[i].mpc_irqtype == type) && (mp_irqs[i].mpc_srcbusirq == irq)) @@ -862,8 +861,7 @@ static int __init find_isa_irq_apic(int irq, int type) if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || mp_bus_id_to_type[lbus] == MP_BUS_EISA || - mp_bus_id_to_type[lbus] == MP_BUS_MCA || - mp_bus_id_to_type[lbus] == MP_BUS_NEC98 + mp_bus_id_to_type[lbus] == MP_BUS_MCA ) && (mp_irqs[i].mpc_irqtype == type) && (mp_irqs[i].mpc_srcbusirq == irq)) @@ -993,12 +991,6 @@ static int EISA_ELCR(unsigned int irq) #define default_MCA_trigger(idx) (1) #define default_MCA_polarity(idx) (0) -/* NEC98 interrupts are always polarity zero edge triggered, - * when listed as conforming in the MP table. */ - -#define default_NEC98_trigger(idx) (0) -#define default_NEC98_polarity(idx) (0) - static int __init MPBIOS_polarity(int idx) { int bus = mp_irqs[idx].mpc_srcbus; @@ -1033,11 +1025,6 @@ static int __init MPBIOS_polarity(int idx) polarity = default_MCA_polarity(idx); break; } - case MP_BUS_NEC98: /* NEC 98 pin */ - { - polarity = default_NEC98_polarity(idx); - break; - } default: { printk(KERN_WARNING "broken BIOS!!\n"); @@ -1107,11 +1094,6 @@ static int MPBIOS_trigger(int idx) trigger = default_MCA_trigger(idx); break; } - case MP_BUS_NEC98: /* NEC 98 pin */ - { - trigger = default_NEC98_trigger(idx); - break; - } default: { printk(KERN_WARNING "broken BIOS!!\n"); @@ -1173,7 +1155,6 @@ static int pin_2_irq(int idx, int apic, int pin) case MP_BUS_ISA: /* ISA pin */ case MP_BUS_EISA: case MP_BUS_MCA: - case MP_BUS_NEC98: { irq = mp_irqs[idx].mpc_srcbusirq; break; diff --git a/arch/i386/kernel/mpparse.c b/arch/i386/kernel/mpparse.c index 442aaf8c77eb..2ce67228dff8 100644 --- a/arch/i386/kernel/mpparse.c +++ b/arch/i386/kernel/mpparse.c @@ -249,8 +249,6 @@ static void __init MP_bus_info (struct mpc_config_bus *m) mp_current_pci_id++; } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA)-1) == 0) { mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA; - } else if (strncmp(str, BUSTYPE_NEC98, sizeof(BUSTYPE_NEC98)-1) == 0) { - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_NEC98; } else { printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str); } -- cgit v1.2.1 From 116780fc04d9f6cd3ceeab0251681f1dfda53367 Mon Sep 17 00:00:00 2001 From: Burman Yan Date: Thu, 7 Dec 2006 02:14:19 +0100 Subject: [PATCH] i386: replace kmalloc+memset with kzalloc Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- arch/i386/kernel/cpu/intel_cacheinfo.c | 11 +++-------- arch/i386/kernel/mca.c | 13 ++++--------- arch/i386/kernel/pci-dma.c | 6 ++---- arch/i386/mach-voyager/voyager_cat.c | 6 ++---- 4 files changed, 11 insertions(+), 25 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/cpu/intel_cacheinfo.c b/arch/i386/kernel/cpu/intel_cacheinfo.c index 5c43be47587f..80b4c5d421b1 100644 --- a/arch/i386/kernel/cpu/intel_cacheinfo.c +++ b/arch/i386/kernel/cpu/intel_cacheinfo.c @@ -480,12 +480,10 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu) if (num_cache_leaves == 0) return -ENOENT; - cpuid4_info[cpu] = kmalloc( + cpuid4_info[cpu] = kzalloc( sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL); if (unlikely(cpuid4_info[cpu] == NULL)) return -ENOMEM; - memset(cpuid4_info[cpu], 0, - sizeof(struct _cpuid4_info) * num_cache_leaves); oldmask = current->cpus_allowed; retval = set_cpus_allowed(current, cpumask_of_cpu(cpu)); @@ -658,17 +656,14 @@ static int __cpuinit cpuid4_cache_sysfs_init(unsigned int cpu) return -ENOENT; /* Allocate all required memory */ - cache_kobject[cpu] = kmalloc(sizeof(struct kobject), GFP_KERNEL); + cache_kobject[cpu] = kzalloc(sizeof(struct kobject), GFP_KERNEL); if (unlikely(cache_kobject[cpu] == NULL)) goto err_out; - memset(cache_kobject[cpu], 0, sizeof(struct kobject)); - index_kobject[cpu] = kmalloc( + index_kobject[cpu] = kzalloc( sizeof(struct _index_kobject ) * num_cache_leaves, GFP_KERNEL); if (unlikely(index_kobject[cpu] == NULL)) goto err_out; - memset(index_kobject[cpu], 0, - sizeof(struct _index_kobject) * num_cache_leaves); return 0; diff --git a/arch/i386/kernel/mca.c b/arch/i386/kernel/mca.c index eb57a851789d..b83672b89527 100644 --- a/arch/i386/kernel/mca.c +++ b/arch/i386/kernel/mca.c @@ -283,10 +283,9 @@ static int __init mca_init(void) bus->f.mca_transform_memory = mca_dummy_transform_memory; /* get the motherboard device */ - mca_dev = kmalloc(sizeof(struct mca_device), GFP_KERNEL); + mca_dev = kzalloc(sizeof(struct mca_device), GFP_KERNEL); if(unlikely(!mca_dev)) goto out_nomem; - memset(mca_dev, 0, sizeof(struct mca_device)); /* * We do not expect many MCA interrupts during initialization, @@ -310,11 +309,9 @@ static int __init mca_init(void) mca_dev->slot = MCA_MOTHERBOARD; mca_register_device(MCA_PRIMARY_BUS, mca_dev); - mca_dev = kmalloc(sizeof(struct mca_device), GFP_ATOMIC); + mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC); if(unlikely(!mca_dev)) goto out_unlock_nomem; - memset(mca_dev, 0, sizeof(struct mca_device)); - /* Put motherboard into video setup mode, read integrated video * POS registers, and turn motherboard setup off. @@ -349,10 +346,9 @@ static int __init mca_init(void) } if(which_scsi) { /* found a scsi card */ - mca_dev = kmalloc(sizeof(struct mca_device), GFP_ATOMIC); + mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC); if(unlikely(!mca_dev)) goto out_unlock_nomem; - memset(mca_dev, 0, sizeof(struct mca_device)); for(j = 0; j < 8; j++) mca_dev->pos[j] = pos[j]; @@ -378,10 +374,9 @@ static int __init mca_init(void) if(!mca_read_and_store_pos(pos)) continue; - mca_dev = kmalloc(sizeof(struct mca_device), GFP_ATOMIC); + mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC); if(unlikely(!mca_dev)) goto out_unlock_nomem; - memset(mca_dev, 0, sizeof(struct mca_device)); for(j=0; j<8; j++) mca_dev->pos[j]=pos[j]; diff --git a/arch/i386/kernel/pci-dma.c b/arch/i386/kernel/pci-dma.c index 5c8c6ef1fc5e..41af692c1584 100644 --- a/arch/i386/kernel/pci-dma.c +++ b/arch/i386/kernel/pci-dma.c @@ -92,14 +92,12 @@ int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr, if (!mem_base) goto out; - dev->dma_mem = kmalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL); + dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL); if (!dev->dma_mem) goto out; - memset(dev->dma_mem, 0, sizeof(struct dma_coherent_mem)); - dev->dma_mem->bitmap = kmalloc(bitmap_size, GFP_KERNEL); + dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL); if (!dev->dma_mem->bitmap) goto free1_out; - memset(dev->dma_mem->bitmap, 0, bitmap_size); dev->dma_mem->virt_base = mem_base; dev->dma_mem->device_base = device_addr; diff --git a/arch/i386/mach-voyager/voyager_cat.c b/arch/i386/mach-voyager/voyager_cat.c index f50c6c6ad680..943a9473b138 100644 --- a/arch/i386/mach-voyager/voyager_cat.c +++ b/arch/i386/mach-voyager/voyager_cat.c @@ -776,7 +776,7 @@ voyager_cat_init(void) for(asic=0; asic < (*modpp)->num_asics; asic++) { int j; voyager_asic_t *asicp = *asicpp - = kmalloc(sizeof(voyager_asic_t), GFP_KERNEL); /*&voyager_asic_storage[asic_count++];*/ + = kzalloc(sizeof(voyager_asic_t), GFP_KERNEL); /*&voyager_asic_storage[asic_count++];*/ voyager_sp_table_t *sp_table; voyager_at_t *asic_table; voyager_jtt_t *jtag_table; @@ -785,7 +785,6 @@ voyager_cat_init(void) printk("**WARNING** kmalloc failure in cat_init\n"); continue; } - memset(asicp, 0, sizeof(voyager_asic_t)); asicpp = &(asicp->next); asicp->asic_location = asic; sp_table = (voyager_sp_table_t *)(eprom_buf + sp_offset); @@ -851,8 +850,7 @@ voyager_cat_init(void) #endif { - struct resource *res = kmalloc(sizeof(struct resource),GFP_KERNEL); - memset(res, 0, sizeof(struct resource)); + struct resource *res = kzalloc(sizeof(struct resource),GFP_KERNEL); res->name = kmalloc(128, GFP_KERNEL); sprintf((char *)res->name, "Voyager %s Quad CPI", cat_module_name(i)); res->start = qic_addr; -- cgit v1.2.1 From f6ca8083c261864fc9de94ef99c3311ea259c5c3 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 7 Dec 2006 02:14:19 +0100 Subject: [PATCH] x86-64: Make ix86 default to HIGHMEM4G instead of NOHIGHMEM Generally better for allmodconfig coverage. Signed-off-by: Randy Dunlap Signed-off-by: Andi Kleen Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/i386/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/i386') diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index b6b2df40ca78..ea70359b02d0 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -454,7 +454,8 @@ source "drivers/firmware/Kconfig" choice prompt "High Memory Support" - default NOHIGHMEM + default HIGHMEM4G if !X86_NUMAQ + default HIGHMEM64G if X86_NUMAQ config NOHIGHMEM bool "off" -- cgit v1.2.1 From 6bedb2ccb02dcc70ffc8eb76df71c746378190ad Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 7 Dec 2006 02:14:19 +0100 Subject: [PATCH] x86-64: don't use set_irq_regs() We don't need to setup _irq_regs in smp_xxx_interrupt (except apic timer). These handlers run with irqs disabled and do not call functions which need "struct pt_regs". Signed-off-by: Oleg Nesterov Signed-off-by: Andi Kleen Acked-by: Ingo Molnar Cc: Andi Kleen Acked-By: David Howells Signed-off-by: Andrew Morton --- arch/i386/kernel/smp.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c index 31e5c6573aae..1b080ab8a49f 100644 --- a/arch/i386/kernel/smp.c +++ b/arch/i386/kernel/smp.c @@ -321,7 +321,6 @@ static inline void leave_mm (unsigned long cpu) fastcall void smp_invalidate_interrupt(struct pt_regs *regs) { - struct pt_regs *old_regs = set_irq_regs(regs); unsigned long cpu; cpu = get_cpu(); @@ -352,7 +351,6 @@ fastcall void smp_invalidate_interrupt(struct pt_regs *regs) smp_mb__after_clear_bit(); out: put_cpu_no_resched(); - set_irq_regs(old_regs); } static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, @@ -607,14 +605,11 @@ void smp_send_stop(void) */ fastcall void smp_reschedule_interrupt(struct pt_regs *regs) { - struct pt_regs *old_regs = set_irq_regs(regs); ack_APIC_irq(); - set_irq_regs(old_regs); } fastcall void smp_call_function_interrupt(struct pt_regs *regs) { - struct pt_regs *old_regs = set_irq_regs(regs); void (*func) (void *info) = call_data->func; void *info = call_data->info; int wait = call_data->wait; @@ -637,7 +632,6 @@ fastcall void smp_call_function_interrupt(struct pt_regs *regs) mb(); atomic_inc(&call_data->finished); } - set_irq_regs(old_regs); } /* -- cgit v1.2.1 From b65780e123ba9b762276482bbfb52836e4d41fd9 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 7 Dec 2006 02:14:19 +0100 Subject: [PATCH] unwinder: move .eh_frame to RODATA The .eh_frame section contents is never written to, so it can as well benefit from CONFIG_DEBUG_RODATA. Diff-ed against firstfloor tree. Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen --- arch/i386/kernel/vmlinux.lds.S | 9 --------- 1 file changed, 9 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S index 25581e87c60d..56e6ad5cb045 100644 --- a/arch/i386/kernel/vmlinux.lds.S +++ b/arch/i386/kernel/vmlinux.lds.S @@ -102,15 +102,6 @@ SECTIONS _edata = .; /* End of data section */ } -#ifdef CONFIG_STACK_UNWIND - . = ALIGN(4); - .eh_frame : AT(ADDR(.eh_frame) - LOAD_OFFSET) { - __start_unwind = .; - *(.eh_frame) - __end_unwind = .; - } -#endif - . = ALIGN(THREAD_SIZE); /* init_task */ .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { *(.data.init_task) -- cgit v1.2.1 From d9408cefe677636bc1c100fdcfac0b2ab9ff87bf Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Thu, 7 Dec 2006 02:14:19 +0100 Subject: [PATCH] i386: Clean up smp_tune_scheduling() - remove the write-only local variable "bandwidth" - don't set "max_cache_size" in the (cachesize < 0) case: that's already handled in kernel/sched.c:measure_migration_cost() Signed-off-by: Adrian Bunk Signed-off-by: Andi Kleen Acked-by: Ingo Molnar Signed-off-by: Andrew Morton --- arch/i386/kernel/smpboot.c | 29 +++++------------------------ 1 file changed, 5 insertions(+), 24 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index 346f27f4c79f..b4e6f32de453 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -1130,34 +1130,15 @@ exit: } #endif -static void smp_tune_scheduling (void) +static void smp_tune_scheduling(void) { unsigned long cachesize; /* kB */ - unsigned long bandwidth = 350; /* MB/s */ - /* - * Rough estimation for SMP scheduling, this is the number of - * cycles it takes for a fully memory-limited process to flush - * the SMP-local cache. - * - * (For a P5 this pretty much means we will choose another idle - * CPU almost always at wakeup time (this is due to the small - * L1 cache), on PIIs it's around 50-100 usecs, depending on - * the cache size) - */ - if (!cpu_khz) { - /* - * this basically disables processor-affinity - * scheduling on SMP without a TSC. - */ - return; - } else { + if (cpu_khz) { cachesize = boot_cpu_data.x86_cache_size; - if (cachesize == -1) { - cachesize = 16; /* Pentiums, 2x8kB cache */ - bandwidth = 100; - } - max_cache_size = cachesize * 1024; + + if (cachesize > 0) + max_cache_size = cachesize * 1024; } } -- cgit v1.2.1 From 39dde65c9940c97fcd178a3d2b1c57ed8b7b68aa Mon Sep 17 00:00:00 2001 From: "Chen, Kenneth W" Date: Wed, 6 Dec 2006 20:32:03 -0800 Subject: [PATCH] shared page table for hugetlb page Following up with the work on shared page table done by Dave McCracken. This set of patch target shared page table for hugetlb memory only. The shared page table is particular useful in the situation of large number of independent processes sharing large shared memory segments. In the normal page case, the amount of memory saved from process' page table is quite significant. For hugetlb, the saving on page table memory is not the primary objective (as hugetlb itself already cuts down page table overhead significantly), instead, the purpose of using shared page table on hugetlb is to allow faster TLB refill and smaller cache pollution upon TLB miss. With PT sharing, pte entries are shared among hundreds of processes, the cache consumption used by all the page table is smaller and in return, application gets much higher cache hit ratio. One other effect is that cache hit ratio with hardware page walker hitting on pte in cache will be higher and this helps to reduce tlb miss latency. These two effects contribute to higher application performance. Signed-off-by: Ken Chen Acked-by: Hugh Dickins Cc: Dave McCracken Cc: William Lee Irwin III Cc: "Luck, Tony" Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Cc: David Gibson Cc: Adam Litke Cc: Paul Mundt Cc: "David S. Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/mm/hugetlbpage.c | 112 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 111 insertions(+), 1 deletion(-) (limited to 'arch/i386') diff --git a/arch/i386/mm/hugetlbpage.c b/arch/i386/mm/hugetlbpage.c index 1719a8141f81..34728e4afe48 100644 --- a/arch/i386/mm/hugetlbpage.c +++ b/arch/i386/mm/hugetlbpage.c @@ -17,6 +17,113 @@ #include #include +static unsigned long page_table_shareable(struct vm_area_struct *svma, + struct vm_area_struct *vma, + unsigned long addr, pgoff_t idx) +{ + unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) + + svma->vm_start; + unsigned long sbase = saddr & PUD_MASK; + unsigned long s_end = sbase + PUD_SIZE; + + /* + * match the virtual addresses, permission and the alignment of the + * page table page. + */ + if (pmd_index(addr) != pmd_index(saddr) || + vma->vm_flags != svma->vm_flags || + sbase < svma->vm_start || svma->vm_end < s_end) + return 0; + + return saddr; +} + +static int vma_shareable(struct vm_area_struct *vma, unsigned long addr) +{ + unsigned long base = addr & PUD_MASK; + unsigned long end = base + PUD_SIZE; + + /* + * check on proper vm_flags and page table alignment + */ + if (vma->vm_flags & VM_MAYSHARE && + vma->vm_start <= base && end <= vma->vm_end) + return 1; + return 0; +} + +/* + * search for a shareable pmd page for hugetlb. + */ +static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) +{ + struct vm_area_struct *vma = find_vma(mm, addr); + struct address_space *mapping = vma->vm_file->f_mapping; + pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + + vma->vm_pgoff; + struct prio_tree_iter iter; + struct vm_area_struct *svma; + unsigned long saddr; + pte_t *spte = NULL; + + if (!vma_shareable(vma, addr)) + return; + + spin_lock(&mapping->i_mmap_lock); + vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) { + if (svma == vma) + continue; + + saddr = page_table_shareable(svma, vma, addr, idx); + if (saddr) { + spte = huge_pte_offset(svma->vm_mm, saddr); + if (spte) { + get_page(virt_to_page(spte)); + break; + } + } + } + + if (!spte) + goto out; + + spin_lock(&mm->page_table_lock); + if (pud_none(*pud)) + pud_populate(mm, pud, (unsigned long) spte & PAGE_MASK); + else + put_page(virt_to_page(spte)); + spin_unlock(&mm->page_table_lock); +out: + spin_unlock(&mapping->i_mmap_lock); +} + +/* + * unmap huge page backed by shared pte. + * + * Hugetlb pte page is ref counted at the time of mapping. If pte is shared + * indicated by page_count > 1, unmap is achieved by clearing pud and + * decrementing the ref count. If count == 1, the pte page is not shared. + * + * called with vma->vm_mm->page_table_lock held. + * + * returns: 1 successfully unmapped a shared pte page + * 0 the underlying pte page is not shared, or it is the last user + */ +int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) +{ + pgd_t *pgd = pgd_offset(mm, *addr); + pud_t *pud = pud_offset(pgd, *addr); + + BUG_ON(page_count(virt_to_page(ptep)) == 0); + if (page_count(virt_to_page(ptep)) == 1) + return 0; + + pud_clear(pud); + put_page(virt_to_page(ptep)); + *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE; + return 1; +} + pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; @@ -25,8 +132,11 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) pgd = pgd_offset(mm, addr); pud = pud_alloc(mm, pgd, addr); - if (pud) + if (pud) { + if (pud_none(*pud)) + huge_pmd_share(mm, addr, pud); pte = (pte_t *) pmd_alloc(mm, pud, addr); + } BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte)); return pte; -- cgit v1.2.1 From a866374aecc90c7d90619727ccd851ac096b2fc7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 6 Dec 2006 20:32:20 -0800 Subject: [PATCH] mm: pagefault_{disable,enable}() Introduce pagefault_{disable,enable}() and use these where previously we did manual preempt increments/decrements to make the pagefault handler do the atomic thing. Currently they still rely on the increased preempt count, but do not rely on the disabled preemption, this might go away in the future. (NOTE: the extra barrier() in pagefault_disable might fix some holes on machines which have too many registers for their own good) [heiko.carstens@de.ibm.com: s390 fix] Signed-off-by: Peter Zijlstra Acked-by: Nick Piggin Cc: Martin Schwidefsky Signed-off-by: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/mm/highmem.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/mm/highmem.c b/arch/i386/mm/highmem.c index f9f647cdbc7b..178bbfe6cbac 100644 --- a/arch/i386/mm/highmem.c +++ b/arch/i386/mm/highmem.c @@ -32,7 +32,7 @@ void *kmap_atomic(struct page *page, enum km_type type) unsigned long vaddr; /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ - inc_preempt_count(); + pagefault_disable(); if (!PageHighMem(page)) return page_address(page); @@ -52,8 +52,7 @@ void kunmap_atomic(void *kvaddr, enum km_type type) #ifdef CONFIG_DEBUG_HIGHMEM if (vaddr >= PAGE_OFFSET && vaddr < (unsigned long)high_memory) { - dec_preempt_count(); - preempt_check_resched(); + pagefault_enable(); return; } @@ -68,8 +67,7 @@ void kunmap_atomic(void *kvaddr, enum km_type type) */ kpte_clear_flush(kmap_pte-idx, vaddr); - dec_preempt_count(); - preempt_check_resched(); + pagefault_enable(); } /* This is the same as kmap_atomic() but can map memory that doesn't @@ -80,7 +78,7 @@ void *kmap_atomic_pfn(unsigned long pfn, enum km_type type) enum fixed_addresses idx; unsigned long vaddr; - inc_preempt_count(); + pagefault_disable(); idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); -- cgit v1.2.1 From 3b17979bda74493633364c2c263b452b7788e350 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 6 Dec 2006 20:32:22 -0800 Subject: [PATCH] Fix kunmap_atomic's use of kpte_clear_flush() kunmap_atomic() will call kpte_clear_flush with vaddr/ptep arguments which don't correspond if the vaddr is just a normal lowmem address (ie, not in the KMAP area). This patch makes sure that the pte is only cleared if kmap area was actually used for the mapping. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Rusty Russell Cc: Zachary Amsden Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/mm/highmem.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/mm/highmem.c b/arch/i386/mm/highmem.c index 178bbfe6cbac..e0fa6cb655a8 100644 --- a/arch/i386/mm/highmem.c +++ b/arch/i386/mm/highmem.c @@ -50,22 +50,20 @@ void kunmap_atomic(void *kvaddr, enum km_type type) unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); -#ifdef CONFIG_DEBUG_HIGHMEM - if (vaddr >= PAGE_OFFSET && vaddr < (unsigned long)high_memory) { - pagefault_enable(); - return; - } - - if (vaddr != __fix_to_virt(FIX_KMAP_BEGIN+idx)) - BUG(); -#endif /* * Force other mappings to Oops if they'll try to access this pte * without first remap it. Keeping stale mappings around is a bad idea * also, in case the page changes cacheability attributes or becomes * a protected page in a hypervisor. */ - kpte_clear_flush(kmap_pte-idx, vaddr); + if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx)) + kpte_clear_flush(kmap_pte-idx, vaddr); + else { +#ifdef CONFIG_DEBUG_HIGHMEM + BUG_ON(vaddr < PAGE_OFFSET); + BUG_ON(vaddr >= (unsigned long)high_memory); +#endif + } pagefault_enable(); } -- cgit v1.2.1 From a120586873d3d64de93bd6d593d237e131994e58 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Wed, 6 Dec 2006 20:32:37 -0800 Subject: [PATCH] Allow NULL pointers in percpu_free The patch (as824b) makes percpu_free() ignore NULL arguments, as one would expect for a deallocation routine. (Note that free_percpu is #defined as percpu_free in include/linux/percpu.h.) A few callers are updated to remove now-unneeded tests for NULL. A few other callers already seem to assume that passing a NULL pointer to percpu_free() is okay! The patch also removes an unnecessary NULL check in percpu_depopulate(). Signed-off-by: Alan Stern Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/acpi/cstate.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/acpi/cstate.c b/arch/i386/kernel/acpi/cstate.c index 4664b55f623e..12e937c1ce4b 100644 --- a/arch/i386/kernel/acpi/cstate.c +++ b/arch/i386/kernel/acpi/cstate.c @@ -156,10 +156,8 @@ static int __init ffh_cstate_init(void) static void __exit ffh_cstate_exit(void) { - if (cpu_cstate_entry) { - free_percpu(cpu_cstate_entry); - cpu_cstate_entry = NULL; - } + free_percpu(cpu_cstate_entry); + cpu_cstate_entry = NULL; } arch_initcall(ffh_cstate_init); -- cgit v1.2.1 From e94b1766097d53e6f3ccfb36c8baa562ffeda3fc Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 6 Dec 2006 20:33:17 -0800 Subject: [PATCH] slab: remove SLAB_KERNEL SLAB_KERNEL is an alias of GFP_KERNEL. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/sysenter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/sysenter.c b/arch/i386/kernel/sysenter.c index 713ba39d32c6..0bbacd0ec175 100644 --- a/arch/i386/kernel/sysenter.c +++ b/arch/i386/kernel/sysenter.c @@ -132,7 +132,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) goto up_fail; } - vma = kmem_cache_zalloc(vm_area_cachep, SLAB_KERNEL); + vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); if (!vma) { ret = -ENOMEM; goto up_fail; -- cgit v1.2.1 From e18b890bb0881bbab6f4f1a6cd20d9c60d66b003 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 6 Dec 2006 20:33:20 -0800 Subject: [PATCH] slab: remove kmem_cache_t Replace all uses of kmem_cache_t with struct kmem_cache. The patch was generated using the following script: #!/bin/sh # # Replace one string by another in all the kernel sources. # set -e for file in `find * -name "*.c" -o -name "*.h"|xargs grep -l $1`; do quilt add $file sed -e "1,\$s/$1/$2/g" $file >/tmp/$$ mv /tmp/$$ $file quilt refresh done The script was run like this sh replace kmem_cache_t "struct kmem_cache" Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/mm/init.c | 4 ++-- arch/i386/mm/pgtable.c | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c index 167416155ee4..a6a8e397dd88 100644 --- a/arch/i386/mm/init.c +++ b/arch/i386/mm/init.c @@ -699,8 +699,8 @@ int remove_memory(u64 start, u64 size) #endif #endif -kmem_cache_t *pgd_cache; -kmem_cache_t *pmd_cache; +struct kmem_cache *pgd_cache; +struct kmem_cache *pmd_cache; void __init pgtable_cache_init(void) { diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c index 10126e3f8174..33be236fc6af 100644 --- a/arch/i386/mm/pgtable.c +++ b/arch/i386/mm/pgtable.c @@ -193,7 +193,7 @@ struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) return pte; } -void pmd_ctor(void *pmd, kmem_cache_t *cache, unsigned long flags) +void pmd_ctor(void *pmd, struct kmem_cache *cache, unsigned long flags) { memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t)); } @@ -233,7 +233,7 @@ static inline void pgd_list_del(pgd_t *pgd) set_page_private(next, (unsigned long)pprev); } -void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused) +void pgd_ctor(void *pgd, struct kmem_cache *cache, unsigned long unused) { unsigned long flags; @@ -253,7 +253,7 @@ void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused) } /* never called when PTRS_PER_PMD > 1 */ -void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused) +void pgd_dtor(void *pgd, struct kmem_cache *cache, unsigned long unused) { unsigned long flags; /* can be called from interrupt context */ -- cgit v1.2.1 From 7dfb71030f7636a0d65200158113c37764552f93 Mon Sep 17 00:00:00 2001 From: Nigel Cunningham Date: Wed, 6 Dec 2006 20:34:23 -0800 Subject: [PATCH] Add include/linux/freezer.h and move definitions from sched.h Move process freezing functions from include/linux/sched.h to freezer.h, so that modifications to the freezer or the kernel configuration don't require recompiling just about everything. [akpm@osdl.org: fix ueagle driver] Signed-off-by: Nigel Cunningham Cc: "Rafael J. Wysocki" Cc: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/io_apic.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/i386') diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c index 3b7a63e0ed1a..44c5a3206b2a 100644 --- a/arch/i386/kernel/io_apic.c +++ b/arch/i386/kernel/io_apic.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include -- cgit v1.2.1 From 2d4a34c9365c6e3f94a5b26ce296e1fce9b66c8b Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 6 Dec 2006 20:34:29 -0800 Subject: [PATCH] swsusp: Support i386 systems with PAE or without PSE Make swsusp support i386 systems with PAE or without PSE. This is done by creating temporary page tables located in resume-safe page frames before the suspend image is restored in the same way as x86_64 does it. Signed-off-by: Rafael J. Wysocki Cc: Andi Kleen Cc: Dave Jones Cc: Nigel Cunningham Cc: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/power/Makefile | 2 +- arch/i386/power/suspend.c | 158 ++++++++++++++++++++++++++++++++++++++++++++++ arch/i386/power/swsusp.S | 9 ++- 3 files changed, 166 insertions(+), 3 deletions(-) create mode 100644 arch/i386/power/suspend.c (limited to 'arch/i386') diff --git a/arch/i386/power/Makefile b/arch/i386/power/Makefile index 8cfa4e8a719d..2de7bbf03cd7 100644 --- a/arch/i386/power/Makefile +++ b/arch/i386/power/Makefile @@ -1,2 +1,2 @@ obj-$(CONFIG_PM) += cpu.o -obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o +obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o suspend.o diff --git a/arch/i386/power/suspend.c b/arch/i386/power/suspend.c new file mode 100644 index 000000000000..db5e98d2eb73 --- /dev/null +++ b/arch/i386/power/suspend.c @@ -0,0 +1,158 @@ +/* + * Suspend support specific for i386 - temporary page tables + * + * Distribute under GPLv2 + * + * Copyright (c) 2006 Rafael J. Wysocki + */ + +#include +#include + +#include +#include +#include + +/* Defined in arch/i386/power/swsusp.S */ +extern int restore_image(void); + +/* Pointer to the temporary resume page tables */ +pgd_t *resume_pg_dir; + +/* The following three functions are based on the analogous code in + * arch/i386/mm/init.c + */ + +/* + * Create a middle page table on a resume-safe page and put a pointer to it in + * the given global directory entry. This only returns the gd entry + * in non-PAE compilation mode, since the middle layer is folded. + */ +static pmd_t *resume_one_md_table_init(pgd_t *pgd) +{ + pud_t *pud; + pmd_t *pmd_table; + +#ifdef CONFIG_X86_PAE + pmd_table = (pmd_t *)get_safe_page(GFP_ATOMIC); + if (!pmd_table) + return NULL; + + set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); + pud = pud_offset(pgd, 0); + + BUG_ON(pmd_table != pmd_offset(pud, 0)); +#else + pud = pud_offset(pgd, 0); + pmd_table = pmd_offset(pud, 0); +#endif + + return pmd_table; +} + +/* + * Create a page table on a resume-safe page and place a pointer to it in + * a middle page directory entry. + */ +static pte_t *resume_one_page_table_init(pmd_t *pmd) +{ + if (pmd_none(*pmd)) { + pte_t *page_table = (pte_t *)get_safe_page(GFP_ATOMIC); + if (!page_table) + return NULL; + + set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); + + BUG_ON(page_table != pte_offset_kernel(pmd, 0)); + + return page_table; + } + + return pte_offset_kernel(pmd, 0); +} + +/* + * This maps the physical memory to kernel virtual address space, a total + * of max_low_pfn pages, by creating page tables starting from address + * PAGE_OFFSET. The page tables are allocated out of resume-safe pages. + */ +static int resume_physical_mapping_init(pgd_t *pgd_base) +{ + unsigned long pfn; + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + int pgd_idx, pmd_idx; + + pgd_idx = pgd_index(PAGE_OFFSET); + pgd = pgd_base + pgd_idx; + pfn = 0; + + for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) { + pmd = resume_one_md_table_init(pgd); + if (!pmd) + return -ENOMEM; + + if (pfn >= max_low_pfn) + continue; + + for (pmd_idx = 0; pmd_idx < PTRS_PER_PMD; pmd++, pmd_idx++) { + if (pfn >= max_low_pfn) + break; + + /* Map with big pages if possible, otherwise create + * normal page tables. + * NOTE: We can mark everything as executable here + */ + if (cpu_has_pse) { + set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC)); + pfn += PTRS_PER_PTE; + } else { + pte_t *max_pte; + + pte = resume_one_page_table_init(pmd); + if (!pte) + return -ENOMEM; + + max_pte = pte + PTRS_PER_PTE; + for (; pte < max_pte; pte++, pfn++) { + if (pfn >= max_low_pfn) + break; + + set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC)); + } + } + } + } + return 0; +} + +static inline void resume_init_first_level_page_table(pgd_t *pg_dir) +{ +#ifdef CONFIG_X86_PAE + int i; + + /* Init entries of the first-level page table to the zero page */ + for (i = 0; i < PTRS_PER_PGD; i++) + set_pgd(pg_dir + i, + __pgd(__pa(empty_zero_page) | _PAGE_PRESENT)); +#endif +} + +int swsusp_arch_resume(void) +{ + int error; + + resume_pg_dir = (pgd_t *)get_safe_page(GFP_ATOMIC); + if (!resume_pg_dir) + return -ENOMEM; + + resume_init_first_level_page_table(resume_pg_dir); + error = resume_physical_mapping_init(resume_pg_dir); + if (error) + return error; + + /* We have got enough memory and from now on we cannot recover */ + restore_image(); + return 0; +} diff --git a/arch/i386/power/swsusp.S b/arch/i386/power/swsusp.S index 8a2b50a0aaad..53662e05b393 100644 --- a/arch/i386/power/swsusp.S +++ b/arch/i386/power/swsusp.S @@ -28,8 +28,9 @@ ENTRY(swsusp_arch_suspend) call swsusp_save ret -ENTRY(swsusp_arch_resume) - movl $swsusp_pg_dir-__PAGE_OFFSET, %ecx +ENTRY(restore_image) + movl resume_pg_dir, %ecx + subl $__PAGE_OFFSET, %ecx movl %ecx, %cr3 movl restore_pblist, %edx @@ -51,6 +52,10 @@ copy_loop: .p2align 4,,7 done: + /* go back to the original page tables */ + movl $swapper_pg_dir, %ecx + subl $__PAGE_OFFSET, %ecx + movl %ecx, %cr3 /* Flush TLB, including "global" things (vmalloc) */ movl mmu_cr4_features, %eax movl %eax, %edx -- cgit v1.2.1 From 6cfd76a26d9fe2ba54b9d496a48c1d9285e5c5ed Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 6 Dec 2006 20:37:22 -0800 Subject: [PATCH] lockdep: name some old style locks Name some of the remaning 'old_style_spin_init' locks Signed-off-by: Peter Zijlstra Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/traps.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index fe9c5e8e7e6f..3124f1b04d67 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -452,7 +452,7 @@ void die(const char * str, struct pt_regs * regs, long err) u32 lock_owner; int lock_owner_depth; } die = { - .lock = SPIN_LOCK_UNLOCKED, + .lock = __SPIN_LOCK_UNLOCKED(die.lock), .lock_owner = -1, .lock_owner_depth = 0 }; -- cgit v1.2.1 From 19e5d9c0d2194b4b47189cbec2921cbf72b0bd1c Mon Sep 17 00:00:00 2001 From: Henry Nestler Date: Wed, 6 Dec 2006 20:37:45 -0800 Subject: [PATCH] initrd: remove unused false condition for initrd_start After LOADER_TYPE && INITRD_START are true, the short if-condition for INITRD_START can never be false. Remove unused code from the else condition. Signed-off-by: Henry Nestler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/setup.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index 141041dde74d..97bb869307bc 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -1162,8 +1162,7 @@ void __init setup_bootmem_allocator(void) if (LOADER_TYPE && INITRD_START) { if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) { reserve_bootmem(INITRD_START, INITRD_SIZE); - initrd_start = - INITRD_START ? INITRD_START + PAGE_OFFSET : 0; + initrd_start = INITRD_START + PAGE_OFFSET; initrd_end = initrd_start+INITRD_SIZE; } else { -- cgit v1.2.1 From b4c6c34a530b4d1c626f4ac0a884e0a9b849378c Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 6 Dec 2006 20:38:11 -0800 Subject: [PATCH] kprobes: enable booster on the preemptible kernel When we are unregistering a kprobe-booster, we can't release its instruction buffer immediately on the preemptive kernel, because some processes might be preempted on the buffer. The freeze_processes() and thaw_processes() functions can clean most of processes up from the buffer. There are still some non-frozen threads who have the PF_NOFREEZE flag. If those threads are sleeping (not preempted) at the known place outside the buffer, we can ensure safety of freeing. However, the processing of this check routine takes a long time. So, this patch introduces the garbage collection mechanism of insn_slot. It also introduces the "dirty" flag to free_insn_slot because of efficiency. The "clean" instruction slots (dirty flag is cleared) are released immediately. But the "dirty" slots which are used by boosted kprobes, are marked as garbages. collect_garbage_slots() will be invoked to release "dirty" slots if there are more than INSNS_PER_PAGE garbage slots or if there are no unused slots. Cc: "Keshavamurthy, Anil S" Cc: Ananth N Mavinakayanahalli Cc: "bibo,mao" Cc: Prasanna S Panchamukhi Cc: Yumiko Sugita Cc: Satoshi Oshima Cc: Hideo Aoki Signed-off-by: Masami Hiramatsu Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/kprobes.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/kprobes.c b/arch/i386/kernel/kprobes.c index fc79e1e859c4..af1d53344993 100644 --- a/arch/i386/kernel/kprobes.c +++ b/arch/i386/kernel/kprobes.c @@ -184,7 +184,7 @@ void __kprobes arch_disarm_kprobe(struct kprobe *p) void __kprobes arch_remove_kprobe(struct kprobe *p) { mutex_lock(&kprobe_mutex); - free_insn_slot(p->ainsn.insn); + free_insn_slot(p->ainsn.insn, (p->ainsn.boostable == 1)); mutex_unlock(&kprobe_mutex); } @@ -333,7 +333,7 @@ static int __kprobes kprobe_handler(struct pt_regs *regs) return 1; ss_probe: -#ifndef CONFIG_PREEMPT +#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PM) if (p->ainsn.boostable == 1 && !p->post_handler){ /* Boost up -- we can execute copied instructions directly */ reset_current_kprobe(); -- cgit v1.2.1 From a38a44c1a93078fc5fadc4ac2df8dea4697069e2 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 6 Dec 2006 20:38:16 -0800 Subject: [PATCH] smp_call_function_single() check that local interrupts are enabled smp_call_function_single() can deadlock if the caller disabled local interrupts (the target CPU could be spinning on call_lock). Check for that. Why on earth do these functions use spin_lock_bh()?? Cc: "Randy.Dunlap" Cc: Andi Kleen Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/smp.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/i386') diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c index 31e5c6573aae..9827cf927ecb 100644 --- a/arch/i386/kernel/smp.c +++ b/arch/i386/kernel/smp.c @@ -699,6 +699,10 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, put_cpu(); return -EBUSY; } + + /* Can deadlock when called with interrupts disabled */ + WARN_ON(irqs_disabled()); + spin_lock_bh(&call_lock); __smp_call_function_single(cpu, func, info, nonatomic, wait); spin_unlock_bh(&call_lock); -- cgit v1.2.1 From 02316067852187b8bec781bec07410e91af79627 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 6 Dec 2006 20:38:17 -0800 Subject: [PATCH] hotplug CPU: clean up hotcpu_notifier() use There was lots of #ifdef noise in the kernel due to hotcpu_notifier(fn, prio) not correctly marking 'fn' as used in the !HOTPLUG_CPU case, and thus generating compiler warnings of unused symbols, hence forcing people to add #ifdefs. the compiler can skip truly unused functions just fine: text data bss dec hex filename 1624412 728710 3674856 6027978 5bfaca vmlinux.before 1624412 728710 3674856 6027978 5bfaca vmlinux.after [akpm@osdl.org: topology.c fix] Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/cpu/mcheck/therm_throt.c | 2 -- arch/i386/kernel/cpuid.c | 2 -- arch/i386/kernel/microcode.c | 2 -- arch/i386/kernel/msr.c | 2 -- 4 files changed, 8 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/cpu/mcheck/therm_throt.c b/arch/i386/kernel/cpu/mcheck/therm_throt.c index bad8b4420709..065005c3f168 100644 --- a/arch/i386/kernel/cpu/mcheck/therm_throt.c +++ b/arch/i386/kernel/cpu/mcheck/therm_throt.c @@ -116,7 +116,6 @@ static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev) return sysfs_create_group(&sys_dev->kobj, &thermal_throttle_attr_group); } -#ifdef CONFIG_HOTPLUG_CPU static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) { return sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group); @@ -153,7 +152,6 @@ static struct notifier_block thermal_throttle_cpu_notifier = { .notifier_call = thermal_throttle_cpu_callback, }; -#endif /* CONFIG_HOTPLUG_CPU */ static __init int thermal_throttle_init_device(void) { diff --git a/arch/i386/kernel/cpuid.c b/arch/i386/kernel/cpuid.c index ab0c327e79dc..23b2cc748d4e 100644 --- a/arch/i386/kernel/cpuid.c +++ b/arch/i386/kernel/cpuid.c @@ -167,7 +167,6 @@ static int cpuid_device_create(int i) return err; } -#ifdef CONFIG_HOTPLUG_CPU static int cpuid_class_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; @@ -187,7 +186,6 @@ static struct notifier_block __cpuinitdata cpuid_class_cpu_notifier = { .notifier_call = cpuid_class_cpu_callback, }; -#endif /* !CONFIG_HOTPLUG_CPU */ static int __init cpuid_init(void) { diff --git a/arch/i386/kernel/microcode.c b/arch/i386/kernel/microcode.c index 23f5984d0654..972346604f9d 100644 --- a/arch/i386/kernel/microcode.c +++ b/arch/i386/kernel/microcode.c @@ -703,7 +703,6 @@ static struct sysdev_driver mc_sysdev_driver = { .resume = mc_sysdev_resume, }; -#ifdef CONFIG_HOTPLUG_CPU static __cpuinit int mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) { @@ -726,7 +725,6 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) static struct notifier_block mc_cpu_notifier = { .notifier_call = mc_cpu_callback, }; -#endif static int __init microcode_init (void) { diff --git a/arch/i386/kernel/msr.c b/arch/i386/kernel/msr.c index a773f776c9ea..7763c67ca282 100644 --- a/arch/i386/kernel/msr.c +++ b/arch/i386/kernel/msr.c @@ -250,7 +250,6 @@ static int msr_device_create(int i) return err; } -#ifdef CONFIG_HOTPLUG_CPU static int msr_class_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { @@ -271,7 +270,6 @@ static struct notifier_block __cpuinitdata msr_class_cpu_notifier = { .notifier_call = msr_class_cpu_callback, }; -#endif static int __init msr_init(void) { -- cgit v1.2.1 From cd6ed52568e161ce924593ebc798050a2d23cca0 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Wed, 6 Dec 2006 20:40:06 -0800 Subject: [PATCH] arch/i386/kernel/reboot.c should #include Every file should #include the headers containing the prototypes for its global functions. Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/reboot.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/i386') diff --git a/arch/i386/kernel/reboot.c b/arch/i386/kernel/reboot.c index 84278e0093a2..3514b4153f7f 100644 --- a/arch/i386/kernel/reboot.c +++ b/arch/i386/kernel/reboot.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.1 From 85916f8166b59eeac63d2b4f7f1df8de849334b4 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Wed, 6 Dec 2006 20:40:41 -0800 Subject: [PATCH] Kexec / Kdump: Unify elf note code The elf note saving code is currently duplicated over several architectures. This cleanup patch simply adds code to a common file and then replaces the arch-specific code with calls to the newly added code. The only drawback with this approach is that s390 doesn't fully support kexec-on-panic which for that arch leads to introduction of unused code. Signed-off-by: Magnus Damm Cc: Vivek Goyal Cc: Andi Kleen Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/crash.c | 66 ++---------------------------------------------- 1 file changed, 2 insertions(+), 64 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/crash.c b/arch/i386/kernel/crash.c index 144b43288965..a5e0e990ea95 100644 --- a/arch/i386/kernel/crash.c +++ b/arch/i386/kernel/crash.c @@ -31,68 +31,6 @@ /* This keeps a track of which one is crashing cpu. */ static int crashing_cpu; -static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data, - size_t data_len) -{ - struct elf_note note; - - note.n_namesz = strlen(name) + 1; - note.n_descsz = data_len; - note.n_type = type; - memcpy(buf, ¬e, sizeof(note)); - buf += (sizeof(note) +3)/4; - memcpy(buf, name, note.n_namesz); - buf += (note.n_namesz + 3)/4; - memcpy(buf, data, note.n_descsz); - buf += (note.n_descsz + 3)/4; - - return buf; -} - -static void final_note(u32 *buf) -{ - struct elf_note note; - - note.n_namesz = 0; - note.n_descsz = 0; - note.n_type = 0; - memcpy(buf, ¬e, sizeof(note)); -} - -static void crash_save_this_cpu(struct pt_regs *regs, int cpu) -{ - struct elf_prstatus prstatus; - u32 *buf; - - if ((cpu < 0) || (cpu >= NR_CPUS)) - return; - - /* Using ELF notes here is opportunistic. - * I need a well defined structure format - * for the data I pass, and I need tags - * on the data to indicate what information I have - * squirrelled away. ELF notes happen to provide - * all of that, so there is no need to invent something new. - */ - buf = (u32*)per_cpu_ptr(crash_notes, cpu); - if (!buf) - return; - memset(&prstatus, 0, sizeof(prstatus)); - prstatus.pr_pid = current->pid; - elf_core_copy_regs(&prstatus.pr_reg, regs); - buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus, - sizeof(prstatus)); - final_note(buf); -} - -static void crash_save_self(struct pt_regs *regs) -{ - int cpu; - - cpu = safe_smp_processor_id(); - crash_save_this_cpu(regs, cpu); -} - #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) static atomic_t waiting_for_crash_ipi; @@ -121,7 +59,7 @@ static int crash_nmi_callback(struct notifier_block *self, crash_fixup_ss_esp(&fixed_regs, regs); regs = &fixed_regs; } - crash_save_this_cpu(regs, cpu); + crash_save_cpu(regs, cpu); disable_local_APIC(); atomic_dec(&waiting_for_crash_ipi); /* Assume hlt works */ @@ -195,5 +133,5 @@ void machine_crash_shutdown(struct pt_regs *regs) #if defined(CONFIG_X86_IO_APIC) disable_IO_APIC(); #endif - crash_save_self(regs); + crash_save_cpu(regs, safe_smp_processor_id()); } -- cgit v1.2.1 From 91768d6c2bad0d2766a166f13f2f57e197de3458 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 8 Dec 2006 02:36:21 -0800 Subject: [PATCH] Generic BUG for i386 This makes i386 use the generic BUG machinery. There are no functional changes from the old i386 implementation. The main advantage in using the generic BUG machinery for i386 is that the inlined overhead of BUG is just the ud2a instruction; the file+line(+function) information are no longer inlined into the instruction stream. This reduces cache pollution, and makes disassembly work properly. Signed-off-by: Jeremy Fitzhardinge Cc: Andi Kleen Cc: Hugh Dickens Cc: Michael Ellerman Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/Kconfig | 5 +++++ arch/i386/kernel/module.c | 4 +++- arch/i386/kernel/traps.c | 41 +++++++++++------------------------------ arch/i386/kernel/vmlinux.lds.S | 2 ++ 4 files changed, 21 insertions(+), 31 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index ea70359b02d0..c2362c7ba749 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -49,6 +49,11 @@ config GENERIC_IOMAP bool default y +config GENERIC_BUG + bool + default y + depends on BUG + config GENERIC_HWEIGHT bool default y diff --git a/arch/i386/kernel/module.c b/arch/i386/kernel/module.c index d7d9c8b23f72..3db0a5442eb1 100644 --- a/arch/i386/kernel/module.c +++ b/arch/i386/kernel/module.c @@ -21,6 +21,7 @@ #include #include #include +#include #if 0 #define DEBUGP printk @@ -141,10 +142,11 @@ int module_finalize(const Elf_Ehdr *hdr, apply_paravirt(pseg, pseg + para->sh_size); } - return 0; + return module_bug_finalize(hdr, sechdrs, me); } void module_arch_cleanup(struct module *mod) { alternatives_smp_module_del(mod); + module_bug_cleanup(mod); } diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index 68de48e498ca..2b30dbf8d117 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -30,6 +30,7 @@ #include #include #include +#include #ifdef CONFIG_EISA #include @@ -420,43 +421,22 @@ void show_registers(struct pt_regs *regs) printk("\n"); } -static void handle_BUG(struct pt_regs *regs) +int is_valid_bugaddr(unsigned long eip) { - unsigned long eip = regs->eip; unsigned short ud2; if (eip < PAGE_OFFSET) - return; + return 0; if (probe_kernel_address((unsigned short *)eip, ud2)) - return; - if (ud2 != 0x0b0f) - return; - - printk(KERN_EMERG "------------[ cut here ]------------\n"); - -#ifdef CONFIG_DEBUG_BUGVERBOSE - do { - unsigned short line; - char *file; - char c; - - if (probe_kernel_address((unsigned short *)(eip + 2), line)) - break; - if (probe_kernel_address((char **)(eip + 4), file) || - (unsigned long)file < PAGE_OFFSET || - probe_kernel_address(file, c)) - file = ""; + return 0; - printk(KERN_EMERG "kernel BUG at %s:%d!\n", file, line); - return; - } while (0); -#endif - printk(KERN_EMERG "Kernel BUG at [verbose debug info unavailable]\n"); + return ud2 == 0x0b0f; } -/* This is gone through when something in the kernel - * has done something bad and is about to be terminated. -*/ +/* + * This is gone through when something in the kernel has done something bad and + * is about to be terminated. + */ void die(const char * str, struct pt_regs * regs, long err) { static struct { @@ -488,7 +468,8 @@ void die(const char * str, struct pt_regs * regs, long err) unsigned long esp; unsigned short ss; - handle_BUG(regs); + report_bug(regs->eip); + printk(KERN_EMERG "%s: %04lx [#%d]\n", str, err & 0xffff, ++die_counter); #ifdef CONFIG_PREEMPT printk(KERN_EMERG "PREEMPT "); diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S index 56e6ad5cb045..16d3c7133ad7 100644 --- a/arch/i386/kernel/vmlinux.lds.S +++ b/arch/i386/kernel/vmlinux.lds.S @@ -57,6 +57,8 @@ SECTIONS RODATA + BUG_TABLE + . = ALIGN(4); .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) { __tracedata_start = .; -- cgit v1.2.1 From aab4c5a51cd6defc374ba1961c996781656d5283 Mon Sep 17 00:00:00 2001 From: "Josef \"Jeff\" Sipek" Date: Fri, 8 Dec 2006 02:36:42 -0800 Subject: [PATCH] i386: change uses of f_{dentry, vfsmnt} to use f_path Change all the uses of f_{dentry,vfsmnt} to f_path.{dentry,mnt} in the i386 arch code. Signed-off-by: Josef "Jeff" Sipek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/cpuid.c | 4 ++-- arch/i386/kernel/msr.c | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/cpuid.c b/arch/i386/kernel/cpuid.c index db6dd20c3589..51130b39cd2e 100644 --- a/arch/i386/kernel/cpuid.c +++ b/arch/i386/kernel/cpuid.c @@ -116,7 +116,7 @@ static ssize_t cpuid_read(struct file *file, char __user *buf, char __user *tmp = buf; u32 data[4]; u32 reg = *ppos; - int cpu = iminor(file->f_dentry->d_inode); + int cpu = iminor(file->f_path.dentry->d_inode); if (count % 16) return -EINVAL; /* Invalid chunk size */ @@ -134,7 +134,7 @@ static ssize_t cpuid_read(struct file *file, char __user *buf, static int cpuid_open(struct inode *inode, struct file *file) { - unsigned int cpu = iminor(file->f_dentry->d_inode); + unsigned int cpu = iminor(file->f_path.dentry->d_inode); struct cpuinfo_x86 *c = &(cpu_data)[cpu]; if (cpu >= NR_CPUS || !cpu_online(cpu)) diff --git a/arch/i386/kernel/msr.c b/arch/i386/kernel/msr.c index 1d1a56cae340..4a472a17d1c6 100644 --- a/arch/i386/kernel/msr.c +++ b/arch/i386/kernel/msr.c @@ -172,7 +172,7 @@ static ssize_t msr_read(struct file *file, char __user * buf, u32 __user *tmp = (u32 __user *) buf; u32 data[2]; u32 reg = *ppos; - int cpu = iminor(file->f_dentry->d_inode); + int cpu = iminor(file->f_path.dentry->d_inode); int err; if (count % 8) @@ -196,7 +196,7 @@ static ssize_t msr_write(struct file *file, const char __user *buf, const u32 __user *tmp = (const u32 __user *)buf; u32 data[2]; u32 reg = *ppos; - int cpu = iminor(file->f_dentry->d_inode); + int cpu = iminor(file->f_path.dentry->d_inode); int err; if (count % 8) @@ -216,7 +216,7 @@ static ssize_t msr_write(struct file *file, const char __user *buf, static int msr_open(struct inode *inode, struct file *file) { - unsigned int cpu = iminor(file->f_dentry->d_inode); + unsigned int cpu = iminor(file->f_path.dentry->d_inode); struct cpuinfo_x86 *c = &(cpu_data)[cpu]; if (cpu >= NR_CPUS || !cpu_online(cpu)) -- cgit v1.2.1 From f0d1b0b30d250a07627ad8b9fbbb5c7cc08422e8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 8 Dec 2006 02:37:49 -0800 Subject: [PATCH] LOG2: Implement a general integer log2 facility in the kernel This facility provides three entry points: ilog2() Log base 2 of unsigned long ilog2_u32() Log base 2 of u32 ilog2_u64() Log base 2 of u64 These facilities can either be used inside functions on dynamic data: int do_something(long q) { ...; y = ilog2(x) ...; } Or can be used to statically initialise global variables with constant values: unsigned n = ilog2(27); When performing static initialisation, the compiler will report "error: initializer element is not constant" if asked to take a log of zero or of something not reducible to a constant. They treat negative numbers as unsigned. When not dealing with a constant, they fall back to using fls() which permits them to use arch-specific log calculation instructions - such as BSR on x86/x86_64 or SCAN on FRV - if available. [akpm@osdl.org: MMC fix] Signed-off-by: David Howells Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Herbert Xu Cc: David Howells Cc: Wojtek Kaniewski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/Kconfig.cpu | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/i386') diff --git a/arch/i386/Kconfig.cpu b/arch/i386/Kconfig.cpu index 821fd269ca58..2aecfba4ac4f 100644 --- a/arch/i386/Kconfig.cpu +++ b/arch/i386/Kconfig.cpu @@ -248,6 +248,14 @@ config RWSEM_XCHGADD_ALGORITHM depends on !M386 default y +config ARCH_HAS_ILOG2_U32 + bool + default n + +config ARCH_HAS_ILOG2_U64 + bool + default n + config GENERIC_CALIBRATE_DELAY bool default y -- cgit v1.2.1 From b5bd1cec8999624fa2b32833ba9707eb4966df17 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 8 Dec 2006 02:40:46 -0800 Subject: [PATCH] visws: sgivwfb as module needs exports With CONFIG_FB_SGIVW=m: WARNING: "sgivwfb_mem_size" [drivers/video/sgivwfb.ko] undefined! WARNING: "sgivwfb_mem_phys" [drivers/video/sgivwfb.ko] undefined! (or don't allow FB_SGIVW=m in Kconfig) Signed-off-by: Randy Dunlap Acked-by: James Simmons Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/mach-visws/setup.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/i386') diff --git a/arch/i386/mach-visws/setup.c b/arch/i386/mach-visws/setup.c index 885c7cbfd478..233ee20907b9 100644 --- a/arch/i386/mach-visws/setup.c +++ b/arch/i386/mach-visws/setup.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -142,6 +143,8 @@ void __init time_init_hook(void) unsigned long sgivwfb_mem_phys; unsigned long sgivwfb_mem_size; +EXPORT_SYMBOL(sgivwfb_mem_phys); +EXPORT_SYMBOL(sgivwfb_mem_size); long long mem_size __initdata = 0; -- cgit v1.2.1 From 3b1bdf4e08d6a8d4fae5a30224ed2c55bf1e43fc Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 8 Dec 2006 02:41:13 -0800 Subject: [PATCH] CPU hotplug broken with 2GB VMSPLIT In VMSPLIT mode, kernel PGD might have more entries than user space. Acked-by: Jens Axboe Signed-off-by: Shaohua Li Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/smpboot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index 4bf0e3c83b8b..1e00b03163b9 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -1118,7 +1118,7 @@ static int __cpuinit __smp_prepare_cpu(int cpu) /* init low mem mapping */ clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, - KERNEL_PGD_PTRS); + min_t(unsigned long, KERNEL_PGD_PTRS, USER_PGD_PTRS)); flush_tlb_all(); schedule_work(&info.task); wait_for_completion(&done); -- cgit v1.2.1 From e0f27981f2d5fd93b2795e8348327b081c512f83 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 9 Dec 2006 21:33:35 +0100 Subject: [PATCH] i386: Update defconfig Signed-off-by: Andi Kleen --- arch/i386/defconfig | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/defconfig b/arch/i386/defconfig index 65891f11aced..3265208e5899 100644 --- a/arch/i386/defconfig +++ b/arch/i386/defconfig @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.19-git7 -# Wed Dec 6 23:50:49 2006 +# Linux kernel version: 2.6.19-git14 +# Sat Dec 9 21:23:14 2006 # CONFIG_X86_32=y CONFIG_GENERIC_TIME=y @@ -12,6 +12,7 @@ CONFIG_X86=y CONFIG_MMU=y CONFIG_GENERIC_ISA_DMA=y CONFIG_GENERIC_IOMAP=y +CONFIG_GENERIC_BUG=y CONFIG_GENERIC_HWEIGHT=y CONFIG_ARCH_MAY_HAVE_PC_FDC=y CONFIG_DMI=y @@ -141,6 +142,8 @@ CONFIG_X86_CMPXCHG=y CONFIG_X86_XADD=y CONFIG_X86_L1_CACHE_SHIFT=7 CONFIG_RWSEM_XCHGADD_ALGORITHM=y +# CONFIG_ARCH_HAS_ILOG2_U32 is not set +# CONFIG_ARCH_HAS_ILOG2_U64 is not set CONFIG_GENERIC_CALIBRATE_DELAY=y CONFIG_X86_WP_WORKS_OK=y CONFIG_X86_INVLPG=y @@ -203,6 +206,7 @@ CONFIG_MTRR=y CONFIG_SECCOMP=y # CONFIG_HZ_100 is not set CONFIG_HZ_250=y +# CONFIG_HZ_300 is not set # CONFIG_HZ_1000 is not set CONFIG_HZ=250 # CONFIG_KEXEC is not set @@ -563,6 +567,7 @@ CONFIG_IDEDMA_AUTO=y # # CONFIG_RAID_ATTRS is not set CONFIG_SCSI=y +# CONFIG_SCSI_TGT is not set CONFIG_SCSI_NETLINK=y # CONFIG_SCSI_PROC_FS is not set @@ -583,6 +588,7 @@ CONFIG_CHR_DEV_SG=y # CONFIG_SCSI_MULTI_LUN is not set # CONFIG_SCSI_CONSTANTS is not set # CONFIG_SCSI_LOGGING is not set +# CONFIG_SCSI_SCAN_ASYNC is not set # # SCSI Transports @@ -642,6 +648,7 @@ CONFIG_AIC79XX_DEBUG_MASK=0 # CONFIG_SCSI_DC390T is not set # CONFIG_SCSI_NSP32 is not set # CONFIG_SCSI_DEBUG is not set +# CONFIG_SCSI_SRP is not set # # Serial ATA (prod) and Parallel ATA (experimental) drivers @@ -1082,10 +1089,7 @@ CONFIG_SOUND=y # Open Sound System # CONFIG_SOUND_PRIME=y -CONFIG_OSS_OBSOLETE_DRIVER=y # CONFIG_SOUND_BT878 is not set -# CONFIG_SOUND_EMU10K1 is not set -# CONFIG_SOUND_FUSION is not set # CONFIG_SOUND_ES1371 is not set CONFIG_SOUND_ICH=y # CONFIG_SOUND_TRIDENT is not set @@ -1094,6 +1098,11 @@ CONFIG_SOUND_ICH=y # CONFIG_SOUND_VIA82CXXX is not set # CONFIG_SOUND_OSS is not set +# +# HID Devices +# +CONFIG_HID=y + # # USB support # @@ -1158,8 +1167,7 @@ CONFIG_USB_STORAGE=y # USB Input Devices # CONFIG_USB_HID=y -CONFIG_USB_HIDINPUT=y -# CONFIG_USB_HIDINPUT_POWERBOOK is not set +# CONFIG_USB_HID_POWERBOOK is not set # CONFIG_HID_FF is not set # CONFIG_USB_HIDDEV is not set # CONFIG_USB_AIPTEK is not set @@ -1443,6 +1451,11 @@ CONFIG_NLS_ISO8859_15=y # CONFIG_NLS_KOI8_U is not set CONFIG_NLS_UTF8=y +# +# Distributed Lock Manager +# +# CONFIG_DLM is not set + # # Instrumentation Support # @@ -1509,6 +1522,7 @@ CONFIG_DOUBLEFAULT=y # # Library routines # +CONFIG_BITREVERSE=y # CONFIG_CRC_CCITT is not set # CONFIG_CRC16 is not set CONFIG_CRC32=y -- cgit v1.2.1 From 16d279d277aedd640d9dba5ddeb172b5e6bc7d75 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 9 Dec 2006 21:33:35 +0100 Subject: [PATCH] x86: Fix verify_quirk_intel_irqbalance() Fix verify_quirk_intel_irqbalance(). genapic checks should really happen only on affected versions of the E7520/E7320/E7525 based platforms. AK: This should akpm's Coyote SDV Signed-off-by: Suresh Siddha Signed-off-by: Andi Kleen --- arch/i386/kernel/quirks.c | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/quirks.c b/arch/i386/kernel/quirks.c index a01320a7b636..34874c398b44 100644 --- a/arch/i386/kernel/quirks.c +++ b/arch/i386/kernel/quirks.c @@ -10,13 +10,38 @@ #if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI) static void __devinit verify_quirk_intel_irqbalance(struct pci_dev *dev) { + u8 config, rev; + u32 word; + + /* BIOS may enable hardware IRQ balancing for + * E7520/E7320/E7525(revision ID 0x9 and below) + * based platforms. + * For those platforms, make sure that the genapic is set to 'flat' + */ + pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev); + if (rev > 0x9) + return; + + /* enable access to config space*/ + pci_read_config_byte(dev, 0xf4, &config); + pci_write_config_byte(dev, 0xf4, config|0x2); + + /* read xTPR register */ + raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word); + + if (!(word & (1 << 13))) { #ifdef CONFIG_X86_64 - if (genapic != &apic_flat) - panic("APIC mode must be flat on this system\n"); + if (genapic != &apic_flat) + panic("APIC mode must be flat on this system\n"); #elif defined(CONFIG_X86_GENERICARCH) - if (genapic != &apic_default) - panic("APIC mode must be default(flat) on this system. Use apic=default\n"); + if (genapic != &apic_default) + panic("APIC mode must be default(flat) on this system. Use apic=default\n"); #endif + } + + /* put back the original value for config space*/ + if (!(config & 0x2)) + pci_write_config_byte(dev, 0xf4, config); } void __init quirk_intel_irqbalance(void) -- cgit v1.2.1 From 92715e282be7c7488f892703c8d39b08976a833b Mon Sep 17 00:00:00 2001 From: Ravikiran G Thirumalai Date: Sat, 9 Dec 2006 21:33:35 +0100 Subject: [PATCH] x86: Fix boot hang due to nmi watchdog init code 2.6.19 stopped booting (or booted based on build/config) on our x86_64 systems due to a bug introduced in 2.6.19. check_nmi_watchdog schedules an IPI on all cpus to busy wait on a flag, but fails to set the busywait flag if NMI functionality is disabled. This causes the secondary cpus to spin in an endless loop, causing the kernel bootup to hang. Depending upon the build, the busywait flag got overwritten (stack variable) and caused the kernel to bootup on certain builds. Following patch fixes the bug by setting the busywait flag before returning from check_nmi_watchdog. I guess using a stack variable is not good here as the calling function could potentially return while the busy wait loop is still spinning on the flag. AK: I redid the patch significantly to be cleaner Signed-off-by: Ravikiran Thirumalai Signed-off-by: Shai Fultheim Signed-off-by: Andi Kleen --- arch/i386/kernel/nmi.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c index f5bc7e1be801..a5e34d655965 100644 --- a/arch/i386/kernel/nmi.c +++ b/arch/i386/kernel/nmi.c @@ -195,6 +195,8 @@ static __cpuinit inline int nmi_known_cpu(void) return 0; } +static int endflag __initdata = 0; + #ifdef CONFIG_SMP /* The performance counters used by NMI_LOCAL_APIC don't trigger when * the CPU is idle. To make sure the NMI watchdog really ticks on all @@ -202,7 +204,6 @@ static __cpuinit inline int nmi_known_cpu(void) */ static __init void nmi_cpu_busy(void *data) { - volatile int *endflag = data; local_irq_enable_in_hardirq(); /* Intentionally don't use cpu_relax here. This is to make sure that the performance counter really ticks, @@ -210,14 +211,13 @@ static __init void nmi_cpu_busy(void *data) pause instruction. On a real HT machine this is fine because all other CPUs are busy with "useless" delay loops and don't care if they get somewhat less cycles. */ - while (*endflag == 0) - barrier(); + while (endflag == 0) + mb(); } #endif static int __init check_nmi_watchdog(void) { - volatile int endflag = 0; unsigned int *prev_nmi_count; int cpu; -- cgit v1.2.1 From 1bac3b383a93a4a920ffc57441eb133c78567fbd Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 9 Dec 2006 21:33:36 +0100 Subject: [PATCH] x86: Work around gcc 4.2 over aggressive optimizer The new PDA code uses a dummy _proxy_pda variable to describe memory references to the PDA. It is never referenced in inline assembly, but exists as input/output arguments. gcc 4.2 in some cases can CSE references to this which causes unresolved symbols. Define it to zero to avoid this. Signed-off-by: Andi Kleen --- arch/i386/kernel/vmlinux.lds.S | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/i386') diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S index 16d3c7133ad7..a53c8b1854b5 100644 --- a/arch/i386/kernel/vmlinux.lds.S +++ b/arch/i386/kernel/vmlinux.lds.S @@ -26,6 +26,7 @@ OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386") OUTPUT_ARCH(i386) ENTRY(phys_startup_32) jiffies = jiffies_64; +_proxy_pda = 0; PHDRS { text PT_LOAD FLAGS(5); /* R_E */ -- cgit v1.2.1 From 7e74437cf60cc84a655e301f1ee48027b3bcf96e Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 9 Dec 2006 21:33:36 +0100 Subject: [PATCH] i386: export smp_num_siblings for oprofile oprofile uses smp_num_siblings without testing for CONFIG_X86_HT. I looked at modifying oprofile, but this way is cleaner & simpler and I didn't see a good reason not to just export it when CONFIG_SMP. WARNING: "smp_num_siblings" [arch/i386/oprofile/oprofile.ko] undefined! Signed-off-by: Randy Dunlap Signed-off-by: Andi Kleen --- arch/i386/kernel/smpboot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index 1e00b03163b9..b0f84e5778ad 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -69,7 +69,7 @@ static int __devinitdata smp_b_stepping; /* Number of siblings per CPU package */ int smp_num_siblings = 1; -#ifdef CONFIG_X86_HT +#ifdef CONFIG_SMP EXPORT_SYMBOL(smp_num_siblings); #endif -- cgit v1.2.1 From 306a22c2a20edf3a440bfb4cf2b94c19dfd81a53 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 9 Dec 2006 21:33:36 +0100 Subject: [PATCH] i386: Fix io_apic.c warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gcc 4.2 warns linux/arch/i386/kernel/io_apic.c: In function ‘create_irq’: linux/arch/i386/kernel/io_apic.c:2488: warning: ‘vector’ may be used uninitialized in this function The warning is false, but somewhat legitimate so work around it. Signed-off-by: Andi Kleen --- arch/i386/kernel/io_apic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/i386') diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c index e21dcde0790e..2424cc9c7b3d 100644 --- a/arch/i386/kernel/io_apic.c +++ b/arch/i386/kernel/io_apic.c @@ -2485,7 +2485,7 @@ device_initcall(ioapic_init_sysfs); int create_irq(void) { /* Allocate an unused irq */ - int irq, new, vector; + int irq, new, vector = 0; unsigned long flags; irq = -ENOSPC; -- cgit v1.2.1 From f0f32fccbffaaba8596d5c671153aa37aea9d4f0 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 9 Dec 2006 21:33:36 +0100 Subject: [PATCH] x86-64: no paravirt for X86_VOYAGER or X86_VISWS Since Voyager and Visual WS already define ARCH_SETUP, it looks like PARAVIRT shouldn't be offered for them. In file included from arch/i386/kernel/setup.c:63: include/asm-i386/mach-visws/setup_arch.h:8:1: warning: "ARCH_SETUP" redefin= ed In file included from include/asm/msr.h:5, from include/asm/processor.h:17, from include/asm/thread_info.h:16, from include/linux/thread_info.h:21, from include/linux/preempt.h:9, from include/linux/spinlock.h:49, from include/linux/capability.h:45, from include/linux/sched.h:46, from arch/i386/kernel/setup.c:26: include/asm/paravirt.h:163:1: warning: this is the location of the previous= definition In file included from arch/i386/kernel/setup.c:63: include/asm-i386/mach-visws/setup_arch.h:8:1: warning: "ARCH_SETUP" redefin= ed In file included from include/asm/msr.h:5, from include/asm/processor.h:17, from include/asm/thread_info.h:16, from include/linux/thread_info.h:21, from include/linux/preempt.h:9, from include/linux/spinlock.h:49, from include/linux/capability.h:45, from include/linux/sched.h:46, from arch/i386/kernel/setup.c:26: include/asm/paravirt.h:163:1: warning: this is the location of the previous= definition Signed-off-by: Randy Dunlap --- arch/i386/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/i386') diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index c2362c7ba749..0d67a0a1151e 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -190,6 +190,7 @@ endchoice config PARAVIRT bool "Paravirtualization support (EXPERIMENTAL)" depends on EXPERIMENTAL + depends on !(X86_VISWS || X86_VOYAGER) help Paravirtualization is a way of running multiple instances of Linux on the same machine, under a hypervisor. This option -- cgit v1.2.1