diff options
Diffstat (limited to 'arch/x86/kernel')
49 files changed, 1141 insertions, 615 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index da140611bb57..b78a17b12810 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -102,6 +102,7 @@ obj-$(CONFIG_OLPC) += olpc.o # 64 bit specific files ifeq ($(CONFIG_X86_64),y) obj-y += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o + obj-y += bios_uv.o obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o obj-$(CONFIG_AUDIT) += audit_64.o diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 868de3d5c39d..a3ddad18aaa3 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -9,6 +9,7 @@ #include <linux/bootmem.h> #include <linux/dmi.h> #include <linux/cpumask.h> +#include <asm/segment.h> #include "realmode/wakeup.h" #include "sleep.h" @@ -23,15 +24,6 @@ static unsigned long acpi_realmode; static char temp_stack[10240]; #endif -/* XXX: this macro should move to asm-x86/segment.h and be shared with the - boot code... */ -#define GDT_ENTRY(flags, base, limit) \ - (((u64)(base & 0xff000000) << 32) | \ - ((u64)flags << 40) | \ - ((u64)(limit & 0x00ff0000) << 32) | \ - ((u64)(base & 0x00ffffff) << 16) | \ - ((u64)(limit & 0x0000ffff))) - /** * acpi_save_state_mem - save kernel state * diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index f2766d84c7a0..c25210e6ac88 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -23,7 +23,7 @@ #include <linux/scatterlist.h> #include <linux/iommu-helper.h> #include <asm/proto.h> -#include <asm/gart.h> +#include <asm/iommu.h> #include <asm/amd_iommu_types.h> #include <asm/amd_iommu.h> @@ -32,21 +32,37 @@ #define to_pages(addr, size) \ (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT) +#define EXIT_LOOP_COUNT 10000000 + static DEFINE_RWLOCK(amd_iommu_devtable_lock); -struct command { +/* + * general struct to manage commands send to an IOMMU + */ +struct iommu_cmd { u32 data[4]; }; static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, struct unity_map_entry *e); +/* returns !0 if the IOMMU is caching non-present entries in its TLB */ static int iommu_has_npcache(struct amd_iommu *iommu) { return iommu->cap & IOMMU_CAP_NPCACHE; } -static int __iommu_queue_command(struct amd_iommu *iommu, struct command *cmd) +/**************************************************************************** + * + * IOMMU command queuing functions + * + ****************************************************************************/ + +/* + * Writes the command to the IOMMUs command buffer and informs the + * hardware about the new command. Must be called with iommu->lock held. + */ +static int __iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) { u32 tail, head; u8 *target; @@ -63,7 +79,11 @@ static int __iommu_queue_command(struct amd_iommu *iommu, struct command *cmd) return 0; } -static int iommu_queue_command(struct amd_iommu *iommu, struct command *cmd) +/* + * General queuing function for commands. Takes iommu->lock and calls + * __iommu_queue_command(). + */ +static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) { unsigned long flags; int ret; @@ -75,16 +95,24 @@ static int iommu_queue_command(struct amd_iommu *iommu, struct command *cmd) return ret; } +/* + * This function is called whenever we need to ensure that the IOMMU has + * completed execution of all commands we sent. It sends a + * COMPLETION_WAIT command and waits for it to finish. The IOMMU informs + * us about that by writing a value to a physical address we pass with + * the command. + */ static int iommu_completion_wait(struct amd_iommu *iommu) { int ret; - struct command cmd; + struct iommu_cmd cmd; volatile u64 ready = 0; unsigned long ready_phys = virt_to_phys(&ready); + unsigned long i = 0; memset(&cmd, 0, sizeof(cmd)); cmd.data[0] = LOW_U32(ready_phys) | CMD_COMPL_WAIT_STORE_MASK; - cmd.data[1] = HIGH_U32(ready_phys); + cmd.data[1] = upper_32_bits(ready_phys); cmd.data[2] = 1; /* value written to 'ready' */ CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT); @@ -95,15 +123,23 @@ static int iommu_completion_wait(struct amd_iommu *iommu) if (ret) return ret; - while (!ready) + while (!ready && (i < EXIT_LOOP_COUNT)) { + ++i; cpu_relax(); + } + + if (unlikely((i == EXIT_LOOP_COUNT) && printk_ratelimit())) + printk(KERN_WARNING "AMD IOMMU: Completion wait loop failed\n"); return 0; } +/* + * Command send function for invalidating a device table entry + */ static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid) { - struct command cmd; + struct iommu_cmd cmd; BUG_ON(iommu == NULL); @@ -116,20 +152,23 @@ static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid) return iommu_queue_command(iommu, &cmd); } +/* + * Generic command send function for invalidaing TLB entries + */ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu, u64 address, u16 domid, int pde, int s) { - struct command cmd; + struct iommu_cmd cmd; memset(&cmd, 0, sizeof(cmd)); address &= PAGE_MASK; CMD_SET_TYPE(&cmd, CMD_INV_IOMMU_PAGES); cmd.data[1] |= domid; cmd.data[2] = LOW_U32(address); - cmd.data[3] = HIGH_U32(address); - if (s) + cmd.data[3] = upper_32_bits(address); + if (s) /* size bit - we flush more than one 4kb page */ cmd.data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK; - if (pde) + if (pde) /* PDE bit - we wan't flush everything not only the PTEs */ cmd.data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK; iommu->need_sync = 1; @@ -137,6 +176,11 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu, return iommu_queue_command(iommu, &cmd); } +/* + * TLB invalidation function which is called from the mapping functions. + * It invalidates a single PTE if the range to flush is within a single + * page. Otherwise it flushes the whole TLB of the IOMMU. + */ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid, u64 address, size_t size) { @@ -159,6 +203,20 @@ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid, return 0; } +/**************************************************************************** + * + * The functions below are used the create the page table mappings for + * unity mapped regions. + * + ****************************************************************************/ + +/* + * Generic mapping functions. It maps a physical address into a DMA + * address space. It allocates the page table pages if necessary. + * In the future it can be extended to a generic mapping function + * supporting all features of AMD IOMMU page tables like level skipping + * and full 64 bit address spaces. + */ static int iommu_map(struct protection_domain *dom, unsigned long bus_addr, unsigned long phys_addr, @@ -209,6 +267,10 @@ static int iommu_map(struct protection_domain *dom, return 0; } +/* + * This function checks if a specific unity mapping entry is needed for + * this specific IOMMU. + */ static int iommu_for_unity_map(struct amd_iommu *iommu, struct unity_map_entry *entry) { @@ -223,6 +285,12 @@ static int iommu_for_unity_map(struct amd_iommu *iommu, return 0; } +/* + * Init the unity mappings for a specific IOMMU in the system + * + * Basically iterates over all unity mapping entries and applies them to + * the default domain DMA of that IOMMU if necessary. + */ static int iommu_init_unity_mappings(struct amd_iommu *iommu) { struct unity_map_entry *entry; @@ -239,6 +307,10 @@ static int iommu_init_unity_mappings(struct amd_iommu *iommu) return 0; } +/* + * This function actually applies the mapping to the page table of the + * dma_ops domain. + */ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, struct unity_map_entry *e) { @@ -261,6 +333,9 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, return 0; } +/* + * Inits the unity mappings required for a specific device + */ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom, u16 devid) { @@ -278,12 +353,26 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom, return 0; } +/**************************************************************************** + * + * The next functions belong to the address allocator for the dma_ops + * interface functions. They work like the allocators in the other IOMMU + * drivers. Its basically a bitmap which marks the allocated pages in + * the aperture. Maybe it could be enhanced in the future to a more + * efficient allocator. + * + ****************************************************************************/ static unsigned long dma_mask_to_pages(unsigned long mask) { return (mask >> PAGE_SHIFT) + (PAGE_ALIGN(mask & ~PAGE_MASK) >> PAGE_SHIFT); } +/* + * The address allocator core function. + * + * called with domain->lock held + */ static unsigned long dma_ops_alloc_addresses(struct device *dev, struct dma_ops_domain *dom, unsigned int pages) @@ -317,6 +406,11 @@ static unsigned long dma_ops_alloc_addresses(struct device *dev, return address; } +/* + * The address free function. + * + * called with domain->lock held + */ static void dma_ops_free_addresses(struct dma_ops_domain *dom, unsigned long address, unsigned int pages) @@ -325,6 +419,16 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom, iommu_area_free(dom->bitmap, address, pages); } +/**************************************************************************** + * + * The next functions belong to the domain allocation. A domain is + * allocated for every IOMMU as the default domain. If device isolation + * is enabled, every device get its own domain. The most important thing + * about domains is the page table mapping the DMA address space they + * contain. + * + ****************************************************************************/ + static u16 domain_id_alloc(void) { unsigned long flags; @@ -342,6 +446,10 @@ static u16 domain_id_alloc(void) return id; } +/* + * Used to reserve address ranges in the aperture (e.g. for exclusion + * ranges. + */ static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, unsigned long start_page, unsigned int pages) @@ -382,6 +490,10 @@ static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom) free_page((unsigned long)p1); } +/* + * Free a domain, only used if something went wrong in the + * allocation path and we need to free an already allocated page table + */ static void dma_ops_domain_free(struct dma_ops_domain *dom) { if (!dom) @@ -396,6 +508,11 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom) kfree(dom); } +/* + * Allocates a new protection domain usable for the dma_ops functions. + * It also intializes the page table and the address allocator data + * structures required for the dma_ops interface + */ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, unsigned order) { @@ -436,6 +553,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, dma_dom->bitmap[0] = 1; dma_dom->next_bit = 0; + /* Intialize the exclusion range if necessary */ if (iommu->exclusion_start && iommu->exclusion_start < dma_dom->aperture_size) { unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT; @@ -444,6 +562,11 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, dma_ops_reserve_addresses(dma_dom, startpage, pages); } + /* + * At the last step, build the page tables so we don't need to + * allocate page table pages in the dma_ops mapping/unmapping + * path. + */ num_pte_pages = dma_dom->aperture_size / (PAGE_SIZE * 512); dma_dom->pte_pages = kzalloc(num_pte_pages * sizeof(void *), GFP_KERNEL); @@ -472,6 +595,10 @@ free_dma_dom: return NULL; } +/* + * Find out the protection domain structure for a given PCI device. This + * will give us the pointer to the page table root for example. + */ static struct protection_domain *domain_for_device(u16 devid) { struct protection_domain *dom; @@ -484,6 +611,10 @@ static struct protection_domain *domain_for_device(u16 devid) return dom; } +/* + * If a device is not yet associated with a domain, this function does + * assigns it visible for the hardware + */ static void set_device_domain(struct amd_iommu *iommu, struct protection_domain *domain, u16 devid) @@ -508,6 +639,19 @@ static void set_device_domain(struct amd_iommu *iommu, iommu->need_sync = 1; } +/***************************************************************************** + * + * The next functions belong to the dma_ops mapping/unmapping code. + * + *****************************************************************************/ + +/* + * In the dma_ops path we only have the struct device. This function + * finds the corresponding IOMMU, the protection domain and the + * requestor id for a given device. + * If the device is not yet associated with a domain this is also done + * in this function. + */ static int get_device_resources(struct device *dev, struct amd_iommu **iommu, struct protection_domain **domain, @@ -520,8 +664,9 @@ static int get_device_resources(struct device *dev, BUG_ON(!dev || dev->bus != &pci_bus_type || !dev->dma_mask); pcidev = to_pci_dev(dev); - _bdf = (pcidev->bus->number << 8) | pcidev->devfn; + _bdf = calc_devid(pcidev->bus->number, pcidev->devfn); + /* device not translated by any IOMMU in the system? */ if (_bdf >= amd_iommu_last_bdf) { *iommu = NULL; *domain = NULL; @@ -547,6 +692,10 @@ static int get_device_resources(struct device *dev, return 1; } +/* + * This is the generic map function. It maps one 4kb page at paddr to + * the given address in the DMA address space for the domain. + */ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu, struct dma_ops_domain *dom, unsigned long address, @@ -578,6 +727,9 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu, return (dma_addr_t)address; } +/* + * The generic unmapping function for on page in the DMA address space. + */ static void dma_ops_domain_unmap(struct amd_iommu *iommu, struct dma_ops_domain *dom, unsigned long address) @@ -597,6 +749,12 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu, *pte = 0ULL; } +/* + * This function contains common code for mapping of a physically + * contiguous memory region into DMA address space. It is uses by all + * mapping functions provided by this IOMMU driver. + * Must be called with the domain lock held. + */ static dma_addr_t __map_single(struct device *dev, struct amd_iommu *iommu, struct dma_ops_domain *dma_dom, @@ -628,6 +786,10 @@ out: return address; } +/* + * Does the reverse of the __map_single function. Must be called with + * the domain lock held too + */ static void __unmap_single(struct amd_iommu *iommu, struct dma_ops_domain *dma_dom, dma_addr_t dma_addr, @@ -652,6 +814,9 @@ static void __unmap_single(struct amd_iommu *iommu, dma_ops_free_addresses(dma_dom, dma_addr, pages); } +/* + * The exported map_single function for dma_ops. + */ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr, size_t size, int dir) { @@ -664,6 +829,7 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr, get_device_resources(dev, &iommu, &domain, &devid); if (iommu == NULL || domain == NULL) + /* device not handled by any AMD IOMMU */ return (dma_addr_t)paddr; spin_lock_irqsave(&domain->lock, flags); @@ -683,6 +849,9 @@ out: return addr; } +/* + * The exported unmap_single function for dma_ops. + */ static void unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size, int dir) { @@ -692,6 +861,7 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr, u16 devid; if (!get_device_resources(dev, &iommu, &domain, &devid)) + /* device not handled by any AMD IOMMU */ return; spin_lock_irqsave(&domain->lock, flags); @@ -706,6 +876,10 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr, spin_unlock_irqrestore(&domain->lock, flags); } +/* + * This is a special map_sg function which is used if we should map a + * device which is not handled by an AMD IOMMU in the system. + */ static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist, int nelems, int dir) { @@ -720,6 +894,10 @@ static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist, return nelems; } +/* + * The exported map_sg function for dma_ops (handles scatter-gather + * lists). + */ static int map_sg(struct device *dev, struct scatterlist *sglist, int nelems, int dir) { @@ -775,6 +953,10 @@ unmap: goto out; } +/* + * The exported map_sg function for dma_ops (handles scatter-gather + * lists). + */ static void unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems, int dir) { @@ -804,6 +986,9 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist, spin_unlock_irqrestore(&domain->lock, flags); } +/* + * The exported alloc_coherent function for dma_ops. + */ static void *alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, gfp_t flag) { @@ -851,6 +1036,11 @@ out: return virt_addr; } +/* + * The exported free_coherent function for dma_ops. + * FIXME: fix the generic x86 DMA layer so that it actually calls that + * function. + */ static void free_coherent(struct device *dev, size_t size, void *virt_addr, dma_addr_t dma_addr) { @@ -879,6 +1069,8 @@ free_mem: } /* + * The function for pre-allocating protection domains. + * * If the driver core informs the DMA layer if a driver grabs a device * we don't need to preallocate the protection domains anymore. * For now we have to. @@ -921,12 +1113,20 @@ static struct dma_mapping_ops amd_iommu_dma_ops = { .unmap_sg = unmap_sg, }; +/* + * The function which clues the AMD IOMMU driver into dma_ops. + */ int __init amd_iommu_init_dma_ops(void) { struct amd_iommu *iommu; int order = amd_iommu_aperture_order; int ret; + /* + * first allocate a default protection domain for every IOMMU we + * found in the system. Devices not assigned to any other + * protection domain will be assigned to the default one. + */ list_for_each_entry(iommu, &amd_iommu_list, list) { iommu->default_dom = dma_ops_domain_alloc(iommu, order); if (iommu->default_dom == NULL) @@ -936,6 +1136,10 @@ int __init amd_iommu_init_dma_ops(void) goto free_domains; } + /* + * If device isolation is enabled, pre-allocate the protection + * domains for each device. + */ if (amd_iommu_isolate) prealloc_protection_domains(); @@ -947,6 +1151,7 @@ int __init amd_iommu_init_dma_ops(void) gart_iommu_aperture = 0; #endif + /* Make the driver finally visible to the drivers */ dma_ops = &amd_iommu_dma_ops; return 0; diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 2a13e430437d..c9d8ff2eb130 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -25,20 +25,13 @@ #include <asm/pci-direct.h> #include <asm/amd_iommu_types.h> #include <asm/amd_iommu.h> -#include <asm/gart.h> +#include <asm/iommu.h> /* * definitions for the ACPI scanning code */ -#define UPDATE_LAST_BDF(x) do {\ - if ((x) > amd_iommu_last_bdf) \ - amd_iommu_last_bdf = (x); \ - } while (0); - -#define DEVID(bus, devfn) (((bus) << 8) | (devfn)) #define PCI_BUS(x) (((x) >> 8) & 0xff) #define IVRS_HEADER_LENGTH 48 -#define TBL_SIZE(x) (1 << (PAGE_SHIFT + get_order(amd_iommu_last_bdf * (x)))) #define ACPI_IVHD_TYPE 0x10 #define ACPI_IVMD_TYPE_ALL 0x20 @@ -71,6 +64,17 @@ #define ACPI_DEVFLAG_LINT1 0x80 #define ACPI_DEVFLAG_ATSDIS 0x10000000 +/* + * ACPI table definitions + * + * These data structures are laid over the table to parse the important values + * out of it. + */ + +/* + * structure describing one IOMMU in the ACPI table. Typically followed by one + * or more ivhd_entrys. + */ struct ivhd_header { u8 type; u8 flags; @@ -83,6 +87,10 @@ struct ivhd_header { u32 reserved; } __attribute__((packed)); +/* + * A device entry describing which devices a specific IOMMU translates and + * which requestor ids they use. + */ struct ivhd_entry { u8 type; u16 devid; @@ -90,6 +98,10 @@ struct ivhd_entry { u32 ext; } __attribute__((packed)); +/* + * An AMD IOMMU memory definition structure. It defines things like exclusion + * ranges for devices and regions that should be unity mapped. + */ struct ivmd_header { u8 type; u8 flags; @@ -103,22 +115,80 @@ struct ivmd_header { static int __initdata amd_iommu_detected; -u16 amd_iommu_last_bdf; -struct list_head amd_iommu_unity_map; -unsigned amd_iommu_aperture_order = 26; -int amd_iommu_isolate; +u16 amd_iommu_last_bdf; /* largest PCI device id we have + to handle */ +LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings + we find in ACPI */ +unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */ +int amd_iommu_isolate; /* if 1, device isolation is enabled */ + +LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the + system */ -struct list_head amd_iommu_list; +/* + * Pointer to the device table which is shared by all AMD IOMMUs + * it is indexed by the PCI device id or the HT unit id and contains + * information about the domain the device belongs to as well as the + * page table root pointer. + */ struct dev_table_entry *amd_iommu_dev_table; + +/* + * The alias table is a driver specific data structure which contains the + * mappings of the PCI device ids to the actual requestor ids on the IOMMU. + * More than one device can share the same requestor id. + */ u16 *amd_iommu_alias_table; + +/* + * The rlookup table is used to find the IOMMU which is responsible + * for a specific device. It is also indexed by the PCI device id. + */ struct amd_iommu **amd_iommu_rlookup_table; + +/* + * The pd table (protection domain table) is used to find the protection domain + * data structure a device belongs to. Indexed with the PCI device id too. + */ struct protection_domain **amd_iommu_pd_table; + +/* + * AMD IOMMU allows up to 2^16 differend protection domains. This is a bitmap + * to know which ones are already in use. + */ unsigned long *amd_iommu_pd_alloc_bitmap; -static u32 dev_table_size; -static u32 alias_table_size; -static u32 rlookup_table_size; +static u32 dev_table_size; /* size of the device table */ +static u32 alias_table_size; /* size of the alias table */ +static u32 rlookup_table_size; /* size if the rlookup table */ +static inline void update_last_devid(u16 devid) +{ + if (devid > amd_iommu_last_bdf) + amd_iommu_last_bdf = devid; +} + +static inline unsigned long tbl_size(int entry_size) +{ + unsigned shift = PAGE_SHIFT + + get_order(amd_iommu_last_bdf * entry_size); + + return 1UL << shift; +} + +/**************************************************************************** + * + * AMD IOMMU MMIO register space handling functions + * + * These functions are used to program the IOMMU device registers in + * MMIO space required for that driver. + * + ****************************************************************************/ + +/* + * This function set the exclusion range in the IOMMU. DMA accesses to the + * exclusion range are passed through untranslated + */ static void __init iommu_set_exclusion_range(struct amd_iommu *iommu) { u64 start = iommu->exclusion_start & PAGE_MASK; @@ -137,6 +207,7 @@ static void __init iommu_set_exclusion_range(struct amd_iommu *iommu) &entry, sizeof(entry)); } +/* Programs the physical address of the device table into the IOMMU hardware */ static void __init iommu_set_device_table(struct amd_iommu *iommu) { u32 entry; @@ -149,6 +220,7 @@ static void __init iommu_set_device_table(struct amd_iommu *iommu) &entry, sizeof(entry)); } +/* Generic functions to enable/disable certain features of the IOMMU. */ static void __init iommu_feature_enable(struct amd_iommu *iommu, u8 bit) { u32 ctrl; @@ -167,6 +239,7 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit) writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET); } +/* Function to enable the hardware */ void __init iommu_enable(struct amd_iommu *iommu) { printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at "); @@ -176,6 +249,10 @@ void __init iommu_enable(struct amd_iommu *iommu) iommu_feature_enable(iommu, CONTROL_IOMMU_EN); } +/* + * mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in + * the system has one. + */ static u8 * __init iommu_map_mmio_space(u64 address) { u8 *ret; @@ -199,16 +276,33 @@ static void __init iommu_unmap_mmio_space(struct amd_iommu *iommu) release_mem_region(iommu->mmio_phys, MMIO_REGION_LENGTH); } +/**************************************************************************** + * + * The functions below belong to the first pass of AMD IOMMU ACPI table + * parsing. In this pass we try to find out the highest device id this + * code has to handle. Upon this information the size of the shared data + * structures is determined later. + * + ****************************************************************************/ + +/* + * This function reads the last device id the IOMMU has to handle from the PCI + * capability header for this IOMMU + */ static int __init find_last_devid_on_pci(int bus, int dev, int fn, int cap_ptr) { u32 cap; cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET); - UPDATE_LAST_BDF(DEVID(MMIO_GET_BUS(cap), MMIO_GET_LD(cap))); + update_last_devid(calc_devid(MMIO_GET_BUS(cap), MMIO_GET_LD(cap))); return 0; } +/* + * After reading the highest device id from the IOMMU PCI capability header + * this function looks if there is a higher device id defined in the ACPI table + */ static int __init find_last_devid_from_ivhd(struct ivhd_header *h) { u8 *p = (void *)h, *end = (void *)h; @@ -229,7 +323,8 @@ static int __init find_last_devid_from_ivhd(struct ivhd_header *h) case IVHD_DEV_RANGE_END: case IVHD_DEV_ALIAS: case IVHD_DEV_EXT_SELECT: - UPDATE_LAST_BDF(dev->devid); + /* all the above subfield types refer to device ids */ + update_last_devid(dev->devid); break; default: break; @@ -242,6 +337,11 @@ static int __init find_last_devid_from_ivhd(struct ivhd_header *h) return 0; } +/* + * Iterate over all IVHD entries in the ACPI table and find the highest device + * id which we need to handle. This is the first of three functions which parse + * the ACPI table. So we check the checksum here. + */ static int __init find_last_devid_acpi(struct acpi_table_header *table) { int i; @@ -277,19 +377,31 @@ static int __init find_last_devid_acpi(struct acpi_table_header *table) return 0; } +/**************************************************************************** + * + * The following functions belong the the code path which parses the ACPI table + * the second time. In this ACPI parsing iteration we allocate IOMMU specific + * data structures, initialize the device/alias/rlookup table and also + * basically initialize the hardware. + * + ****************************************************************************/ + +/* + * Allocates the command buffer. This buffer is per AMD IOMMU. We can + * write commands to that buffer later and the IOMMU will execute them + * asynchronously + */ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu) { - u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL, + u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(CMD_BUFFER_SIZE)); - u64 entry = 0; + u64 entry; if (cmd_buf == NULL) return NULL; iommu->cmd_buf_size = CMD_BUFFER_SIZE; - memset(cmd_buf, 0, CMD_BUFFER_SIZE); - entry = (u64)virt_to_phys(cmd_buf); entry |= MMIO_CMD_SIZE_512; memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET, @@ -302,11 +414,10 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu) static void __init free_command_buffer(struct amd_iommu *iommu) { - if (iommu->cmd_buf) - free_pages((unsigned long)iommu->cmd_buf, - get_order(CMD_BUFFER_SIZE)); + free_pages((unsigned long)iommu->cmd_buf, get_order(CMD_BUFFER_SIZE)); } +/* sets a specific bit in the device table entry. */ static void set_dev_entry_bit(u16 devid, u8 bit) { int i = (bit >> 5) & 0x07; @@ -315,7 +426,18 @@ static void set_dev_entry_bit(u16 devid, u8 bit) amd_iommu_dev_table[devid].data[i] |= (1 << _bit); } -static void __init set_dev_entry_from_acpi(u16 devid, u32 flags, u32 ext_flags) +/* Writes the specific IOMMU for a device into the rlookup table */ +static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid) +{ + amd_iommu_rlookup_table[devid] = iommu; +} + +/* + * This function takes the device specific flags read from the ACPI + * table and sets up the device table entry with that information + */ +static void __init set_dev_entry_from_acpi(struct amd_iommu *iommu, + u16 devid, u32 flags, u32 ext_flags) { if (flags & ACPI_DEVFLAG_INITPASS) set_dev_entry_bit(devid, DEV_ENTRY_INIT_PASS); @@ -331,13 +453,14 @@ static void __init set_dev_entry_from_acpi(u16 devid, u32 flags, u32 ext_flags) set_dev_entry_bit(devid, DEV_ENTRY_LINT0_PASS); if (flags & ACPI_DEVFLAG_LINT1) set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS); -} -static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid) -{ - amd_iommu_rlookup_table[devid] = iommu; + set_iommu_for_device(iommu, devid); } +/* + * Reads the device exclusion range from ACPI and initialize IOMMU with + * it + */ static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m) { struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; @@ -346,12 +469,22 @@ static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m) return; if (iommu) { + /* + * We only can configure exclusion ranges per IOMMU, not + * per device. But we can enable the exclusion range per + * device. This is done here + */ set_dev_entry_bit(m->devid, DEV_ENTRY_EX); iommu->exclusion_start = m->range_start; iommu->exclusion_length = m->range_length; } } +/* + * This function reads some important data from the IOMMU PCI space and + * initializes the driver data structure with it. It reads the hardware + * capabilities and the first/last device entries + */ static void __init init_iommu_from_pci(struct amd_iommu *iommu) { int bus = PCI_BUS(iommu->devid); @@ -363,10 +496,16 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu) iommu->cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_CAP_HDR_OFFSET); range = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET); - iommu->first_device = DEVID(MMIO_GET_BUS(range), MMIO_GET_FD(range)); - iommu->last_device = DEVID(MMIO_GET_BUS(range), MMIO_GET_LD(range)); + iommu->first_device = calc_devid(MMIO_GET_BUS(range), + MMIO_GET_FD(range)); + iommu->last_device = calc_devid(MMIO_GET_BUS(range), + MMIO_GET_LD(range)); } +/* + * Takes a pointer to an AMD IOMMU entry in the ACPI table and + * initializes the hardware and our data structures with it. + */ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, struct ivhd_header *h) { @@ -374,7 +513,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, u8 *end = p, flags = 0; u16 dev_i, devid = 0, devid_start = 0, devid_to = 0; u32 ext_flags = 0; - bool alias = 0; + bool alias = false; struct ivhd_entry *e; /* @@ -414,22 +553,23 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, case IVHD_DEV_ALL: for (dev_i = iommu->first_device; dev_i <= iommu->last_device; ++dev_i) - set_dev_entry_from_acpi(dev_i, e->flags, 0); + set_dev_entry_from_acpi(iommu, dev_i, + e->flags, 0); break; case IVHD_DEV_SELECT: devid = e->devid; - set_dev_entry_from_acpi(devid, e->flags, 0); + set_dev_entry_from_acpi(iommu, devid, e->flags, 0); break; case IVHD_DEV_SELECT_RANGE_START: devid_start = e->devid; flags = e->flags; ext_flags = 0; - alias = 0; + alias = false; break; case IVHD_DEV_ALIAS: devid = e->devid; devid_to = e->ext >> 8; - set_dev_entry_from_acpi(devid, e->flags, 0); + set_dev_entry_from_acpi(iommu, devid, e->flags, 0); amd_iommu_alias_table[devid] = devid_to; break; case IVHD_DEV_ALIAS_RANGE: @@ -437,24 +577,25 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, flags = e->flags; devid_to = e->ext >> 8; ext_flags = 0; - alias = 1; + alias = true; break; case IVHD_DEV_EXT_SELECT: devid = e->devid; - set_dev_entry_from_acpi(devid, e->flags, e->ext); + set_dev_entry_from_acpi(iommu, devid, e->flags, + e->ext); break; case IVHD_DEV_EXT_SELECT_RANGE: devid_start = e->devid; flags = e->flags; ext_flags = e->ext; - alias = 0; + alias = false; break; case IVHD_DEV_RANGE_END: devid = e->devid; for (dev_i = devid_start; dev_i <= devid; ++dev_i) { if (alias) amd_iommu_alias_table[dev_i] = devid_to; - set_dev_entry_from_acpi( + set_dev_entry_from_acpi(iommu, amd_iommu_alias_table[dev_i], flags, ext_flags); } @@ -467,6 +608,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, } } +/* Initializes the device->iommu mapping for the driver */ static int __init init_iommu_devices(struct amd_iommu *iommu) { u16 i; @@ -494,6 +636,11 @@ static void __init free_iommu_all(void) } } +/* + * This function clues the initialization function for one IOMMU + * together and also allocates the command buffer and programs the + * hardware. It does NOT enable the IOMMU. This is done afterwards. + */ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) { spin_lock_init(&iommu->lock); @@ -521,6 +668,10 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) return 0; } +/* + * Iterates over all IOMMU entries in the ACPI table, allocates the + * IOMMU structure and initializes it with init_iommu_one() + */ static int __init init_iommu_all(struct acpi_table_header *table) { u8 *p = (u8 *)table, *end = (u8 *)table; @@ -528,8 +679,6 @@ static int __init init_iommu_all(struct acpi_table_header *table) struct amd_iommu *iommu; int ret; - INIT_LIST_HEAD(&amd_iommu_list); - end += table->length; p += IVRS_HEADER_LENGTH; @@ -555,6 +704,14 @@ static int __init init_iommu_all(struct acpi_table_header *table) return 0; } +/**************************************************************************** + * + * The next functions belong to the third pass of parsing the ACPI + * table. In this last pass the memory mapping requirements are + * gathered (like exclusion and unity mapping reanges). + * + ****************************************************************************/ + static void __init free_unity_maps(void) { struct unity_map_entry *entry, *next; @@ -565,6 +722,7 @@ static void __init free_unity_maps(void) } } +/* called when we find an exclusion range definition in ACPI */ static int __init init_exclusion_range(struct ivmd_header *m) { int i; @@ -588,6 +746,7 @@ static int __init init_exclusion_range(struct ivmd_header *m) return 0; } +/* called for unity map ACPI definition */ static int __init init_unity_map_range(struct ivmd_header *m) { struct unity_map_entry *e = 0; @@ -619,13 +778,12 @@ static int __init init_unity_map_range(struct ivmd_header *m) return 0; } +/* iterates over all memory definitions we find in the ACPI table */ static int __init init_memory_definitions(struct acpi_table_header *table) { u8 *p = (u8 *)table, *end = (u8 *)table; struct ivmd_header *m; - INIT_LIST_HEAD(&amd_iommu_unity_map); - end += table->length; p += IVRS_HEADER_LENGTH; @@ -642,6 +800,10 @@ static int __init init_memory_definitions(struct acpi_table_header *table) return 0; } +/* + * This function finally enables all IOMMUs found in the system after + * they have been initialized + */ static void __init enable_iommus(void) { struct amd_iommu *iommu; @@ -678,6 +840,34 @@ static struct sys_device device_amd_iommu = { .cls = &amd_iommu_sysdev_class, }; +/* + * This is the core init function for AMD IOMMU hardware in the system. + * This function is called from the generic x86 DMA layer initialization + * code. + * + * This function basically parses the ACPI table for AMD IOMMU (IVRS) + * three times: + * + * 1 pass) Find the highest PCI device id the driver has to handle. + * Upon this information the size of the data structures is + * determined that needs to be allocated. + * + * 2 pass) Initialize the data structures just allocated with the + * information in the ACPI table about available AMD IOMMUs + * in the system. It also maps the PCI devices in the + * system to specific IOMMUs + * + * 3 pass) After the basic data structures are allocated and + * initialized we update them with information about memory + * remapping requirements parsed out of the ACPI table in + * this last pass. + * + * After that the hardware is initialized and ready to go. In the last + * step we do some Linux specific things like registering the driver in + * the dma_ops interface and initializing the suspend/resume support + * functions. Finally it prints some information about AMD IOMMUs and + * the driver state and enables the hardware. + */ int __init amd_iommu_init(void) { int i, ret = 0; @@ -699,14 +889,14 @@ int __init amd_iommu_init(void) if (acpi_table_parse("IVRS", find_last_devid_acpi) != 0) return -ENODEV; - dev_table_size = TBL_SIZE(DEV_TABLE_ENTRY_SIZE); - alias_table_size = TBL_SIZE(ALIAS_TABLE_ENTRY_SIZE); - rlookup_table_size = TBL_SIZE(RLOOKUP_TABLE_ENTRY_SIZE); + dev_table_size = tbl_size(DEV_TABLE_ENTRY_SIZE); + alias_table_size = tbl_size(ALIAS_TABLE_ENTRY_SIZE); + rlookup_table_size = tbl_size(RLOOKUP_TABLE_ENTRY_SIZE); ret = -ENOMEM; /* Device table - directly used by all IOMMUs */ - amd_iommu_dev_table = (void *)__get_free_pages(GFP_KERNEL, + amd_iommu_dev_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(dev_table_size)); if (amd_iommu_dev_table == NULL) goto out; @@ -730,27 +920,23 @@ int __init amd_iommu_init(void) * Protection Domain table - maps devices to protection domains * This table has the same size as the rlookup_table */ - amd_iommu_pd_table = (void *)__get_free_pages(GFP_KERNEL, + amd_iommu_pd_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(rlookup_table_size)); if (amd_iommu_pd_table == NULL) goto free; - amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages(GFP_KERNEL, + amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages( + GFP_KERNEL | __GFP_ZERO, get_order(MAX_DOMAIN_ID/8)); if (amd_iommu_pd_alloc_bitmap == NULL) goto free; /* - * memory is allocated now; initialize the device table with all zeroes - * and let all alias entries point to itself + * let all alias entries point to itself */ - memset(amd_iommu_dev_table, 0, dev_table_size); for (i = 0; i < amd_iommu_last_bdf; ++i) amd_iommu_alias_table[i] = i; - memset(amd_iommu_pd_table, 0, rlookup_table_size); - memset(amd_iommu_pd_alloc_bitmap, 0, MAX_DOMAIN_ID / 8); - /* * never allocate domain 0 because its used as the non-allocated and * error value placeholder @@ -795,24 +981,19 @@ out: return ret; free: - if (amd_iommu_pd_alloc_bitmap) - free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, 1); + free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, 1); - if (amd_iommu_pd_table) - free_pages((unsigned long)amd_iommu_pd_table, - get_order(rlookup_table_size)); + free_pages((unsigned long)amd_iommu_pd_table, + get_order(rlookup_table_size)); - if (amd_iommu_rlookup_table) - free_pages((unsigned long)amd_iommu_rlookup_table, - get_order(rlookup_table_size)); + free_pages((unsigned long)amd_iommu_rlookup_table, + get_order(rlookup_table_size)); - if (amd_iommu_alias_table) - free_pages((unsigned long)amd_iommu_alias_table, - get_order(alias_table_size)); + free_pages((unsigned long)amd_iommu_alias_table, + get_order(alias_table_size)); - if (amd_iommu_dev_table) - free_pages((unsigned long)amd_iommu_dev_table, - get_order(dev_table_size)); + free_pages((unsigned long)amd_iommu_dev_table, + get_order(dev_table_size)); free_iommu_all(); @@ -821,6 +1002,13 @@ free: goto out; } +/**************************************************************************** + * + * Early detect code. This code runs at IOMMU detection time in the DMA + * layer. It just looks if there is an IVRS ACPI table to detect AMD + * IOMMUs + * + ****************************************************************************/ static int __init early_amd_iommu_detect(struct acpi_table_header *table) { return 0; @@ -828,7 +1016,7 @@ static int __init early_amd_iommu_detect(struct acpi_table_header *table) void __init amd_iommu_detect(void) { - if (swiotlb || no_iommu || iommu_detected) + if (swiotlb || no_iommu || (iommu_detected && !gart_iommu_aperture)) return; if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) { @@ -841,6 +1029,13 @@ void __init amd_iommu_detect(void) } } +/**************************************************************************** + * + * Parsing functions for the AMD IOMMU specific kernel command line + * options. + * + ****************************************************************************/ + static int __init parse_amd_iommu_options(char *str) { for (; *str; ++str) { @@ -853,20 +1048,10 @@ static int __init parse_amd_iommu_options(char *str) static int __init parse_amd_iommu_size_options(char *str) { - for (; *str; ++str) { - if (strcmp(str, "32M") == 0) - amd_iommu_aperture_order = 25; - if (strcmp(str, "64M") == 0) - amd_iommu_aperture_order = 26; - if (strcmp(str, "128M") == 0) - amd_iommu_aperture_order = 27; - if (strcmp(str, "256M") == 0) - amd_iommu_aperture_order = 28; - if (strcmp(str, "512M") == 0) - amd_iommu_aperture_order = 29; - if (strcmp(str, "1G") == 0) - amd_iommu_aperture_order = 30; - } + unsigned order = PAGE_SHIFT + get_order(memparse(str, &str)); + + if ((order > 24) && (order < 31)) + amd_iommu_aperture_order = order; return 1; } diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 9f907806c1a5..44e21826db11 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -21,6 +21,7 @@ #include <linux/suspend.h> #include <asm/e820.h> #include <asm/io.h> +#include <asm/iommu.h> #include <asm/gart.h> #include <asm/pci-direct.h> #include <asm/dma.h> diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index a437d027f20b..d6c898358371 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -75,7 +75,7 @@ char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE}; /* * Debug level, exported for io_apic.c */ -int apic_verbosity; +unsigned int apic_verbosity; int pic_mode; @@ -177,7 +177,7 @@ void __cpuinit enable_NMI_through_LVT0(void) /* Level triggered for 82489DX */ if (!lapic_is_integrated()) v |= APIC_LVT_LEVEL_TRIGGER; - apic_write_around(APIC_LVT0, v); + apic_write(APIC_LVT0, v); } /** @@ -212,9 +212,6 @@ int lapic_get_maxlvt(void) * this function twice on the boot CPU, once with a bogus timeout * value, second time for real. The other (noncalibrating) CPUs * call this function only once, with the real, calibrated value. - * - * We do reads before writes even if unnecessary, to get around the - * P5 APIC double write bug. */ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) { @@ -229,18 +226,18 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) if (!irqen) lvtt_value |= APIC_LVT_MASKED; - apic_write_around(APIC_LVTT, lvtt_value); + apic_write(APIC_LVTT, lvtt_value); /* * Divide PICLK by 16 */ tmp_value = apic_read(APIC_TDCR); - apic_write_around(APIC_TDCR, (tmp_value - & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) - | APIC_TDR_DIV_16); + apic_write(APIC_TDCR, + (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) | + APIC_TDR_DIV_16); if (!oneshot) - apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR); + apic_write(APIC_TMICT, clocks / APIC_DIVISOR); } /* @@ -249,7 +246,7 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) static int lapic_next_event(unsigned long delta, struct clock_event_device *evt) { - apic_write_around(APIC_TMICT, delta); + apic_write(APIC_TMICT, delta); return 0; } @@ -278,7 +275,7 @@ static void lapic_timer_setup(enum clock_event_mode mode, case CLOCK_EVT_MODE_SHUTDOWN: v = apic_read(APIC_LVTT); v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); - apic_write_around(APIC_LVTT, v); + apic_write(APIC_LVTT, v); break; case CLOCK_EVT_MODE_RESUME: /* Nothing to do here */ @@ -372,12 +369,7 @@ static void __init lapic_cal_handler(struct clock_event_device *dev) } } -/* - * Setup the boot APIC - * - * Calibrate and verify the result. - */ -void __init setup_boot_APIC_clock(void) +static int __init calibrate_APIC_clock(void) { struct clock_event_device *levt = &__get_cpu_var(lapic_events); const long pm_100ms = PMTMR_TICKS_PER_SEC/10; @@ -387,24 +379,6 @@ void __init setup_boot_APIC_clock(void) long delta, deltapm; int pm_referenced = 0; - /* - * The local apic timer can be disabled via the kernel - * commandline or from the CPU detection code. Register the lapic - * timer as a dummy clock event source on SMP systems, so the - * broadcast mechanism is used. On UP systems simply ignore it. - */ - if (local_apic_timer_disabled) { - /* No broadcast on UP ! */ - if (num_possible_cpus() > 1) { - lapic_clockevent.mult = 1; - setup_APIC_timer(); - } - return; - } - - apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n" - "calibrating APIC timer ...\n"); - local_irq_disable(); /* Replace the global interrupt handler */ @@ -489,8 +463,6 @@ void __init setup_boot_APIC_clock(void) calibration_result / (1000000 / HZ), calibration_result % (1000000 / HZ)); - local_apic_timer_verify_ok = 1; - /* * Do a sanity check on the APIC calibration result */ @@ -498,12 +470,11 @@ void __init setup_boot_APIC_clock(void) local_irq_enable(); printk(KERN_WARNING "APIC frequency too slow, disabling apic timer\n"); - /* No broadcast on UP ! */ - if (num_possible_cpus() > 1) - setup_APIC_timer(); - return; + return -1; } + local_apic_timer_verify_ok = 1; + /* We trust the pm timer based calibration */ if (!pm_referenced) { apic_printk(APIC_VERBOSE, "... verify APIC timer\n"); @@ -543,22 +514,55 @@ void __init setup_boot_APIC_clock(void) if (!local_apic_timer_verify_ok) { printk(KERN_WARNING "APIC timer disabled due to verification failure.\n"); + return -1; + } + + return 0; +} + +/* + * Setup the boot APIC + * + * Calibrate and verify the result. + */ +void __init setup_boot_APIC_clock(void) +{ + /* + * The local apic timer can be disabled via the kernel + * commandline or from the CPU detection code. Register the lapic + * timer as a dummy clock event source on SMP systems, so the + * broadcast mechanism is used. On UP systems simply ignore it. + */ + if (local_apic_timer_disabled) { /* No broadcast on UP ! */ - if (num_possible_cpus() == 1) - return; - } else { - /* - * If nmi_watchdog is set to IO_APIC, we need the - * PIT/HPET going. Otherwise register lapic as a dummy - * device. - */ - if (nmi_watchdog != NMI_IO_APIC) - lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; - else - printk(KERN_WARNING "APIC timer registered as dummy," - " due to nmi_watchdog=%d!\n", nmi_watchdog); + if (num_possible_cpus() > 1) { + lapic_clockevent.mult = 1; + setup_APIC_timer(); + } + return; } + apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n" + "calibrating APIC timer ...\n"); + + if (calibrate_APIC_clock()) { + /* No broadcast on UP ! */ + if (num_possible_cpus() > 1) + setup_APIC_timer(); + return; + } + + /* + * If nmi_watchdog is set to IO_APIC, we need the + * PIT/HPET going. Otherwise register lapic as a dummy + * device. + */ + if (nmi_watchdog != NMI_IO_APIC) + lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; + else + printk(KERN_WARNING "APIC timer registered as dummy," + " due to nmi_watchdog=%d!\n", nmi_watchdog); + /* Setup the lapic or request the broadcast */ setup_APIC_timer(); } @@ -693,44 +697,44 @@ void clear_local_APIC(void) */ if (maxlvt >= 3) { v = ERROR_APIC_VECTOR; /* any non-zero vector will do */ - apic_write_around(APIC_LVTERR, v | APIC_LVT_MASKED); + apic_write(APIC_LVTERR, v | APIC_LVT_MASKED); } /* * Careful: we have to set masks only first to deassert * any level-triggered sources. */ v = apic_read(APIC_LVTT); - apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED); + apic_write(APIC_LVTT, v | APIC_LVT_MASKED); v = apic_read(APIC_LVT0); - apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED); + apic_write(APIC_LVT0, v | APIC_LVT_MASKED); v = apic_read(APIC_LVT1); - apic_write_around(APIC_LVT1, v | APIC_LVT_MASKED); + apic_write(APIC_LVT1, v | APIC_LVT_MASKED); if (maxlvt >= 4) { v = apic_read(APIC_LVTPC); - apic_write_around(APIC_LVTPC, v | APIC_LVT_MASKED); + apic_write(APIC_LVTPC, v | APIC_LVT_MASKED); } /* lets not touch this if we didn't frob it */ #ifdef CONFIG_X86_MCE_P4THERMAL if (maxlvt >= 5) { v = apic_read(APIC_LVTTHMR); - apic_write_around(APIC_LVTTHMR, v | APIC_LVT_MASKED); + apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED); } #endif /* * Clean APIC state for other OSs: */ - apic_write_around(APIC_LVTT, APIC_LVT_MASKED); - apic_write_around(APIC_LVT0, APIC_LVT_MASKED); - apic_write_around(APIC_LVT1, APIC_LVT_MASKED); + apic_write(APIC_LVTT, APIC_LVT_MASKED); + apic_write(APIC_LVT0, APIC_LVT_MASKED); + apic_write(APIC_LVT1, APIC_LVT_MASKED); if (maxlvt >= 3) - apic_write_around(APIC_LVTERR, APIC_LVT_MASKED); + apic_write(APIC_LVTERR, APIC_LVT_MASKED); if (maxlvt >= 4) - apic_write_around(APIC_LVTPC, APIC_LVT_MASKED); + apic_write(APIC_LVTPC, APIC_LVT_MASKED); #ifdef CONFIG_X86_MCE_P4THERMAL if (maxlvt >= 5) - apic_write_around(APIC_LVTTHMR, APIC_LVT_MASKED); + apic_write(APIC_LVTTHMR, APIC_LVT_MASKED); #endif /* Integrated APIC (!82489DX) ? */ if (lapic_is_integrated()) { @@ -756,7 +760,7 @@ void disable_local_APIC(void) */ value = apic_read(APIC_SPIV); value &= ~APIC_SPIV_APIC_ENABLED; - apic_write_around(APIC_SPIV, value); + apic_write(APIC_SPIV, value); /* * When LAPIC was disabled by the BIOS and enabled by the kernel, @@ -865,8 +869,8 @@ void __init sync_Arb_IDs(void) apic_wait_icr_idle(); apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); - apic_write_around(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG - | APIC_DM_INIT); + apic_write(APIC_ICR, + APIC_DEST_ALLINC | APIC_INT_LEVELTRIG | APIC_DM_INIT); } /* @@ -902,16 +906,16 @@ void __init init_bsp_APIC(void) else value |= APIC_SPIV_FOCUS_DISABLED; value |= SPURIOUS_APIC_VECTOR; - apic_write_around(APIC_SPIV, value); + apic_write(APIC_SPIV, value); /* * Set up the virtual wire mode. */ - apic_write_around(APIC_LVT0, APIC_DM_EXTINT); + apic_write(APIC_LVT0, APIC_DM_EXTINT); value = APIC_DM_NMI; if (!lapic_is_integrated()) /* 82489DX */ value |= APIC_LVT_LEVEL_TRIGGER; - apic_write_around(APIC_LVT1, value); + apic_write(APIC_LVT1, value); } static void __cpuinit lapic_setup_esr(void) @@ -926,7 +930,7 @@ static void __cpuinit lapic_setup_esr(void) /* enables sending errors */ value = ERROR_APIC_VECTOR; - apic_write_around(APIC_LVTERR, value); + apic_write(APIC_LVTERR, value); /* * spec says clear errors after enabling vector. */ @@ -989,7 +993,7 @@ void __cpuinit setup_local_APIC(void) */ value = apic_read(APIC_TASKPRI); value &= ~APIC_TPRI_MASK; - apic_write_around(APIC_TASKPRI, value); + apic_write(APIC_TASKPRI, value); /* * After a crash, we no longer service the interrupts and a pending @@ -1047,7 +1051,7 @@ void __cpuinit setup_local_APIC(void) * Set spurious IRQ vector */ value |= SPURIOUS_APIC_VECTOR; - apic_write_around(APIC_SPIV, value); + apic_write(APIC_SPIV, value); /* * Set up LVT0, LVT1: @@ -1069,7 +1073,7 @@ void __cpuinit setup_local_APIC(void) apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", smp_processor_id()); } - apic_write_around(APIC_LVT0, value); + apic_write(APIC_LVT0, value); /* * only the BP should see the LINT1 NMI signal, obviously. @@ -1080,7 +1084,7 @@ void __cpuinit setup_local_APIC(void) value = APIC_DM_NMI | APIC_LVT_MASKED; if (!integrated) /* 82489DX */ value |= APIC_LVT_LEVEL_TRIGGER; - apic_write_around(APIC_LVT1, value); + apic_write(APIC_LVT1, value); } void __cpuinit end_local_APIC_setup(void) @@ -1091,7 +1095,7 @@ void __cpuinit end_local_APIC_setup(void) /* Disable the local apic timer */ value = apic_read(APIC_LVTT); value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); - apic_write_around(APIC_LVTT, value); + apic_write(APIC_LVTT, value); setup_apic_nmi_watchdog(NULL); apic_pm_activate(); @@ -1214,9 +1218,6 @@ int apic_version[MAX_APICS]; int __init APIC_init_uniprocessor(void) { - if (disable_apic) - clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); - if (!smp_found_config && !cpu_has_apic) return -1; @@ -1419,7 +1420,7 @@ void disconnect_bsp_APIC(int virt_wire_setup) value &= ~APIC_VECTOR_MASK; value |= APIC_SPIV_APIC_ENABLED; value |= 0xf; - apic_write_around(APIC_SPIV, value); + apic_write(APIC_SPIV, value); if (!virt_wire_setup) { /* @@ -1432,10 +1433,10 @@ void disconnect_bsp_APIC(int virt_wire_setup) APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); - apic_write_around(APIC_LVT0, value); + apic_write(APIC_LVT0, value); } else { /* Disable LVT0 */ - apic_write_around(APIC_LVT0, APIC_LVT_MASKED); + apic_write(APIC_LVT0, APIC_LVT_MASKED); } /* @@ -1449,7 +1450,7 @@ void disconnect_bsp_APIC(int virt_wire_setup) APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); - apic_write_around(APIC_LVT1, value); + apic_write(APIC_LVT1, value); } } @@ -1700,7 +1701,7 @@ early_param("lapic", parse_lapic); static int __init parse_nolapic(char *arg) { disable_apic = 1; - clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); + setup_clear_cpu_cap(X86_FEATURE_APIC); return 0; } early_param("nolapic", parse_nolapic); diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 1e3d32e27c14..7f1f030da7ee 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -54,7 +54,7 @@ EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); /* * Debug level, exported for io_apic.c */ -int apic_verbosity; +unsigned int apic_verbosity; /* Have we found an MP table */ int smp_found_config; @@ -314,7 +314,7 @@ static void setup_APIC_timer(void) #define TICK_COUNT 100000000 -static void __init calibrate_APIC_clock(void) +static int __init calibrate_APIC_clock(void) { unsigned apic, apic_start; unsigned long tsc, tsc_start; @@ -368,6 +368,17 @@ static void __init calibrate_APIC_clock(void) clockevent_delta2ns(0xF, &lapic_clockevent); calibration_result = result / HZ; + + /* + * Do a sanity check on the APIC calibration result + */ + if (calibration_result < (1000000 / HZ)) { + printk(KERN_WARNING + "APIC frequency too slow, disabling apic timer\n"); + return -1; + } + + return 0; } /* @@ -394,14 +405,7 @@ void __init setup_boot_APIC_clock(void) } printk(KERN_INFO "Using local APIC timer interrupts.\n"); - calibrate_APIC_clock(); - - /* - * Do a sanity check on the APIC calibration result - */ - if (calibration_result < (1000000 / HZ)) { - printk(KERN_WARNING - "APIC frequency too slow, disabling apic timer\n"); + if (calibrate_APIC_clock()) { /* No broadcast on UP ! */ if (num_possible_cpus() > 1) setup_APIC_timer(); @@ -1337,7 +1341,7 @@ early_param("apic", apic_set_verbosity); static __init int setup_disableapic(char *str) { disable_apic = 1; - clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); + setup_clear_cpu_cap(X86_FEATURE_APIC); return 0; } early_param("disableapic", setup_disableapic); diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index bacf5deeec2d..aa89387006fe 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -18,6 +18,8 @@ #include <asm/ia32.h> #include <asm/bootparam.h> +#include <xen/interface/xen.h> + #define __NO_STUBS 1 #undef __SYSCALL #undef _ASM_X86_64_UNISTD_H_ @@ -131,5 +133,14 @@ int main(void) OFFSET(BP_loadflags, boot_params, hdr.loadflags); OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); OFFSET(BP_version, boot_params, hdr.version); + + BLANK(); + DEFINE(PAGE_SIZE_asm, PAGE_SIZE); +#ifdef CONFIG_XEN + BLANK(); + OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask); + OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending); +#undef ENTRY +#endif return 0; } diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c new file mode 100644 index 000000000000..c639bd55391c --- /dev/null +++ b/arch/x86/kernel/bios_uv.c @@ -0,0 +1,48 @@ +/* + * BIOS run time interface routines. + * + * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <asm/uv/bios.h> + +const char * +x86_bios_strerror(long status) +{ + const char *str; + switch (status) { + case 0: str = "Call completed without error"; break; + case -1: str = "Not implemented"; break; + case -2: str = "Invalid argument"; break; + case -3: str = "Call completed with error"; break; + default: str = "Unknown BIOS status code"; break; + } + return str; +} + +long +x86_bios_freq_base(unsigned long which, unsigned long *ticks_per_second, + unsigned long *drift_info) +{ + struct uv_bios_retval isrv; + + BIOS_CALL(isrv, BIOS_FREQ_BASE, which, 0, 0, 0, 0, 0, 0); + *ticks_per_second = isrv.v0; + *drift_info = isrv.v1; + return isrv.status; +} +EXPORT_SYMBOL_GPL(x86_bios_freq_base); diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c index 7c36fb8a28d4..d1692b2a41ff 100644 --- a/arch/x86/kernel/cpu/amd_64.c +++ b/arch/x86/kernel/cpu/amd_64.c @@ -115,6 +115,8 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */ if (c->x86_power & (1<<8)) set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + + set_cpu_cap(c, X86_FEATURE_SYSCALL32); } static void __cpuinit init_amd(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 1b1c56bb338f..c9b58a806e85 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -131,13 +131,7 @@ static void __init check_popad(void) * (for due to lack of "invlpg" and working WP on a i386) * - In order to run on anything without a TSC, we need to be * compiled for a i486. - * - In order to support the local APIC on a buggy Pentium machine, - * we need to be compiled with CONFIG_X86_GOOD_APIC disabled, - * which happens implicitly if compiled for a Pentium or lower - * (unless an advanced selection of CPU features is used) as an - * otherwise config implies a properly working local APIC without - * the need to do extra reads from the APIC. -*/ + */ static void __init check_config(void) { @@ -151,21 +145,6 @@ static void __init check_config(void) if (boot_cpu_data.x86 == 3) panic("Kernel requires i486+ for 'invlpg' and other features"); #endif - -/* - * If we were told we had a good local APIC, check for buggy Pentia, - * i.e. all B steppings and the C2 stepping of P54C when using their - * integrated APIC (see 11AP erratum in "Pentium Processor - * Specification Update"). - */ -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_GOOD_APIC) - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL - && cpu_has_apic - && boot_cpu_data.x86 == 5 - && boot_cpu_data.x86_model == 2 - && (boot_cpu_data.x86_mask < 6 || boot_cpu_data.x86_mask == 11)) - panic("Kernel compiled for PMMX+, assumes a local APIC without the read-before-write bug!"); -#endif } diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c index 2a4475beea4a..dd6e3f15017e 100644 --- a/arch/x86/kernel/cpu/common_64.c +++ b/arch/x86/kernel/cpu/common_64.c @@ -13,6 +13,7 @@ #include <asm/i387.h> #include <asm/msr.h> #include <asm/io.h> +#include <asm/linkage.h> #include <asm/mmu_context.h> #include <asm/mtrr.h> #include <asm/mce.h> @@ -302,7 +303,6 @@ static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) c->x86_capability[2] = cpuid_edx(0x80860001); } - c->extended_cpuid_level = cpuid_eax(0x80000000); if (c->extended_cpuid_level >= 0x80000007) c->x86_power = cpuid_edx(0x80000007); @@ -313,18 +313,11 @@ static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) c->x86_phys_bits = eax & 0xff; } - /* Assume all 64-bit CPUs support 32-bit syscall */ - set_cpu_cap(c, X86_FEATURE_SYSCALL32); - if (c->x86_vendor != X86_VENDOR_UNKNOWN && cpu_devs[c->x86_vendor]->c_early_init) cpu_devs[c->x86_vendor]->c_early_init(c); validate_pat_support(c); - - /* early_param could clear that, but recall get it set again */ - if (disable_apic) - clear_cpu_cap(c, X86_FEATURE_APIC); } /* @@ -514,8 +507,7 @@ void pda_init(int cpu) } char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + - DEBUG_STKSZ] -__attribute__((section(".bss.page_aligned"))); + DEBUG_STKSZ] __page_aligned_bss; extern asmlinkage void ignore_sysret(void); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 70609efdf1da..b75f2569b8f8 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -227,6 +227,16 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) if (cpu_has_bts) ds_init_intel(c); + /* + * See if we have a good local APIC by checking for buggy Pentia, + * i.e. all B steppings and the C2 stepping of P54C when using their + * integrated APIC (see 11AP erratum in "Pentium Processor + * Specification Update"). + */ + if (cpu_has_apic && (c->x86<<8 | c->x86_model<<4) == 0x520 && + (c->x86_mask < 0x6 || c->x86_mask == 0xb)) + set_cpu_cap(c, X86_FEATURE_11AP); + #ifdef CONFIG_X86_NUMAQ numaq_tsc_disable(); #endif diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 2c8afafa18e8..ff517f0b8cc4 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -780,15 +780,14 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) } kobject_put(per_cpu(cache_kobject, cpu)); cpuid4_cache_sysfs_exit(cpu); - break; + return retval; } kobject_uevent(&(this_object->kobj), KOBJ_ADD); } - if (!retval) - cpu_set(cpu, cache_dev_map); + cpu_set(cpu, cache_dev_map); kobject_uevent(per_cpu(cache_kobject, cpu), KOBJ_ADD); - return retval; + return 0; } static void __cpuinit cache_remove_dev(struct sys_device * sys_dev) diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c index eef001ad3bde..9b60fce09f75 100644 --- a/arch/x86/kernel/cpu/mcheck/p4.c +++ b/arch/x86/kernel/cpu/mcheck/p4.c @@ -102,7 +102,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c) /* The temperature transition interrupt handler setup */ h = THERMAL_APIC_VECTOR; /* our delivery vector */ h |= (APIC_DM_FIXED | APIC_LVT_MASKED); /* we'll mask till we're ready */ - apic_write_around(APIC_LVTTHMR, h); + apic_write(APIC_LVTTHMR, h); rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03 , h); @@ -114,7 +114,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c) wrmsr(MSR_IA32_MISC_ENABLE, l | (1<<3), h); l = apic_read(APIC_LVTTHMR); - apic_write_around(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); + apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); printk(KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu); /* enable thermal throttle processing */ diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index a0e11c0cc872..4353cf5e6fac 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -16,10 +16,7 @@ #include <asm/dma.h> #include <asm/io_apic.h> #include <asm/apic.h> - -#ifdef CONFIG_GART_IOMMU -#include <asm/gart.h> -#endif +#include <asm/iommu.h> static void __init fix_hypertransport_config(int num, int slot, int func) { diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 6bc07f0f1202..cdfd94cc6b14 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -332,7 +332,7 @@ sysenter_past_esp: GET_THREAD_INFO(%ebp) /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ - testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) + testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) jnz syscall_trace_entry cmpl $(nr_syscalls), %eax jae syscall_badsys @@ -370,7 +370,7 @@ ENTRY(system_call) GET_THREAD_INFO(%ebp) # system call tracing in operation / emulation /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ - testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) + testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) jnz syscall_trace_entry cmpl $(nr_syscalls), %eax jae syscall_badsys @@ -383,10 +383,6 @@ syscall_exit: # setting need_resched or sigpending # between sampling and the iret TRACE_IRQS_OFF - testl $X86_EFLAGS_TF,PT_EFLAGS(%esp) # If tracing set singlestep flag on exit - jz no_singlestep - orl $_TIF_SINGLESTEP,TI_flags(%ebp) -no_singlestep: movl TI_flags(%ebp), %ecx testw $_TIF_ALLWORK_MASK, %cx # current->work jne syscall_exit_work @@ -514,12 +510,8 @@ END(work_pending) syscall_trace_entry: movl $-ENOSYS,PT_EAX(%esp) movl %esp, %eax - xorl %edx,%edx - call do_syscall_trace - cmpl $0, %eax - jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU, - # so must skip actual syscall - movl PT_ORIG_EAX(%esp), %eax + call syscall_trace_enter + /* What it returned is what we'll actually use. */ cmpl $(nr_syscalls), %eax jnae syscall_call jmp syscall_exit @@ -528,14 +520,13 @@ END(syscall_trace_entry) # perform syscall exit tracing ALIGN syscall_exit_work: - testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl + testb $_TIF_WORK_SYSCALL_EXIT, %cl jz work_pending TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_ANY) # could let do_syscall_trace() call + ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call # schedule() instead movl %esp, %eax - movl $1, %edx - call do_syscall_trace + call syscall_trace_leave jmp resume_userspace END(syscall_exit_work) CFI_ENDPROC @@ -1024,6 +1015,7 @@ ENDPROC(kernel_thread_helper) ENTRY(xen_sysenter_target) RING0_INT_FRAME addl $5*4, %esp /* remove xen-provided frame */ + CFI_ADJUST_CFA_OFFSET -5*4 jmp sysenter_past_esp CFI_ENDPROC diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index ae63e584c340..8410e26f4183 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -349,8 +349,7 @@ ENTRY(system_call_after_swapgs) movq %rcx,RIP-ARGOFFSET(%rsp) CFI_REL_OFFSET rip,RIP-ARGOFFSET GET_THREAD_INFO(%rcx) - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \ - TI_flags(%rcx) + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx) jnz tracesys cmpq $__NR_syscall_max,%rax ja badsys @@ -430,7 +429,12 @@ tracesys: FIXUP_TOP_OF_STACK %rdi movq %rsp,%rdi call syscall_trace_enter - LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */ + /* + * Reload arg registers from stack in case ptrace changed them. + * We don't reload %rax because syscall_trace_enter() returned + * the value it wants us to use in the table lookup. + */ + LOAD_ARGS ARGOFFSET, 1 RESTORE_REST cmpq $__NR_syscall_max,%rax ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */ @@ -483,7 +487,7 @@ int_very_careful: ENABLE_INTERRUPTS(CLBR_NONE) SAVE_REST /* Check for syscall exit trace */ - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx + testl $_TIF_WORK_SYSCALL_EXIT,%edx jz int_signal pushq %rdi CFI_ADJUST_CFA_OFFSET 8 @@ -491,7 +495,7 @@ int_very_careful: call syscall_trace_leave popq %rdi CFI_ADJUST_CFA_OFFSET -8 - andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi + andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi jmp int_restore_rest int_signal: @@ -1189,6 +1193,7 @@ END(device_not_available) /* runs on exception stack */ KPROBE_ENTRY(debug) INTR_FRAME + PARAVIRT_ADJUST_EXCEPTION_FRAME pushq $0 CFI_ADJUST_CFA_OFFSET 8 paranoidentry do_debug, DEBUG_STACK @@ -1198,6 +1203,7 @@ KPROBE_END(debug) /* runs on exception stack */ KPROBE_ENTRY(nmi) INTR_FRAME + PARAVIRT_ADJUST_EXCEPTION_FRAME pushq $-1 CFI_ADJUST_CFA_OFFSET 8 paranoidentry do_nmi, 0, 0 @@ -1211,6 +1217,7 @@ KPROBE_END(nmi) KPROBE_ENTRY(int3) INTR_FRAME + PARAVIRT_ADJUST_EXCEPTION_FRAME pushq $0 CFI_ADJUST_CFA_OFFSET 8 paranoidentry do_int3, DEBUG_STACK @@ -1237,6 +1244,7 @@ END(coprocessor_segment_overrun) /* runs on exception stack */ ENTRY(double_fault) XCPT_FRAME + PARAVIRT_ADJUST_EXCEPTION_FRAME paranoidentry do_double_fault jmp paranoid_exit1 CFI_ENDPROC @@ -1253,6 +1261,7 @@ END(segment_not_present) /* runs on exception stack */ ENTRY(stack_segment) XCPT_FRAME + PARAVIRT_ADJUST_EXCEPTION_FRAME paranoidentry do_stack_segment jmp paranoid_exit1 CFI_ENDPROC @@ -1278,6 +1287,7 @@ END(spurious_interrupt_bug) /* runs on exception stack */ ENTRY(machine_check) INTR_FRAME + PARAVIRT_ADJUST_EXCEPTION_FRAME pushq $0 CFI_ADJUST_CFA_OFFSET 8 paranoidentry do_machine_check @@ -1312,3 +1322,103 @@ KPROBE_ENTRY(ignore_sysret) sysret CFI_ENDPROC ENDPROC(ignore_sysret) + +#ifdef CONFIG_XEN +ENTRY(xen_hypervisor_callback) + zeroentry xen_do_hypervisor_callback +END(xen_hypervisor_callback) + +/* +# A note on the "critical region" in our callback handler. +# We want to avoid stacking callback handlers due to events occurring +# during handling of the last event. To do this, we keep events disabled +# until we've done all processing. HOWEVER, we must enable events before +# popping the stack frame (can't be done atomically) and so it would still +# be possible to get enough handler activations to overflow the stack. +# Although unlikely, bugs of that kind are hard to track down, so we'd +# like to avoid the possibility. +# So, on entry to the handler we detect whether we interrupted an +# existing activation in its critical region -- if so, we pop the current +# activation and restart the handler using the previous one. +*/ +ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs) + CFI_STARTPROC +/* Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will + see the correct pointer to the pt_regs */ + movq %rdi, %rsp # we don't return, adjust the stack frame + CFI_ENDPROC + CFI_DEFAULT_STACK +11: incl %gs:pda_irqcount + movq %rsp,%rbp + CFI_DEF_CFA_REGISTER rbp + cmovzq %gs:pda_irqstackptr,%rsp + pushq %rbp # backlink for old unwinder + call xen_evtchn_do_upcall + popq %rsp + CFI_DEF_CFA_REGISTER rsp + decl %gs:pda_irqcount + jmp error_exit + CFI_ENDPROC +END(do_hypervisor_callback) + +/* +# Hypervisor uses this for application faults while it executes. +# We get here for two reasons: +# 1. Fault while reloading DS, ES, FS or GS +# 2. Fault while executing IRET +# Category 1 we do not need to fix up as Xen has already reloaded all segment +# registers that could be reloaded and zeroed the others. +# Category 2 we fix up by killing the current process. We cannot use the +# normal Linux return path in this case because if we use the IRET hypercall +# to pop the stack frame we end up in an infinite loop of failsafe callbacks. +# We distinguish between categories by comparing each saved segment register +# with its current contents: any discrepancy means we in category 1. +*/ +ENTRY(xen_failsafe_callback) + framesz = (RIP-0x30) /* workaround buggy gas */ + _frame framesz + CFI_REL_OFFSET rcx, 0 + CFI_REL_OFFSET r11, 8 + movw %ds,%cx + cmpw %cx,0x10(%rsp) + CFI_REMEMBER_STATE + jne 1f + movw %es,%cx + cmpw %cx,0x18(%rsp) + jne 1f + movw %fs,%cx + cmpw %cx,0x20(%rsp) + jne 1f + movw %gs,%cx + cmpw %cx,0x28(%rsp) + jne 1f + /* All segments match their saved values => Category 2 (Bad IRET). */ + movq (%rsp),%rcx + CFI_RESTORE rcx + movq 8(%rsp),%r11 + CFI_RESTORE r11 + addq $0x30,%rsp + CFI_ADJUST_CFA_OFFSET -0x30 + pushq $0 + CFI_ADJUST_CFA_OFFSET 8 + pushq %r11 + CFI_ADJUST_CFA_OFFSET 8 + pushq %rcx + CFI_ADJUST_CFA_OFFSET 8 + jmp general_protection + CFI_RESTORE_STATE +1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */ + movq (%rsp),%rcx + CFI_RESTORE rcx + movq 8(%rsp),%r11 + CFI_RESTORE r11 + addq $0x30,%rsp + CFI_ADJUST_CFA_OFFSET -0x30 + pushq $0 + CFI_ADJUST_CFA_OFFSET 8 + SAVE_ALL + jmp error_exit + CFI_ENDPROC +END(xen_failsafe_callback) + +#endif /* CONFIG_XEN */ diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c index 711f11c30b06..3c3929340692 100644 --- a/arch/x86/kernel/genx2apic_uv_x.c +++ b/arch/x86/kernel/genx2apic_uv_x.c @@ -24,6 +24,7 @@ #include <asm/pgtable.h> #include <asm/uv/uv_mmrs.h> #include <asm/uv/uv_hub.h> +#include <asm/uv/bios.h> DEFINE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); EXPORT_PER_CPU_SYMBOL_GPL(__uv_hub_info); @@ -40,6 +41,9 @@ EXPORT_SYMBOL_GPL(uv_cpu_to_blade); short uv_possible_blades; EXPORT_SYMBOL_GPL(uv_possible_blades); +unsigned long sn_rtc_cycles_per_second; +EXPORT_SYMBOL(sn_rtc_cycles_per_second); + /* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ static cpumask_t uv_target_cpus(void) @@ -272,6 +276,23 @@ static __init void map_mmioh_high(int max_pnode) map_high("MMIOH", mmioh.s.base, shift, map_uc); } +static __init void uv_rtc_init(void) +{ + long status, ticks_per_sec, drift; + + status = + x86_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK, &ticks_per_sec, + &drift); + if (status != 0 || ticks_per_sec < 100000) { + printk(KERN_WARNING + "unable to determine platform RTC clock frequency, " + "guessing.\n"); + /* BIOS gives wrong value for clock freq. so guess */ + sn_rtc_cycles_per_second = 1000000000000UL / 30000UL; + } else + sn_rtc_cycles_per_second = ticks_per_sec; +} + static __init void uv_system_init(void) { union uvh_si_addr_map_config_u m_n_config; @@ -326,6 +347,8 @@ static __init void uv_system_init(void) gnode_upper = (((unsigned long)node_id.s.node_id) & ~((1 << n_val) - 1)) << m_val; + uv_rtc_init(); + for_each_present_cpu(cpu) { nid = cpu_to_node(cpu); pnode = uv_apicid_to_pnode(per_cpu(x86_cpu_to_apicid, cpu)); diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index c97819829146..1b318e903bf6 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -39,6 +39,13 @@ static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata; static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly; #endif +void __init x86_64_init_pda(void) +{ + _cpu_pda = __cpu_pda; + cpu_pda(0) = &_boot_cpu_pda; + pda_init(0); +} + static void __init zap_identity_mappings(void) { pgd_t *pgd = pgd_offset_k(0UL); @@ -102,9 +109,7 @@ void __init x86_64_start_kernel(char * real_mode_data) early_printk("Kernel alive\n"); - _cpu_pda = __cpu_pda; - cpu_pda(0) = &_boot_cpu_pda; - pda_init(0); + x86_64_init_pda(); early_printk("Kernel really alive\n"); diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index b07ac7b217cb..db3280afe886 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -407,6 +407,7 @@ ENTRY(phys_base) /* This must match the first entry in level2_kernel_pgt */ .quad 0x0000000000000000 +#include "../../x86/xen/xen-head.S" .section .bss, "aw", @nobits .align L1_CACHE_BYTES diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index 558abf4c796a..de9aa0e3a9c5 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -756,7 +756,7 @@ void send_IPI_self(int vector) /* * Send the IPI. The write to APIC_ICR fires this off. */ - apic_write_around(APIC_ICR, cfg); + apic_write(APIC_ICR, cfg); } #endif /* !CONFIG_SMP */ @@ -2030,7 +2030,7 @@ static void mask_lapic_irq(unsigned int irq) unsigned long v; v = apic_read(APIC_LVT0); - apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED); + apic_write(APIC_LVT0, v | APIC_LVT_MASKED); } static void unmask_lapic_irq(unsigned int irq) @@ -2038,7 +2038,7 @@ static void unmask_lapic_irq(unsigned int irq) unsigned long v; v = apic_read(APIC_LVT0); - apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED); + apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); } static struct irq_chip lapic_chip __read_mostly = { @@ -2168,7 +2168,7 @@ static inline void __init check_timer(void) * The AEOI mode will finish them in the 8259A * automatically. */ - apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); + apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); init_8259A(1); timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver)); @@ -2177,8 +2177,9 @@ static inline void __init check_timer(void) pin2 = ioapic_i8259.pin; apic2 = ioapic_i8259.apic; - printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n", - vector, apic1, pin1, apic2, pin2); + apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X " + "apic1=%d pin1=%d apic2=%d pin2=%d\n", + vector, apic1, pin1, apic2, pin2); /* * Some BIOS writers are clueless and report the ExtINTA @@ -2216,12 +2217,13 @@ static inline void __init check_timer(void) } clear_IO_APIC_pin(apic1, pin1); if (!no_pin1) - printk(KERN_ERR "..MP-BIOS bug: " - "8254 timer not connected to IO-APIC\n"); + apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: " + "8254 timer not connected to IO-APIC\n"); - printk(KERN_INFO "...trying to set up timer (IRQ0) " - "through the 8259A ... "); - printk("\n..... (found pin %d) ...", pin2); + apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer " + "(IRQ0) through the 8259A ...\n"); + apic_printk(APIC_QUIET, KERN_INFO + "..... (found apic %d pin %d) ...\n", apic2, pin2); /* * legacy devices should be connected to IO APIC #0 */ @@ -2230,7 +2232,7 @@ static inline void __init check_timer(void) unmask_IO_APIC_irq(0); enable_8259A_irq(0); if (timer_irq_works()) { - printk("works.\n"); + apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); timer_through_8259 = 1; if (nmi_watchdog == NMI_IO_APIC) { disable_8259A_irq(0); @@ -2244,44 +2246,47 @@ static inline void __init check_timer(void) */ disable_8259A_irq(0); clear_IO_APIC_pin(apic2, pin2); - printk(" failed.\n"); + apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); } if (nmi_watchdog == NMI_IO_APIC) { - printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n"); + apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work " + "through the IO-APIC - disabling NMI Watchdog!\n"); nmi_watchdog = NMI_NONE; } timer_ack = 0; - printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); + apic_printk(APIC_QUIET, KERN_INFO + "...trying to set up timer as Virtual Wire IRQ...\n"); lapic_register_intr(0, vector); - apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */ + apic_write(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */ enable_8259A_irq(0); if (timer_irq_works()) { - printk(" works.\n"); + apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); goto out; } disable_8259A_irq(0); - apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector); - printk(" failed.\n"); + apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector); + apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n"); - printk(KERN_INFO "...trying to set up timer as ExtINT IRQ..."); + apic_printk(APIC_QUIET, KERN_INFO + "...trying to set up timer as ExtINT IRQ...\n"); init_8259A(0); make_8259A_irq(0); - apic_write_around(APIC_LVT0, APIC_DM_EXTINT); + apic_write(APIC_LVT0, APIC_DM_EXTINT); unlock_ExtINT_logic(); if (timer_irq_works()) { - printk(" works.\n"); + apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); goto out; } - printk(" failed :(.\n"); + apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n"); panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " - "report. Then try booting with the 'noapic' option"); + "report. Then try booting with the 'noapic' option.\n"); out: local_irq_restore(flags); } diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 6510cde36b35..64a46affd858 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -45,6 +45,7 @@ #include <asm/proto.h> #include <asm/acpi.h> #include <asm/dma.h> +#include <asm/i8259.h> #include <asm/nmi.h> #include <asm/msidef.h> #include <asm/hypertransport.h> @@ -1696,8 +1697,9 @@ static inline void __init check_timer(void) pin2 = ioapic_i8259.pin; apic2 = ioapic_i8259.apic; - apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n", - cfg->vector, apic1, pin1, apic2, pin2); + apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X " + "apic1=%d pin1=%d apic2=%d pin2=%d\n", + cfg->vector, apic1, pin1, apic2, pin2); /* * Some BIOS writers are clueless and report the ExtINTA @@ -1735,14 +1737,13 @@ static inline void __init check_timer(void) } clear_IO_APIC_pin(apic1, pin1); if (!no_pin1) - apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: " + apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: " "8254 timer not connected to IO-APIC\n"); - apic_printk(APIC_VERBOSE,KERN_INFO - "...trying to set up timer (IRQ0) " - "through the 8259A ... "); - apic_printk(APIC_VERBOSE,"\n..... (found apic %d pin %d) ...", - apic2, pin2); + apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer " + "(IRQ0) through the 8259A ...\n"); + apic_printk(APIC_QUIET, KERN_INFO + "..... (found apic %d pin %d) ...\n", apic2, pin2); /* * legacy devices should be connected to IO APIC #0 */ @@ -1751,7 +1752,7 @@ static inline void __init check_timer(void) unmask_IO_APIC_irq(0); enable_8259A_irq(0); if (timer_irq_works()) { - apic_printk(APIC_VERBOSE," works.\n"); + apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); timer_through_8259 = 1; if (nmi_watchdog == NMI_IO_APIC) { disable_8259A_irq(0); @@ -1765,29 +1766,32 @@ static inline void __init check_timer(void) */ disable_8259A_irq(0); clear_IO_APIC_pin(apic2, pin2); - apic_printk(APIC_VERBOSE," failed.\n"); + apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); } if (nmi_watchdog == NMI_IO_APIC) { - printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n"); + apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work " + "through the IO-APIC - disabling NMI Watchdog!\n"); nmi_watchdog = NMI_NONE; } - apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); + apic_printk(APIC_QUIET, KERN_INFO + "...trying to set up timer as Virtual Wire IRQ...\n"); lapic_register_intr(0); apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ enable_8259A_irq(0); if (timer_irq_works()) { - apic_printk(APIC_VERBOSE," works.\n"); + apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); goto out; } disable_8259A_irq(0); apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); - apic_printk(APIC_VERBOSE," failed.\n"); + apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n"); - apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ..."); + apic_printk(APIC_QUIET, KERN_INFO + "...trying to set up timer as ExtINT IRQ...\n"); init_8259A(0); make_8259A_irq(0); @@ -1796,11 +1800,12 @@ static inline void __init check_timer(void) unlock_ExtINT_logic(); if (timer_irq_works()) { - apic_printk(APIC_VERBOSE," works.\n"); + apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); goto out; } - apic_printk(APIC_VERBOSE," failed :(.\n"); - panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n"); + apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n"); + panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " + "report. Then try booting with the 'noapic' option.\n"); out: local_irq_restore(flags); } diff --git a/arch/x86/kernel/ipi.c b/arch/x86/kernel/ipi.c index 9d98cda39ad9..3f7537b669d3 100644 --- a/arch/x86/kernel/ipi.c +++ b/arch/x86/kernel/ipi.c @@ -70,7 +70,7 @@ void __send_IPI_shortcut(unsigned int shortcut, int vector) /* * Send the IPI. The write to APIC_ICR fires this off. */ - apic_write_around(APIC_ICR, cfg); + apic_write(APIC_ICR, cfg); } void send_IPI_self(int vector) @@ -98,7 +98,7 @@ static inline void __send_IPI_dest_field(unsigned long mask, int vector) * prepare target chip field */ cfg = __prepare_ICR2(mask); - apic_write_around(APIC_ICR2, cfg); + apic_write(APIC_ICR2, cfg); /* * program the ICR @@ -108,7 +108,7 @@ static inline void __send_IPI_dest_field(unsigned long mask, int vector) /* * Send the IPI. The write to APIC_ICR fires this off. */ - apic_write_around(APIC_ICR, cfg); + apic_write(APIC_ICR, cfg); } /* diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 47a6f6f12478..1cf8c1fcc088 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -83,11 +83,8 @@ union irq_ctx { static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly; static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly; -static char softirq_stack[NR_CPUS * THREAD_SIZE] - __attribute__((__section__(".bss.page_aligned"))); - -static char hardirq_stack[NR_CPUS * THREAD_SIZE] - __attribute__((__section__(".bss.page_aligned"))); +static char softirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss; +static char hardirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss; static void call_on_stack(void *func, void *stack) { diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c index c03205991718..f2d43bc75514 100644 --- a/arch/x86/kernel/kdebugfs.c +++ b/arch/x86/kernel/kdebugfs.c @@ -12,9 +12,13 @@ #include <linux/init.h> #include <linux/io.h> #include <linux/mm.h> +#include <linux/module.h> #include <asm/setup.h> +struct dentry *arch_debugfs_dir; +EXPORT_SYMBOL(arch_debugfs_dir); + #ifdef CONFIG_DEBUG_BOOT_PARAMS struct setup_data_node { u64 paddr; @@ -209,6 +213,10 @@ static int __init arch_kdebugfs_init(void) { int error = 0; + arch_debugfs_dir = debugfs_create_dir("x86", NULL); + if (!arch_debugfs_dir) + return -ENOMEM; + #ifdef CONFIG_DEBUG_BOOT_PARAMS error = boot_params_kdebugfs_init(); #endif diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index b8c6743a13da..43c019f85f0d 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -860,7 +860,6 @@ static int __kprobes post_kprobe_handler(struct pt_regs *regs) resume_execution(cur, regs, kcb); regs->flags |= kcb->kprobe_saved_flags; - trace_hardirqs_fixup_flags(regs->flags); if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) { kcb->kprobe_status = KPROBE_HIT_SSDONE; diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 87edf1ceb1df..d02def06ca91 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -113,7 +113,7 @@ static void kvm_setup_secondary_clock(void) #endif #ifdef CONFIG_SMP -void __init kvm_smp_prepare_boot_cpu(void) +static void __init kvm_smp_prepare_boot_cpu(void) { WARN_ON(kvm_register_clock("primary cpu clock")); native_smp_prepare_boot_cpu(); diff --git a/arch/x86/kernel/module_64.c b/arch/x86/kernel/module_64.c index a888e67f5874..0e867676b5a5 100644 --- a/arch/x86/kernel/module_64.c +++ b/arch/x86/kernel/module_64.c @@ -150,7 +150,8 @@ int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *me) { - const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL; + const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL, + *para = NULL; char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { @@ -160,6 +161,8 @@ int module_finalize(const Elf_Ehdr *hdr, alt = s; if (!strcmp(".smp_locks", secstrings + s->sh_name)) locks= s; + if (!strcmp(".parainstructions", secstrings + s->sh_name)) + para = s; } if (alt) { @@ -175,6 +178,11 @@ int module_finalize(const Elf_Ehdr *hdr, tseg, tseg + text->sh_size); } + if (para) { + void *pseg = (void *)para->sh_addr; + apply_paravirt(pseg, pseg + para->sh_size); + } + return module_bug_finalize(hdr, sechdrs, me); } diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index ec024b3baad0..ac6d51222e7d 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -263,7 +263,7 @@ late_initcall(init_lapic_nmi_sysfs); static void __acpi_nmi_enable(void *__unused) { - apic_write_around(APIC_LVT0, APIC_DM_NMI); + apic_write(APIC_LVT0, APIC_DM_NMI); } /* @@ -277,7 +277,7 @@ void acpi_nmi_enable(void) static void __acpi_nmi_disable(void *__unused) { - apic_write_around(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); + apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); } /* @@ -448,6 +448,13 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) #ifdef CONFIG_SYSCTL +static int __init setup_unknown_nmi_panic(char *str) +{ + unknown_nmi_panic = 1; + return 1; +} +__setup("unknown_nmi_panic", setup_unknown_nmi_panic); + static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) { unsigned char reason = get_nmi_reason(); diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index e0f571d58c19..b4564d089b43 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -29,6 +29,7 @@ #include <asm/desc.h> #include <asm/setup.h> #include <asm/arch_hooks.h> +#include <asm/pgtable.h> #include <asm/time.h> #include <asm/pgalloc.h> #include <asm/irq.h> @@ -361,7 +362,6 @@ struct pv_cpu_ops pv_cpu_ops = { struct pv_apic_ops pv_apic_ops = { #ifdef CONFIG_X86_LOCAL_APIC .apic_write = native_apic_write, - .apic_write_atomic = native_apic_write_atomic, .apic_read = native_apic_read, .setup_boot_clock = setup_boot_APIC_clock, .setup_secondary_clock = setup_secondary_APIC_clock, @@ -373,6 +373,9 @@ struct pv_mmu_ops pv_mmu_ops = { #ifndef CONFIG_X86_64 .pagetable_setup_start = native_pagetable_setup_start, .pagetable_setup_done = native_pagetable_setup_done, +#else + .pagetable_setup_start = paravirt_nop, + .pagetable_setup_done = paravirt_nop, #endif .read_cr2 = native_read_cr2, diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 6959b5c45df4..151f2d171f7c 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -36,7 +36,7 @@ #include <linux/delay.h> #include <linux/scatterlist.h> #include <linux/iommu-helper.h> -#include <asm/gart.h> +#include <asm/iommu.h> #include <asm/calgary.h> #include <asm/tce.h> #include <asm/pci-direct.h> diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 702714bd1511..a4213c00dffc 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -5,7 +5,7 @@ #include <asm/proto.h> #include <asm/dma.h> -#include <asm/gart.h> +#include <asm/iommu.h> #include <asm/calgary.h> #include <asm/amd_iommu.h> @@ -113,21 +113,15 @@ void __init pci_iommu_alloc(void) * The order of these functions is important for * fall-back/fail-over reasons */ -#ifdef CONFIG_GART_IOMMU gart_iommu_hole_init(); -#endif -#ifdef CONFIG_CALGARY_IOMMU detect_calgary(); -#endif detect_intel_iommu(); amd_iommu_detect(); -#ifdef CONFIG_SWIOTLB pci_swiotlb_init(); -#endif } #endif @@ -183,9 +177,7 @@ static __init int iommu_setup(char *p) swiotlb = 1; #endif -#ifdef CONFIG_GART_IOMMU gart_parse_options(p); -#endif #ifdef CONFIG_CALGARY_IOMMU if (!strncmp(p, "calgary", 7)) @@ -499,17 +491,13 @@ EXPORT_SYMBOL(dma_free_coherent); static int __init pci_iommu_init(void) { -#ifdef CONFIG_CALGARY_IOMMU calgary_iommu_init(); -#endif intel_iommu_init(); amd_iommu_init(); -#ifdef CONFIG_GART_IOMMU gart_iommu_init(); -#endif no_iommu_init(); return 0; diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index c3fe78406d18..be60961f8695 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -32,6 +32,7 @@ #include <asm/mtrr.h> #include <asm/pgtable.h> #include <asm/proto.h> +#include <asm/iommu.h> #include <asm/gart.h> #include <asm/cacheflush.h> #include <asm/swiotlb.h> diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index aec43d56f49c..792b9179eff3 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c @@ -7,7 +7,7 @@ #include <linux/dma-mapping.h> #include <linux/scatterlist.h> -#include <asm/gart.h> +#include <asm/iommu.h> #include <asm/processor.h> #include <asm/dma.h> diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c index 82299cd1d04d..20df839b9c20 100644 --- a/arch/x86/kernel/pci-swiotlb_64.c +++ b/arch/x86/kernel/pci-swiotlb_64.c @@ -5,7 +5,7 @@ #include <linux/module.h> #include <linux/dma-mapping.h> -#include <asm/gart.h> +#include <asm/iommu.h> #include <asm/swiotlb.h> #include <asm/dma.h> diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 9f94bb1c8117..7fc4d5b0a6a0 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -15,6 +15,7 @@ unsigned long idle_nomwait; EXPORT_SYMBOL(idle_nomwait); struct kmem_cache *task_xstate_cachep; +static int force_mwait __cpuinitdata; int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index a8e53626ac9a..e8a8e1b99817 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -537,8 +537,8 @@ static inline void __switch_to_xtra(struct task_struct *prev_p, struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { - struct thread_struct *prev = &prev_p->thread, - *next = &next_p->thread; + struct thread_struct *prev = &prev_p->thread; + struct thread_struct *next = &next_p->thread; int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(init_tss, cpu); unsigned fsindex, gsindex; @@ -586,35 +586,34 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* * Switch FS and GS. + * + * Segment register != 0 always requires a reload. Also + * reload when it has changed. When prev process used 64bit + * base always reload to avoid an information leak. */ - { - /* segment register != 0 always requires a reload. - also reload when it has changed. - when prev process used 64bit base always reload - to avoid an information leak. */ - if (unlikely(fsindex | next->fsindex | prev->fs)) { - loadsegment(fs, next->fsindex); - /* check if the user used a selector != 0 - * if yes clear 64bit base, since overloaded base - * is always mapped to the Null selector - */ - if (fsindex) + if (unlikely(fsindex | next->fsindex | prev->fs)) { + loadsegment(fs, next->fsindex); + /* + * Check if the user used a selector != 0; if yes + * clear 64bit base, since overloaded base is always + * mapped to the Null selector + */ + if (fsindex) prev->fs = 0; - } - /* when next process has a 64bit base use it */ - if (next->fs) - wrmsrl(MSR_FS_BASE, next->fs); - prev->fsindex = fsindex; - - if (unlikely(gsindex | next->gsindex | prev->gs)) { - load_gs_index(next->gsindex); - if (gsindex) + } + /* when next process has a 64bit base use it */ + if (next->fs) + wrmsrl(MSR_FS_BASE, next->fs); + prev->fsindex = fsindex; + + if (unlikely(gsindex | next->gsindex | prev->gs)) { + load_gs_index(next->gsindex); + if (gsindex) prev->gs = 0; - } - if (next->gs) - wrmsrl(MSR_KERNEL_GS_BASE, next->gs); - prev->gsindex = gsindex; } + if (next->gs) + wrmsrl(MSR_KERNEL_GS_BASE, next->gs); + prev->gsindex = gsindex; /* Must be after DS reload */ unlazy_fpu(prev_p); @@ -627,7 +626,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) write_pda(pcurrent, next_p); write_pda(kernelstack, - (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET); + (unsigned long)task_stack_page(next_p) + + THREAD_SIZE - PDA_STACKOFFSET); #ifdef CONFIG_CC_STACKPROTECTOR write_pda(stack_canary, next_p->stack_canary); /* diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 77040b6070e1..e37dccce85db 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1357,8 +1357,6 @@ const struct user_regset_view *task_user_regset_view(struct task_struct *task) #endif } -#ifdef CONFIG_X86_32 - void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code) { struct siginfo info; @@ -1377,89 +1375,10 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code) force_sig_info(SIGTRAP, &info, tsk); } -/* notification of system call entry/exit - * - triggered by current->work.syscall_trace - */ -int do_syscall_trace(struct pt_regs *regs, int entryexit) -{ - int is_sysemu = test_thread_flag(TIF_SYSCALL_EMU); - /* - * With TIF_SYSCALL_EMU set we want to ignore TIF_SINGLESTEP for syscall - * interception - */ - int is_singlestep = !is_sysemu && test_thread_flag(TIF_SINGLESTEP); - int ret = 0; - - /* do the secure computing check first */ - if (!entryexit) - secure_computing(regs->orig_ax); - - if (unlikely(current->audit_context)) { - if (entryexit) - audit_syscall_exit(AUDITSC_RESULT(regs->ax), - regs->ax); - /* Debug traps, when using PTRACE_SINGLESTEP, must be sent only - * on the syscall exit path. Normally, when TIF_SYSCALL_AUDIT is - * not used, entry.S will call us only on syscall exit, not - * entry; so when TIF_SYSCALL_AUDIT is used we must avoid - * calling send_sigtrap() on syscall entry. - * - * Note that when PTRACE_SYSEMU_SINGLESTEP is used, - * is_singlestep is false, despite his name, so we will still do - * the correct thing. - */ - else if (is_singlestep) - goto out; - } - - if (!(current->ptrace & PT_PTRACED)) - goto out; - - /* If a process stops on the 1st tracepoint with SYSCALL_TRACE - * and then is resumed with SYSEMU_SINGLESTEP, it will come in - * here. We have to check this and return */ - if (is_sysemu && entryexit) - return 0; - - /* Fake a debug trap */ - if (is_singlestep) - send_sigtrap(current, regs, 0); - - if (!test_thread_flag(TIF_SYSCALL_TRACE) && !is_sysemu) - goto out; - - /* the 0x80 provides a way for the tracing parent to distinguish - between a syscall stop and SIGTRAP delivery */ - /* Note that the debugger could change the result of test_thread_flag!*/ - ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) ? 0x80:0)); - - /* - * this isn't the same as continuing with a signal, but it will do - * for normal use. strace only continues with a signal if the - * stopping signal is not SIGTRAP. -brl - */ - if (current->exit_code) { - send_sig(current->exit_code, current, 1); - current->exit_code = 0; - } - ret = is_sysemu; -out: - if (unlikely(current->audit_context) && !entryexit) - audit_syscall_entry(AUDIT_ARCH_I386, regs->orig_ax, - regs->bx, regs->cx, regs->dx, regs->si); - if (ret == 0) - return 0; - - regs->orig_ax = -1; /* force skip of syscall restarting */ - if (unlikely(current->audit_context)) - audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); - return 1; -} - -#else /* CONFIG_X86_64 */ - static void syscall_trace(struct pt_regs *regs) { + if (!(current->ptrace & PT_PTRACED)) + return; #if 0 printk("trace %s ip %lx sp %lx ax %d origrax %d caller %lx tiflags %x ptrace %x\n", @@ -1481,39 +1400,81 @@ static void syscall_trace(struct pt_regs *regs) } } -asmlinkage void syscall_trace_enter(struct pt_regs *regs) +#ifdef CONFIG_X86_32 +# define IS_IA32 1 +#elif defined CONFIG_IA32_EMULATION +# define IS_IA32 test_thread_flag(TIF_IA32) +#else +# define IS_IA32 0 +#endif + +/* + * We must return the syscall number to actually look up in the table. + * This can be -1L to skip running any syscall at all. + */ +asmregparm long syscall_trace_enter(struct pt_regs *regs) { + long ret = 0; + + /* + * If we stepped into a sysenter/syscall insn, it trapped in + * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP. + * If user-mode had set TF itself, then it's still clear from + * do_debug() and we need to set it again to restore the user + * state. If we entered on the slow path, TF was already set. + */ + if (test_thread_flag(TIF_SINGLESTEP)) + regs->flags |= X86_EFLAGS_TF; + /* do the secure computing check first */ secure_computing(regs->orig_ax); - if (test_thread_flag(TIF_SYSCALL_TRACE) - && (current->ptrace & PT_PTRACED)) + if (unlikely(test_thread_flag(TIF_SYSCALL_EMU))) + ret = -1L; + + if (ret || test_thread_flag(TIF_SYSCALL_TRACE)) syscall_trace(regs); if (unlikely(current->audit_context)) { - if (test_thread_flag(TIF_IA32)) { + if (IS_IA32) audit_syscall_entry(AUDIT_ARCH_I386, regs->orig_ax, regs->bx, regs->cx, regs->dx, regs->si); - } else { +#ifdef CONFIG_X86_64 + else audit_syscall_entry(AUDIT_ARCH_X86_64, regs->orig_ax, regs->di, regs->si, regs->dx, regs->r10); - } +#endif } + + return ret ?: regs->orig_ax; } -asmlinkage void syscall_trace_leave(struct pt_regs *regs) +asmregparm void syscall_trace_leave(struct pt_regs *regs) { if (unlikely(current->audit_context)) audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); - if ((test_thread_flag(TIF_SYSCALL_TRACE) - || test_thread_flag(TIF_SINGLESTEP)) - && (current->ptrace & PT_PTRACED)) + if (test_thread_flag(TIF_SYSCALL_TRACE)) syscall_trace(regs); -} -#endif /* CONFIG_X86_32 */ + /* + * If TIF_SYSCALL_EMU is set, we only get here because of + * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP). + * We already reported this syscall instruction in + * syscall_trace_enter(), so don't do any more now. + */ + if (unlikely(test_thread_flag(TIF_SYSCALL_EMU))) + return; + + /* + * If we are single-stepping, synthesize a trap to follow the + * system call instruction. + */ + if (test_thread_flag(TIF_SINGLESTEP) && + (current->ptrace & PT_PTRACED)) + send_sigtrap(current, regs, 0); +} diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index f8a62160e151..9dcf39c02972 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -177,6 +177,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2400"), }, }, + { /* Handle problems with rebooting on Dell T5400's */ + .callback = set_bios_reboot, + .ident = "Dell Precision T5400", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Precision WorkStation T5400"), + }, + }, { /* Handle problems with rebooting on HP laptops */ .callback = set_bios_reboot, .ident = "HP Compaq Laptop", diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 4064616cfa85..ec952aa5394a 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -92,7 +92,7 @@ #include <asm/smp.h> #include <asm/desc.h> #include <asm/dma.h> -#include <asm/gart.h> +#include <asm/iommu.h> #include <asm/mmu_context.h> #include <asm/proto.h> @@ -823,7 +823,10 @@ void __init setup_arch(char **cmdline_p) vmi_init(); #endif + paravirt_pagetable_setup_start(swapper_pg_dir); paging_init(); + paravirt_pagetable_setup_done(swapper_pg_dir); + paravirt_post_allocator_init(); #ifdef CONFIG_X86_64 map_vsyscall(); diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index d633d801f858..07faaa5109cb 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -657,12 +657,6 @@ static void do_signal(struct pt_regs *regs) void do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) { - /* Pending single-step? */ - if (thread_info_flags & _TIF_SINGLESTEP) { - regs->flags |= X86_EFLAGS_TF; - clear_thread_flag(TIF_SINGLESTEP); - } - /* deal with pending signal delivery */ if (thread_info_flags & _TIF_SIGPENDING) do_signal(regs); diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index e53b267662e7..bf87684474f1 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -487,12 +487,6 @@ static void do_signal(struct pt_regs *regs) void do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) { - /* Pending single-step? */ - if (thread_info_flags & _TIF_SINGLESTEP) { - regs->flags |= X86_EFLAGS_TF; - clear_thread_flag(TIF_SINGLESTEP); - } - #ifdef CONFIG_X86_MCE /* notify userspace of pending MCEs */ if (thread_info_flags & _TIF_MCE_NOTIFY) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index a9ca7dadc852..27640196eb7c 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -546,8 +546,8 @@ static inline void __inquire_remote_apic(int apicid) printk(KERN_CONT "a previous APIC delivery may have failed\n"); - apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid)); - apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]); + apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(apicid)); + apic_write(APIC_ICR, APIC_DM_REMRD | regs[i]); timeout = 0; do { @@ -579,11 +579,11 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip) int maxlvt; /* Target chip */ - apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid)); + apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid)); /* Boot on the stack */ /* Kick the second */ - apic_write_around(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL); + apic_write(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL); Dprintk("Waiting for send to finish...\n"); send_status = safe_apic_wait_icr_idle(); @@ -592,14 +592,9 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip) * Give the other CPU some time to accept the IPI. */ udelay(200); - /* - * Due to the Pentium erratum 3AP. - */ maxlvt = lapic_get_maxlvt(); - if (maxlvt > 3) { - apic_read_around(APIC_SPIV); + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ apic_write(APIC_ESR, 0); - } accept_status = (apic_read(APIC_ESR) & 0xEF); Dprintk("NMI sent.\n"); @@ -625,12 +620,14 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) return send_status; } + maxlvt = lapic_get_maxlvt(); + /* * Be paranoid about clearing APIC errors. */ if (APIC_INTEGRATED(apic_version[phys_apicid])) { - apic_read_around(APIC_SPIV); - apic_write(APIC_ESR, 0); + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ + apic_write(APIC_ESR, 0); apic_read(APIC_ESR); } @@ -639,13 +636,13 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) /* * Turn INIT on target chip */ - apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); + apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); /* * Send IPI */ - apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT - | APIC_DM_INIT); + apic_write(APIC_ICR, + APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT); Dprintk("Waiting for send to finish...\n"); send_status = safe_apic_wait_icr_idle(); @@ -655,10 +652,10 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) Dprintk("Deasserting INIT.\n"); /* Target chip */ - apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); + apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); /* Send IPI */ - apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT); + apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT); Dprintk("Waiting for send to finish...\n"); send_status = safe_apic_wait_icr_idle(); @@ -689,12 +686,10 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) */ Dprintk("#startup loops: %d.\n", num_starts); - maxlvt = lapic_get_maxlvt(); - for (j = 1; j <= num_starts; j++) { Dprintk("Sending STARTUP #%d.\n", j); - apic_read_around(APIC_SPIV); - apic_write(APIC_ESR, 0); + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ + apic_write(APIC_ESR, 0); apic_read(APIC_ESR); Dprintk("After apic_write.\n"); @@ -703,12 +698,11 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) */ /* Target chip */ - apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); + apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); /* Boot on the stack */ /* Kick the second */ - apic_write_around(APIC_ICR, APIC_DM_STARTUP - | (start_eip >> 12)); + apic_write(APIC_ICR, APIC_DM_STARTUP | (start_eip >> 12)); /* * Give the other CPU some time to accept the IPI. @@ -724,13 +718,8 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) * Give the other CPU some time to accept the IPI. */ udelay(200); - /* - * Due to the Pentium erratum 3AP. - */ - if (maxlvt > 3) { - apic_read_around(APIC_SPIV); + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ apic_write(APIC_ESR, 0); - } accept_status = (apic_read(APIC_ESR) & 0xEF); if (send_status || accept_status) break; @@ -768,7 +757,7 @@ static void __cpuinit do_fork_idle(struct work_struct *work) * * Must be called after the _cpu_pda pointer table is initialized. */ -static int __cpuinit get_local_pda(int cpu) +int __cpuinit get_local_pda(int cpu) { struct x8664_pda *oldpda, *newpda; unsigned long size = sizeof(struct x8664_pda); @@ -1311,7 +1300,7 @@ static void __ref remove_cpu_from_maps(int cpu) cpu_clear(cpu, cpu_callout_map); cpu_clear(cpu, cpu_callin_map); /* was set by cpu_init() */ - clear_bit(cpu, (unsigned long *)&cpu_initialized); + cpu_clear(cpu, cpu_initialized); numa_remove_cpu(cpu); } diff --git a/arch/x86/kernel/smpcommon_32.c b/arch/x86/kernel/smpcommon_32.c deleted file mode 100644 index 8b137891791f..000000000000 --- a/arch/x86/kernel/smpcommon_32.c +++ /dev/null @@ -1 +0,0 @@ - diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index 92c20fee6781..e8b9863ef8c4 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -105,6 +105,20 @@ static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs) static int enable_single_step(struct task_struct *child) { struct pt_regs *regs = task_pt_regs(child); + unsigned long oflags; + + /* + * If we stepped into a sysenter/syscall insn, it trapped in + * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP. + * If user-mode had set TF itself, then it's still clear from + * do_debug() and we need to set it again to restore the user + * state so we don't wrongly set TIF_FORCED_TF below. + * If enable_single_step() was used last and that is what + * set TIF_SINGLESTEP, then both TF and TIF_FORCED_TF are + * already set and our bookkeeping is fine. + */ + if (unlikely(test_tsk_thread_flag(child, TIF_SINGLESTEP))) + regs->flags |= X86_EFLAGS_TF; /* * Always set TIF_SINGLESTEP - this guarantees that @@ -113,11 +127,7 @@ static int enable_single_step(struct task_struct *child) */ set_tsk_thread_flag(child, TIF_SINGLESTEP); - /* - * If TF was already set, don't do anything else - */ - if (regs->flags & X86_EFLAGS_TF) - return 0; + oflags = regs->flags; /* Set TF on the kernel stack.. */ regs->flags |= X86_EFLAGS_TF; @@ -126,9 +136,22 @@ static int enable_single_step(struct task_struct *child) * ..but if TF is changed by the instruction we will trace, * don't mark it as being "us" that set it, so that we * won't clear it by hand later. + * + * Note that if we don't actually execute the popf because + * of a signal arriving right now or suchlike, we will lose + * track of the fact that it really was "us" that set it. */ - if (is_setting_trap_flag(child, regs)) + if (is_setting_trap_flag(child, regs)) { + clear_tsk_thread_flag(child, TIF_FORCED_TF); return 0; + } + + /* + * If TF was already set, check whether it was us who set it. + * If not, we should never attempt a block step. + */ + if (oflags & X86_EFLAGS_TF) + return test_tsk_thread_flag(child, TIF_FORCED_TF); set_tsk_thread_flag(child, TIF_FORCED_TF); diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 8a768973c4f0..03df8e45e5a1 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -58,6 +58,7 @@ #include <asm/nmi.h> #include <asm/smp.h> #include <asm/io.h> +#include <asm/traps.h> #include "mach_traps.h" @@ -77,26 +78,6 @@ char ignore_fpu_irq; gate_desc idt_table[256] __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, }; -asmlinkage void divide_error(void); -asmlinkage void debug(void); -asmlinkage void nmi(void); -asmlinkage void int3(void); -asmlinkage void overflow(void); -asmlinkage void bounds(void); -asmlinkage void invalid_op(void); -asmlinkage void device_not_available(void); -asmlinkage void coprocessor_segment_overrun(void); -asmlinkage void invalid_TSS(void); -asmlinkage void segment_not_present(void); -asmlinkage void stack_segment(void); -asmlinkage void general_protection(void); -asmlinkage void page_fault(void); -asmlinkage void coprocessor_error(void); -asmlinkage void simd_coprocessor_error(void); -asmlinkage void alignment_check(void); -asmlinkage void spurious_interrupt_bug(void); -asmlinkage void machine_check(void); - int panic_on_unrecovered_nmi; int kstack_depth_to_print = 24; static unsigned int code_bytes = 64; @@ -256,7 +237,7 @@ static const struct stacktrace_ops print_trace_ops = { static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, char *log_lvl) + unsigned long *stack, unsigned long bp, char *log_lvl) { dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); printk("%s =======================\n", log_lvl); @@ -383,6 +364,54 @@ int is_valid_bugaddr(unsigned long ip) return ud2 == 0x0b0f; } +static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; +static int die_owner = -1; +static unsigned int die_nest_count; + +unsigned __kprobes long oops_begin(void) +{ + unsigned long flags; + + oops_enter(); + + if (die_owner != raw_smp_processor_id()) { + console_verbose(); + raw_local_irq_save(flags); + __raw_spin_lock(&die_lock); + die_owner = smp_processor_id(); + die_nest_count = 0; + bust_spinlocks(1); + } else { + raw_local_irq_save(flags); + } + die_nest_count++; + return flags; +} + +void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) +{ + bust_spinlocks(0); + die_owner = -1; + add_taint(TAINT_DIE); + __raw_spin_unlock(&die_lock); + raw_local_irq_restore(flags); + + if (!regs) + return; + + if (kexec_should_crash(current)) + crash_kexec(regs); + + if (in_interrupt()) + panic("Fatal exception in interrupt"); + + if (panic_on_oops) + panic("Fatal exception"); + + oops_exit(); + do_exit(signr); +} + int __kprobes __die(const char *str, struct pt_regs *regs, long err) { unsigned short ss; @@ -423,31 +452,9 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err) */ void die(const char *str, struct pt_regs *regs, long err) { - static struct { - raw_spinlock_t lock; - u32 lock_owner; - int lock_owner_depth; - } die = { - .lock = __RAW_SPIN_LOCK_UNLOCKED, - .lock_owner = -1, - .lock_owner_depth = 0 - }; - unsigned long flags; - - oops_enter(); - - if (die.lock_owner != raw_smp_processor_id()) { - console_verbose(); - raw_local_irq_save(flags); - __raw_spin_lock(&die.lock); - die.lock_owner = smp_processor_id(); - die.lock_owner_depth = 0; - bust_spinlocks(1); - } else { - raw_local_irq_save(flags); - } + unsigned long flags = oops_begin(); - if (++die.lock_owner_depth < 3) { + if (die_nest_count < 3) { report_bug(regs->ip, regs); if (__die(str, regs, err)) @@ -456,26 +463,7 @@ void die(const char *str, struct pt_regs *regs, long err) printk(KERN_EMERG "Recursive die() failure, output suppressed\n"); } - bust_spinlocks(0); - die.lock_owner = -1; - add_taint(TAINT_DIE); - __raw_spin_unlock(&die.lock); - raw_local_irq_restore(flags); - - if (!regs) - return; - - if (kexec_should_crash(current)) - crash_kexec(regs); - - if (in_interrupt()) - panic("Fatal exception in interrupt"); - - if (panic_on_oops) - panic("Fatal exception"); - - oops_exit(); - do_exit(SIGSEGV); + oops_end(flags, regs, SIGSEGV); } static inline void diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index 2696a6837782..3f18d73f420c 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -51,30 +51,10 @@ #include <asm/pgalloc.h> #include <asm/proto.h> #include <asm/pda.h> +#include <asm/traps.h> #include <mach_traps.h> -asmlinkage void divide_error(void); -asmlinkage void debug(void); -asmlinkage void nmi(void); -asmlinkage void int3(void); -asmlinkage void overflow(void); -asmlinkage void bounds(void); -asmlinkage void invalid_op(void); -asmlinkage void device_not_available(void); -asmlinkage void double_fault(void); -asmlinkage void coprocessor_segment_overrun(void); -asmlinkage void invalid_TSS(void); -asmlinkage void segment_not_present(void); -asmlinkage void stack_segment(void); -asmlinkage void general_protection(void); -asmlinkage void page_fault(void); -asmlinkage void coprocessor_error(void); -asmlinkage void simd_coprocessor_error(void); -asmlinkage void alignment_check(void); -asmlinkage void spurious_interrupt_bug(void); -asmlinkage void machine_check(void); - int panic_on_unrecovered_nmi; int kstack_depth_to_print = 12; static unsigned int code_bytes = 64; @@ -355,17 +335,24 @@ static const struct stacktrace_ops print_trace_ops = { .address = print_trace_address, }; -void show_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp) +static void +show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, unsigned long bp, char *log_lvl) { printk("\nCall Trace:\n"); - dump_trace(task, regs, stack, bp, &print_trace_ops, NULL); + dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); printk("\n"); } +void show_trace(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, unsigned long bp) +{ + show_trace_log_lvl(task, regs, stack, bp, ""); +} + static void -_show_stack(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, unsigned long bp) +show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *sp, unsigned long bp, char *log_lvl) { unsigned long *stack; int i; @@ -399,12 +386,12 @@ _show_stack(struct task_struct *task, struct pt_regs *regs, printk(" %016lx", *stack++); touch_nmi_watchdog(); } - show_trace(task, regs, sp, bp); + show_trace_log_lvl(task, regs, sp, bp, log_lvl); } void show_stack(struct task_struct *task, unsigned long *sp) { - _show_stack(task, NULL, sp, 0); + show_stack_log_lvl(task, NULL, sp, 0, ""); } /* @@ -454,7 +441,8 @@ void show_registers(struct pt_regs *regs) u8 *ip; printk("Stack: "); - _show_stack(NULL, regs, (unsigned long *)sp, regs->bp); + show_stack_log_lvl(NULL, regs, (unsigned long *)sp, + regs->bp, ""); printk("\n"); printk(KERN_EMERG "Code: "); @@ -518,7 +506,7 @@ unsigned __kprobes long oops_begin(void) } void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) -{ +{ die_owner = -1; bust_spinlocks(0); die_nest_count--; diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index b15346092b7b..0a1b1a9d922d 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -906,7 +906,6 @@ static inline int __init activate_vmi(void) #ifdef CONFIG_X86_LOCAL_APIC para_fill(pv_apic_ops.apic_read, APICRead); para_fill(pv_apic_ops.apic_write, APICWrite); - para_fill(pv_apic_ops.apic_write_atomic, APICWrite); #endif /* |