Diffstat (limited to 'arch/x86/kernel')
43 files changed, 2395 insertions, 1347 deletions
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index a142e77693e1..d1626364a28a 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -76,6 +76,19 @@ int acpi_fix_pin2_polarity __initdata; static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; #endif +/* + * Locks related to IOAPIC hotplug + * Hotplug side: + * ->device_hotplug_lock + * ->acpi_ioapic_lock + * ->ioapic_lock + * Interrupt mapping side: + * ->acpi_ioapic_lock + * ->ioapic_mutex + * ->ioapic_lock + */ +static DEFINE_MUTEX(acpi_ioapic_lock); + /* -------------------------------------------------------------------------- Boot-time Configuration -------------------------------------------------------------------------- */ @@ -395,10 +408,6 @@ static int mp_register_gsi(struct device *dev, u32 gsi, int trigger, if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) return gsi; - /* Don't set up the ACPI SCI because it's already set up */ - if (acpi_gbl_FADT.sci_interrupt == gsi) - return mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC); - trigger = trigger == ACPI_EDGE_SENSITIVE ? 0 : 1; polarity = polarity == ACPI_ACTIVE_HIGH ? 0 : 1; node = dev ? dev_to_node(dev) : NUMA_NO_NODE; @@ -411,7 +420,8 @@ static int mp_register_gsi(struct device *dev, u32 gsi, int trigger, if (irq < 0) return irq; - if (enable_update_mptable) + /* Don't set up the ACPI SCI because it's already set up */ + if (enable_update_mptable && acpi_gbl_FADT.sci_interrupt != gsi) mp_config_acpi_gsi(dev, gsi, trigger, polarity); return irq; @@ -424,9 +434,6 @@ static void mp_unregister_gsi(u32 gsi) if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) return; - if (acpi_gbl_FADT.sci_interrupt == gsi) - return; - irq = mp_map_gsi_to_irq(gsi, 0); if (irq > 0) mp_unmap_irq(irq); @@ -609,8 +616,10 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irqp) if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) { *irqp = gsi; } else { + mutex_lock(&acpi_ioapic_lock); irq = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC | IOAPIC_MAP_CHECK); + mutex_unlock(&acpi_ioapic_lock); if (irq < 0) return -1; *irqp = irq; @@ -650,7 +659,9 @@ static int acpi_register_gsi_ioapic(struct device *dev, u32 gsi, int irq = gsi; #ifdef CONFIG_X86_IO_APIC + mutex_lock(&acpi_ioapic_lock); irq = mp_register_gsi(dev, gsi, trigger, polarity); + mutex_unlock(&acpi_ioapic_lock); #endif return irq; @@ -659,7 +670,9 @@ static int acpi_register_gsi_ioapic(struct device *dev, u32 gsi, static void acpi_unregister_gsi_ioapic(u32 gsi) { #ifdef CONFIG_X86_IO_APIC + mutex_lock(&acpi_ioapic_lock); mp_unregister_gsi(gsi); + mutex_unlock(&acpi_ioapic_lock); #endif } @@ -690,6 +703,7 @@ void acpi_unregister_gsi(u32 gsi) } EXPORT_SYMBOL_GPL(acpi_unregister_gsi); +#ifdef CONFIG_X86_LOCAL_APIC static void __init acpi_set_irq_model_ioapic(void) { acpi_irq_model = ACPI_IRQ_MODEL_IOAPIC; @@ -697,6 +711,7 @@ static void __init acpi_set_irq_model_ioapic(void) __acpi_unregister_gsi = acpi_unregister_gsi_ioapic; acpi_ioapic = 1; } +#endif /* * ACPI based hotplug support for CPU @@ -735,13 +750,13 @@ static int _acpi_map_lsapic(acpi_handle handle, int physid, int *pcpu) } /* wrapper to silence section mismatch warning */ -int __ref acpi_map_lsapic(acpi_handle handle, int physid, int *pcpu) +int __ref acpi_map_cpu(acpi_handle handle, int physid, int *pcpu) { return _acpi_map_lsapic(handle, physid, pcpu); } -EXPORT_SYMBOL(acpi_map_lsapic); +EXPORT_SYMBOL(acpi_map_cpu); -int acpi_unmap_lsapic(int cpu) +int acpi_unmap_cpu(int cpu) { #ifdef CONFIG_ACPI_NUMA set_apicid_to_node(per_cpu(x86_cpu_to_apicid, cpu), NUMA_NO_NODE); 
@@ -753,26 +768,79 @@ int acpi_unmap_lsapic(int cpu) return (0); } - -EXPORT_SYMBOL(acpi_unmap_lsapic); +EXPORT_SYMBOL(acpi_unmap_cpu); #endif /* CONFIG_ACPI_HOTPLUG_CPU */ int acpi_register_ioapic(acpi_handle handle, u64 phys_addr, u32 gsi_base) { - /* TBD */ - return -EINVAL; -} + int ret = -ENOSYS; +#ifdef CONFIG_ACPI_HOTPLUG_IOAPIC + int ioapic_id; + u64 addr; + struct ioapic_domain_cfg cfg = { + .type = IOAPIC_DOMAIN_DYNAMIC, + .ops = &acpi_irqdomain_ops, + }; + + ioapic_id = acpi_get_ioapic_id(handle, gsi_base, &addr); + if (ioapic_id < 0) { + unsigned long long uid; + acpi_status status; + + status = acpi_evaluate_integer(handle, METHOD_NAME__UID, + NULL, &uid); + if (ACPI_FAILURE(status)) { + acpi_handle_warn(handle, "failed to get IOAPIC ID.\n"); + return -EINVAL; + } + ioapic_id = (int)uid; + } + mutex_lock(&acpi_ioapic_lock); + ret = mp_register_ioapic(ioapic_id, phys_addr, gsi_base, &cfg); + mutex_unlock(&acpi_ioapic_lock); +#endif + + return ret; +} EXPORT_SYMBOL(acpi_register_ioapic); int acpi_unregister_ioapic(acpi_handle handle, u32 gsi_base) { - /* TBD */ - return -EINVAL; -} + int ret = -ENOSYS; + +#ifdef CONFIG_ACPI_HOTPLUG_IOAPIC + mutex_lock(&acpi_ioapic_lock); + ret = mp_unregister_ioapic(gsi_base); + mutex_unlock(&acpi_ioapic_lock); +#endif + return ret; +} EXPORT_SYMBOL(acpi_unregister_ioapic); +/** + * acpi_ioapic_registered - Check whether IOAPIC assoicatied with @gsi_base + * has been registered + * @handle: ACPI handle of the IOAPIC deivce + * @gsi_base: GSI base associated with the IOAPIC + * + * Assume caller holds some type of lock to serialize acpi_ioapic_registered() + * with acpi_register_ioapic()/acpi_unregister_ioapic(). + */ +int acpi_ioapic_registered(acpi_handle handle, u32 gsi_base) +{ + int ret = 0; + +#ifdef CONFIG_ACPI_HOTPLUG_IOAPIC + mutex_lock(&acpi_ioapic_lock); + ret = mp_ioapic_registered(gsi_base); + mutex_unlock(&acpi_ioapic_lock); +#endif + + return ret; +} + static int __init acpi_parse_sbf(struct acpi_table_header *table) { struct acpi_table_boot *sb; @@ -1185,7 +1253,9 @@ static void __init acpi_process_madt(void) /* * Parse MADT IO-APIC entries */ + mutex_lock(&acpi_ioapic_lock); error = acpi_parse_madt_ioapic_entries(); + mutex_unlock(&acpi_ioapic_lock); if (!error) { acpi_set_irq_model_ioapic(); diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index dcb5b15401ce..8bb12ddc5db8 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -2,10 +2,12 @@ # Makefile for local APIC drivers and for the IO-APIC code # -obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o ipi.o +obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o ipi.o vector.o obj-y += hw_nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o +obj-$(CONFIG_PCI_MSI) += msi.o +obj-$(CONFIG_HT_IRQ) += htirq.o obj-$(CONFIG_SMP) += ipi.o ifeq ($(CONFIG_X86_64),y) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index ba6cc041edb1..29b5b18afa27 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -196,7 +196,7 @@ static int disable_apic_timer __initdata; int local_apic_timer_c2_ok; EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); -int first_system_vector = 0xfe; +int first_system_vector = FIRST_SYSTEM_VECTOR; /* * Debug level, exported for io_apic.c @@ -1930,7 +1930,7 @@ int __init APIC_init_uniprocessor(void) /* * This interrupt should _never_ happen with our APIC/SMP architecture */ -static inline void __smp_spurious_interrupt(void) +static inline void __smp_spurious_interrupt(u8 vector) { u32 v; @@ 
-1939,30 +1939,32 @@ static inline void __smp_spurious_interrupt(void) * if it is a vectored one. Just in case... * Spurious interrupts should not be ACKed. */ - v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1)); - if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) + v = apic_read(APIC_ISR + ((vector & ~0x1f) >> 1)); + if (v & (1 << (vector & 0x1f))) ack_APIC_irq(); inc_irq_stat(irq_spurious_count); /* see sw-dev-man vol 3, chapter 7.4.13.5 */ - pr_info("spurious APIC interrupt on CPU#%d, " - "should never happen.\n", smp_processor_id()); + pr_info("spurious APIC interrupt through vector %02x on CPU#%d, " + "should never happen.\n", vector, smp_processor_id()); } __visible void smp_spurious_interrupt(struct pt_regs *regs) { entering_irq(); - __smp_spurious_interrupt(); + __smp_spurious_interrupt(~regs->orig_ax); exiting_irq(); } __visible void smp_trace_spurious_interrupt(struct pt_regs *regs) { + u8 vector = ~regs->orig_ax; + entering_irq(); - trace_spurious_apic_entry(SPURIOUS_APIC_VECTOR); - __smp_spurious_interrupt(); - trace_spurious_apic_exit(SPURIOUS_APIC_VECTOR); + trace_spurious_apic_entry(vector); + __smp_spurious_interrupt(vector); + trace_spurious_apic_exit(vector); exiting_irq(); } diff --git a/arch/x86/kernel/apic/htirq.c b/arch/x86/kernel/apic/htirq.c new file mode 100644 index 000000000000..816f36e979ad --- /dev/null +++ b/arch/x86/kernel/apic/htirq.c @@ -0,0 +1,107 @@ +/* + * Support Hypertransport IRQ + * + * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo + * Moved from arch/x86/kernel/apic/io_apic.c. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/mm.h> +#include <linux/interrupt.h> +#include <linux/init.h> +#include <linux/device.h> +#include <linux/pci.h> +#include <linux/htirq.h> +#include <asm/hw_irq.h> +#include <asm/apic.h> +#include <asm/hypertransport.h> + +/* + * Hypertransport interrupt support + */ +static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector) +{ + struct ht_irq_msg msg; + + fetch_ht_irq_msg(irq, &msg); + + msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK | HT_IRQ_LOW_DEST_ID_MASK); + msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK); + + msg.address_lo |= HT_IRQ_LOW_VECTOR(vector) | HT_IRQ_LOW_DEST_ID(dest); + msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest); + + write_ht_irq_msg(irq, &msg); +} + +static int +ht_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) +{ + struct irq_cfg *cfg = irqd_cfg(data); + unsigned int dest; + int ret; + + ret = apic_set_affinity(data, mask, &dest); + if (ret) + return ret; + + target_ht_irq(data->irq, dest, cfg->vector); + return IRQ_SET_MASK_OK_NOCOPY; +} + +static struct irq_chip ht_irq_chip = { + .name = "PCI-HT", + .irq_mask = mask_ht_irq, + .irq_unmask = unmask_ht_irq, + .irq_ack = apic_ack_edge, + .irq_set_affinity = ht_set_affinity, + .irq_retrigger = apic_retrigger_irq, + .flags = IRQCHIP_SKIP_SET_WAKE, +}; + +int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) +{ + struct irq_cfg *cfg; + struct ht_irq_msg msg; + unsigned dest; + int err; + + if (disable_apic) + return -ENXIO; + + cfg = irq_cfg(irq); + err = assign_irq_vector(irq, cfg, apic->target_cpus()); + if (err) + return err; + + err = apic->cpu_mask_to_apicid_and(cfg->domain, + apic->target_cpus(), &dest); + if (err) + return err; + + msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); + + msg.address_lo = + HT_IRQ_LOW_BASE 
| + HT_IRQ_LOW_DEST_ID(dest) | + HT_IRQ_LOW_VECTOR(cfg->vector) | + ((apic->irq_dest_mode == 0) ? + HT_IRQ_LOW_DM_PHYSICAL : + HT_IRQ_LOW_DM_LOGICAL) | + HT_IRQ_LOW_RQEOI_EDGE | + ((apic->irq_delivery_mode != dest_LowestPrio) ? + HT_IRQ_LOW_MT_FIXED : + HT_IRQ_LOW_MT_ARBITRATED) | + HT_IRQ_LOW_IRQ_MASKED; + + write_ht_irq_msg(irq, &msg); + + irq_set_chip_and_handler_name(irq, &ht_irq_chip, + handle_edge_irq, "edge"); + + dev_dbg(&dev->dev, "irq %d for HT\n", irq); + + return 0; +} diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 6a1e71bde323..6873ab925d00 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -18,6 +18,7 @@ #include <linux/nmi.h> #include <linux/module.h> #include <linux/delay.h> +#include <linux/seq_buf.h> #ifdef CONFIG_HARDLOCKUP_DETECTOR u64 hw_nmi_get_sample_period(int watchdog_thresh) @@ -29,14 +30,35 @@ u64 hw_nmi_get_sample_period(int watchdog_thresh) #ifdef arch_trigger_all_cpu_backtrace /* For reliability, we're prepared to waste bits here. */ static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; +static cpumask_t printtrace_mask; + +#define NMI_BUF_SIZE 4096 + +struct nmi_seq_buf { + unsigned char buffer[NMI_BUF_SIZE]; + struct seq_buf seq; +}; + +/* Safe printing in NMI context */ +static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq); /* "in progress" flag of arch_trigger_all_cpu_backtrace */ static unsigned long backtrace_flag; +static void print_seq_line(struct nmi_seq_buf *s, int start, int end) +{ + const char *buf = s->buffer + start; + + printk("%.*s", (end - start) + 1, buf); +} + void arch_trigger_all_cpu_backtrace(bool include_self) { + struct nmi_seq_buf *s; + int len; + int cpu; int i; - int cpu = get_cpu(); + int this_cpu = get_cpu(); if (test_and_set_bit(0, &backtrace_flag)) { /* @@ -49,7 +71,17 @@ void arch_trigger_all_cpu_backtrace(bool include_self) cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask); if (!include_self) - cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); + cpumask_clear_cpu(this_cpu, to_cpumask(backtrace_mask)); + + cpumask_copy(&printtrace_mask, to_cpumask(backtrace_mask)); + /* + * Set up per_cpu seq_buf buffers that the NMIs running on the other + * CPUs will write to. + */ + for_each_cpu(cpu, to_cpumask(backtrace_mask)) { + s = &per_cpu(nmi_print_seq, cpu); + seq_buf_init(&s->seq, s->buffer, NMI_BUF_SIZE); + } if (!cpumask_empty(to_cpumask(backtrace_mask))) { pr_info("sending NMI to %s CPUs:\n", @@ -65,11 +97,58 @@ void arch_trigger_all_cpu_backtrace(bool include_self) touch_softlockup_watchdog(); } + /* + * Now that all the NMIs have triggered, we can dump out their + * back traces safely to the console. + */ + for_each_cpu(cpu, &printtrace_mask) { + int last_i = 0; + + s = &per_cpu(nmi_print_seq, cpu); + len = seq_buf_used(&s->seq); + if (!len) + continue; + + /* Print line by line. */ + for (i = 0; i < len; i++) { + if (s->buffer[i] == '\n') { + print_seq_line(s, last_i, i); + last_i = i + 1; + } + } + /* Check if there was a partial line. */ + if (last_i < len) { + print_seq_line(s, last_i, len - 1); + pr_cont("\n"); + } + } + clear_bit(0, &backtrace_flag); smp_mb__after_atomic(); put_cpu(); } +/* + * It is not safe to call printk() directly from NMI handlers. + * It may be fine if the NMI detected a lock up and we have no choice + * but to do so, but doing a NMI on all other CPUs to get a back trace + * can be done with a sysrq-l. We don't want that to lock up, which + * can happen if the NMI interrupts a printk in progress. 
+ * + * Instead, we redirect the vprintk() to this nmi_vprintk() that writes + * the content into a per cpu seq_buf buffer. Then when the NMIs are + * all done, we can safely dump the contents of the seq_buf to a printk() + * from a non NMI context. + */ +static int nmi_vprintk(const char *fmt, va_list args) +{ + struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq); + unsigned int len = seq_buf_used(&s->seq); + + seq_buf_vprintf(&s->seq, fmt, args); + return seq_buf_used(&s->seq) - len; +} + static int arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs) { @@ -78,12 +157,14 @@ arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs) cpu = smp_processor_id(); if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { - static arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED; + printk_func_t printk_func_save = this_cpu_read(printk_func); - arch_spin_lock(&lock); + /* Replace printk to write into the NMI seq */ + this_cpu_write(printk_func, nmi_vprintk); printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); show_regs(regs); - arch_spin_unlock(&lock); + this_cpu_write(printk_func, printk_func_save); + cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); return NMI_HANDLED; } diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 7ffe0a2b870f..3f5f60406ab1 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -32,15 +32,11 @@ #include <linux/module.h> #include <linux/syscore_ops.h> #include <linux/irqdomain.h> -#include <linux/msi.h> -#include <linux/htirq.h> #include <linux/freezer.h> #include <linux/kthread.h> #include <linux/jiffies.h> /* time_after() */ #include <linux/slab.h> #include <linux/bootmem.h> -#include <linux/dmar.h> -#include <linux/hpet.h> #include <asm/idle.h> #include <asm/io.h> @@ -52,17 +48,12 @@ #include <asm/dma.h> #include <asm/timer.h> #include <asm/i8259.h> -#include <asm/msidef.h> -#include <asm/hypertransport.h> #include <asm/setup.h> #include <asm/irq_remapping.h> -#include <asm/hpet.h> #include <asm/hw_irq.h> #include <asm/apic.h> -#define __apicdebuginit(type) static type __init - #define for_each_ioapic(idx) \ for ((idx) = 0; (idx) < nr_ioapics; (idx)++) #define for_each_ioapic_reverse(idx) \ @@ -74,7 +65,7 @@ for_each_pin((idx), (pin)) #define for_each_irq_pin(entry, head) \ - for (entry = head; entry; entry = entry->next) + list_for_each_entry(entry, &head, list) /* * Is the SiS APIC rmw bug present ? 
@@ -83,7 +74,6 @@ int sis_apic_bug = -1; static DEFINE_RAW_SPINLOCK(ioapic_lock); -static DEFINE_RAW_SPINLOCK(vector_lock); static DEFINE_MUTEX(ioapic_mutex); static unsigned int ioapic_dynirq_base; static int ioapic_initialized; @@ -112,6 +102,7 @@ static struct ioapic { struct ioapic_domain_cfg irqdomain_cfg; struct irq_domain *irqdomain; struct mp_pin_info *pin_info; + struct resource *iomem_res; } ioapics[MAX_IO_APICS]; #define mpc_ioapic_ver(ioapic_idx) ioapics[ioapic_idx].mp_config.apicver @@ -205,8 +196,6 @@ static int __init parse_noapic(char *str) } early_param("noapic", parse_noapic); -static struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node); - /* Will be called in mpparse/acpi/sfi codes for saving IRQ info */ void mp_save_irq(struct mpc_intsrc *m) { @@ -228,8 +217,8 @@ void mp_save_irq(struct mpc_intsrc *m) } struct irq_pin_list { + struct list_head list; int apic, pin; - struct irq_pin_list *next; }; static struct irq_pin_list *alloc_irq_pin_list(int node) @@ -237,7 +226,26 @@ static struct irq_pin_list *alloc_irq_pin_list(int node) return kzalloc_node(sizeof(struct irq_pin_list), GFP_KERNEL, node); } -int __init arch_early_irq_init(void) +static void alloc_ioapic_saved_registers(int idx) +{ + size_t size; + + if (ioapics[idx].saved_registers) + return; + + size = sizeof(struct IO_APIC_route_entry) * ioapics[idx].nr_registers; + ioapics[idx].saved_registers = kzalloc(size, GFP_KERNEL); + if (!ioapics[idx].saved_registers) + pr_err("IOAPIC %d: suspend/resume impossible!\n", idx); +} + +static void free_ioapic_saved_registers(int idx) +{ + kfree(ioapics[idx].saved_registers); + ioapics[idx].saved_registers = NULL; +} + +int __init arch_early_ioapic_init(void) { struct irq_cfg *cfg; int i, node = cpu_to_node(0); @@ -245,13 +253,8 @@ int __init arch_early_irq_init(void) if (!nr_legacy_irqs()) io_apic_irqs = ~0UL; - for_each_ioapic(i) { - ioapics[i].saved_registers = - kzalloc(sizeof(struct IO_APIC_route_entry) * - ioapics[i].nr_registers, GFP_KERNEL); - if (!ioapics[i].saved_registers) - pr_err("IOAPIC %d: suspend/resume impossible!\n", i); - } + for_each_ioapic(i) + alloc_ioapic_saved_registers(i); /* * For legacy IRQ's, start with assigning irq0 to irq15 to @@ -266,61 +269,6 @@ int __init arch_early_irq_init(void) return 0; } -static inline struct irq_cfg *irq_cfg(unsigned int irq) -{ - return irq_get_chip_data(irq); -} - -static struct irq_cfg *alloc_irq_cfg(unsigned int irq, int node) -{ - struct irq_cfg *cfg; - - cfg = kzalloc_node(sizeof(*cfg), GFP_KERNEL, node); - if (!cfg) - return NULL; - if (!zalloc_cpumask_var_node(&cfg->domain, GFP_KERNEL, node)) - goto out_cfg; - if (!zalloc_cpumask_var_node(&cfg->old_domain, GFP_KERNEL, node)) - goto out_domain; - return cfg; -out_domain: - free_cpumask_var(cfg->domain); -out_cfg: - kfree(cfg); - return NULL; -} - -static void free_irq_cfg(unsigned int at, struct irq_cfg *cfg) -{ - if (!cfg) - return; - irq_set_chip_data(at, NULL); - free_cpumask_var(cfg->domain); - free_cpumask_var(cfg->old_domain); - kfree(cfg); -} - -static struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node) -{ - int res = irq_alloc_desc_at(at, node); - struct irq_cfg *cfg; - - if (res < 0) { - if (res != -EEXIST) - return NULL; - cfg = irq_cfg(at); - if (cfg) - return cfg; - } - - cfg = alloc_irq_cfg(at, node); - if (cfg) - irq_set_chip_data(at, cfg); - else - irq_free_desc(at); - return cfg; -} - struct io_apic { unsigned int index; unsigned int unused[3]; @@ -445,15 +393,12 @@ static void ioapic_mask_entry(int apic, int pin) */ static 
int __add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) { - struct irq_pin_list **last, *entry; + struct irq_pin_list *entry; /* don't allow duplicates */ - last = &cfg->irq_2_pin; - for_each_irq_pin(entry, cfg->irq_2_pin) { + for_each_irq_pin(entry, cfg->irq_2_pin) if (entry->apic == apic && entry->pin == pin) return 0; - last = &entry->next; - } entry = alloc_irq_pin_list(node); if (!entry) { @@ -464,22 +409,19 @@ static int __add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pi entry->apic = apic; entry->pin = pin; - *last = entry; + list_add_tail(&entry->list, &cfg->irq_2_pin); return 0; } static void __remove_pin_from_irq(struct irq_cfg *cfg, int apic, int pin) { - struct irq_pin_list **last, *entry; + struct irq_pin_list *tmp, *entry; - last = &cfg->irq_2_pin; - for_each_irq_pin(entry, cfg->irq_2_pin) + list_for_each_entry_safe(entry, tmp, &cfg->irq_2_pin, list) if (entry->apic == apic && entry->pin == pin) { - *last = entry->next; + list_del(&entry->list); kfree(entry); return; - } else { - last = &entry->next; } } @@ -559,7 +501,7 @@ static void mask_ioapic(struct irq_cfg *cfg) static void mask_ioapic_irq(struct irq_data *data) { - mask_ioapic(data->chip_data); + mask_ioapic(irqd_cfg(data)); } static void __unmask_ioapic(struct irq_cfg *cfg) @@ -578,7 +520,7 @@ static void unmask_ioapic(struct irq_cfg *cfg) static void unmask_ioapic_irq(struct irq_data *data) { - unmask_ioapic(data->chip_data); + unmask_ioapic(irqd_cfg(data)); } /* @@ -1164,8 +1106,7 @@ void mp_unmap_irq(int irq) * Find a specific PCI IRQ entry. * Not an __init, possibly needed by modules */ -int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin, - struct io_apic_irq_attr *irq_attr) +int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) { int irq, i, best_ioapic = -1, best_idx = -1; @@ -1219,195 +1160,11 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin, return -1; out: - irq = pin_2_irq(best_idx, best_ioapic, mp_irqs[best_idx].dstirq, - IOAPIC_MAP_ALLOC); - if (irq > 0) - set_io_apic_irq_attr(irq_attr, best_ioapic, - mp_irqs[best_idx].dstirq, - irq_trigger(best_idx), - irq_polarity(best_idx)); - return irq; + return pin_2_irq(best_idx, best_ioapic, mp_irqs[best_idx].dstirq, + IOAPIC_MAP_ALLOC); } EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); -void lock_vector_lock(void) -{ - /* Used to the online set of cpus does not change - * during assign_irq_vector. - */ - raw_spin_lock(&vector_lock); -} - -void unlock_vector_lock(void) -{ - raw_spin_unlock(&vector_lock); -} - -static int -__assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) -{ - /* - * NOTE! The local APIC isn't very good at handling - * multiple interrupts at the same interrupt level. - * As the interrupt level is determined by taking the - * vector number and shifting that right by 4, we - * want to spread these out a bit so that they don't - * all fall in the same interrupt level. - * - * Also, we've got to be careful not to trash gate - * 0x80, because int 0x80 is hm, kind of importantish. 
;) - */ - static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START; - static int current_offset = VECTOR_OFFSET_START % 16; - int cpu, err; - cpumask_var_t tmp_mask; - - if (cfg->move_in_progress) - return -EBUSY; - - if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC)) - return -ENOMEM; - - /* Only try and allocate irqs on cpus that are present */ - err = -ENOSPC; - cpumask_clear(cfg->old_domain); - cpu = cpumask_first_and(mask, cpu_online_mask); - while (cpu < nr_cpu_ids) { - int new_cpu, vector, offset; - - apic->vector_allocation_domain(cpu, tmp_mask, mask); - - if (cpumask_subset(tmp_mask, cfg->domain)) { - err = 0; - if (cpumask_equal(tmp_mask, cfg->domain)) - break; - /* - * New cpumask using the vector is a proper subset of - * the current in use mask. So cleanup the vector - * allocation for the members that are not used anymore. - */ - cpumask_andnot(cfg->old_domain, cfg->domain, tmp_mask); - cfg->move_in_progress = - cpumask_intersects(cfg->old_domain, cpu_online_mask); - cpumask_and(cfg->domain, cfg->domain, tmp_mask); - break; - } - - vector = current_vector; - offset = current_offset; -next: - vector += 16; - if (vector >= first_system_vector) { - offset = (offset + 1) % 16; - vector = FIRST_EXTERNAL_VECTOR + offset; - } - - if (unlikely(current_vector == vector)) { - cpumask_or(cfg->old_domain, cfg->old_domain, tmp_mask); - cpumask_andnot(tmp_mask, mask, cfg->old_domain); - cpu = cpumask_first_and(tmp_mask, cpu_online_mask); - continue; - } - - if (test_bit(vector, used_vectors)) - goto next; - - for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) { - if (per_cpu(vector_irq, new_cpu)[vector] > VECTOR_UNDEFINED) - goto next; - } - /* Found one! */ - current_vector = vector; - current_offset = offset; - if (cfg->vector) { - cpumask_copy(cfg->old_domain, cfg->domain); - cfg->move_in_progress = - cpumask_intersects(cfg->old_domain, cpu_online_mask); - } - for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) - per_cpu(vector_irq, new_cpu)[vector] = irq; - cfg->vector = vector; - cpumask_copy(cfg->domain, tmp_mask); - err = 0; - break; - } - free_cpumask_var(tmp_mask); - return err; -} - -int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) -{ - int err; - unsigned long flags; - - raw_spin_lock_irqsave(&vector_lock, flags); - err = __assign_irq_vector(irq, cfg, mask); - raw_spin_unlock_irqrestore(&vector_lock, flags); - return err; -} - -static void __clear_irq_vector(int irq, struct irq_cfg *cfg) -{ - int cpu, vector; - - BUG_ON(!cfg->vector); - - vector = cfg->vector; - for_each_cpu_and(cpu, cfg->domain, cpu_online_mask) - per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED; - - cfg->vector = 0; - cpumask_clear(cfg->domain); - - if (likely(!cfg->move_in_progress)) - return; - for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) { - for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { - if (per_cpu(vector_irq, cpu)[vector] != irq) - continue; - per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED; - break; - } - } - cfg->move_in_progress = 0; -} - -void __setup_vector_irq(int cpu) -{ - /* Initialize vector_irq on a new cpu */ - int irq, vector; - struct irq_cfg *cfg; - - /* - * vector_lock will make sure that we don't run into irq vector - * assignments that might be happening on another cpu in parallel, - * while we setup our initial vector to irq mappings. 
- */ - raw_spin_lock(&vector_lock); - /* Mark the inuse vectors */ - for_each_active_irq(irq) { - cfg = irq_cfg(irq); - if (!cfg) - continue; - - if (!cpumask_test_cpu(cpu, cfg->domain)) - continue; - vector = cfg->vector; - per_cpu(vector_irq, cpu)[vector] = irq; - } - /* Mark the free vectors */ - for (vector = 0; vector < NR_VECTORS; ++vector) { - irq = per_cpu(vector_irq, cpu)[vector]; - if (irq <= VECTOR_UNDEFINED) - continue; - - cfg = irq_cfg(irq); - if (!cpumask_test_cpu(cpu, cfg->domain)) - per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED; - } - raw_spin_unlock(&vector_lock); -} - static struct irq_chip ioapic_chip; #ifdef CONFIG_X86_32 @@ -1496,7 +1253,7 @@ static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg, &dest)) { pr_warn("Failed to obtain apicid for ioapic %d, pin %d\n", mpc_ioapic_id(attr->ioapic), attr->ioapic_pin); - __clear_irq_vector(irq, cfg); + clear_irq_vector(irq, cfg); return; } @@ -1510,7 +1267,7 @@ static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg, if (x86_io_apic_ops.setup_entry(irq, &entry, dest, cfg->vector, attr)) { pr_warn("Failed to setup ioapic entry for ioapic %d, pin %d\n", mpc_ioapic_id(attr->ioapic), attr->ioapic_pin); - __clear_irq_vector(irq, cfg); + clear_irq_vector(irq, cfg); return; } @@ -1641,7 +1398,7 @@ void ioapic_zap_locks(void) raw_spin_lock_init(&ioapic_lock); } -__apicdebuginit(void) print_IO_APIC(int ioapic_idx) +static void __init print_IO_APIC(int ioapic_idx) { union IO_APIC_reg_00 reg_00; union IO_APIC_reg_01 reg_01; @@ -1698,7 +1455,7 @@ __apicdebuginit(void) print_IO_APIC(int ioapic_idx) x86_io_apic_ops.print_entries(ioapic_idx, reg_01.bits.entries); } -__apicdebuginit(void) print_IO_APICs(void) +void __init print_IO_APICs(void) { int ioapic_idx; struct irq_cfg *cfg; @@ -1731,8 +1488,7 @@ __apicdebuginit(void) print_IO_APICs(void) cfg = irq_cfg(irq); if (!cfg) continue; - entry = cfg->irq_2_pin; - if (!entry) + if (list_empty(&cfg->irq_2_pin)) continue; printk(KERN_DEBUG "IRQ%d ", irq); for_each_irq_pin(entry, cfg->irq_2_pin) @@ -1743,205 +1499,6 @@ __apicdebuginit(void) print_IO_APICs(void) printk(KERN_INFO ".................................... done.\n"); } -__apicdebuginit(void) print_APIC_field(int base) -{ - int i; - - printk(KERN_DEBUG); - - for (i = 0; i < 8; i++) - pr_cont("%08x", apic_read(base + i*0x10)); - - pr_cont("\n"); -} - -__apicdebuginit(void) print_local_APIC(void *dummy) -{ - unsigned int i, v, ver, maxlvt; - u64 icr; - - printk(KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", - smp_processor_id(), hard_smp_processor_id()); - v = apic_read(APIC_ID); - printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, read_apic_id()); - v = apic_read(APIC_LVR); - printk(KERN_INFO "... APIC VERSION: %08x\n", v); - ver = GET_APIC_VERSION(v); - maxlvt = lapic_get_maxlvt(); - - v = apic_read(APIC_TASKPRI); - printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK); - - if (APIC_INTEGRATED(ver)) { /* !82489DX */ - if (!APIC_XAPIC(ver)) { - v = apic_read(APIC_ARBPRI); - printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v, - v & APIC_ARBPRI_MASK); - } - v = apic_read(APIC_PROCPRI); - printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v); - } - - /* - * Remote read supported only in the 82489DX and local APIC for - * Pentium processors. - */ - if (!APIC_INTEGRATED(ver) || maxlvt == 3) { - v = apic_read(APIC_RRR); - printk(KERN_DEBUG "... APIC RRR: %08x\n", v); - } - - v = apic_read(APIC_LDR); - printk(KERN_DEBUG "... 
APIC LDR: %08x\n", v); - if (!x2apic_enabled()) { - v = apic_read(APIC_DFR); - printk(KERN_DEBUG "... APIC DFR: %08x\n", v); - } - v = apic_read(APIC_SPIV); - printk(KERN_DEBUG "... APIC SPIV: %08x\n", v); - - printk(KERN_DEBUG "... APIC ISR field:\n"); - print_APIC_field(APIC_ISR); - printk(KERN_DEBUG "... APIC TMR field:\n"); - print_APIC_field(APIC_TMR); - printk(KERN_DEBUG "... APIC IRR field:\n"); - print_APIC_field(APIC_IRR); - - if (APIC_INTEGRATED(ver)) { /* !82489DX */ - if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ - apic_write(APIC_ESR, 0); - - v = apic_read(APIC_ESR); - printk(KERN_DEBUG "... APIC ESR: %08x\n", v); - } - - icr = apic_icr_read(); - printk(KERN_DEBUG "... APIC ICR: %08x\n", (u32)icr); - printk(KERN_DEBUG "... APIC ICR2: %08x\n", (u32)(icr >> 32)); - - v = apic_read(APIC_LVTT); - printk(KERN_DEBUG "... APIC LVTT: %08x\n", v); - - if (maxlvt > 3) { /* PC is LVT#4. */ - v = apic_read(APIC_LVTPC); - printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v); - } - v = apic_read(APIC_LVT0); - printk(KERN_DEBUG "... APIC LVT0: %08x\n", v); - v = apic_read(APIC_LVT1); - printk(KERN_DEBUG "... APIC LVT1: %08x\n", v); - - if (maxlvt > 2) { /* ERR is LVT#3. */ - v = apic_read(APIC_LVTERR); - printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v); - } - - v = apic_read(APIC_TMICT); - printk(KERN_DEBUG "... APIC TMICT: %08x\n", v); - v = apic_read(APIC_TMCCT); - printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v); - v = apic_read(APIC_TDCR); - printk(KERN_DEBUG "... APIC TDCR: %08x\n", v); - - if (boot_cpu_has(X86_FEATURE_EXTAPIC)) { - v = apic_read(APIC_EFEAT); - maxlvt = (v >> 16) & 0xff; - printk(KERN_DEBUG "... APIC EFEAT: %08x\n", v); - v = apic_read(APIC_ECTRL); - printk(KERN_DEBUG "... APIC ECTRL: %08x\n", v); - for (i = 0; i < maxlvt; i++) { - v = apic_read(APIC_EILVTn(i)); - printk(KERN_DEBUG "... APIC EILVT%d: %08x\n", i, v); - } - } - pr_cont("\n"); -} - -__apicdebuginit(void) print_local_APICs(int maxcpu) -{ - int cpu; - - if (!maxcpu) - return; - - preempt_disable(); - for_each_online_cpu(cpu) { - if (cpu >= maxcpu) - break; - smp_call_function_single(cpu, print_local_APIC, NULL, 1); - } - preempt_enable(); -} - -__apicdebuginit(void) print_PIC(void) -{ - unsigned int v; - unsigned long flags; - - if (!nr_legacy_irqs()) - return; - - printk(KERN_DEBUG "\nprinting PIC contents\n"); - - raw_spin_lock_irqsave(&i8259A_lock, flags); - - v = inb(0xa1) << 8 | inb(0x21); - printk(KERN_DEBUG "... PIC IMR: %04x\n", v); - - v = inb(0xa0) << 8 | inb(0x20); - printk(KERN_DEBUG "... PIC IRR: %04x\n", v); - - outb(0x0b,0xa0); - outb(0x0b,0x20); - v = inb(0xa0) << 8 | inb(0x20); - outb(0x0a,0xa0); - outb(0x0a,0x20); - - raw_spin_unlock_irqrestore(&i8259A_lock, flags); - - printk(KERN_DEBUG "... PIC ISR: %04x\n", v); - - v = inb(0x4d1) << 8 | inb(0x4d0); - printk(KERN_DEBUG "... 
PIC ELCR: %04x\n", v); -} - -static int __initdata show_lapic = 1; -static __init int setup_show_lapic(char *arg) -{ - int num = -1; - - if (strcmp(arg, "all") == 0) { - show_lapic = CONFIG_NR_CPUS; - } else { - get_option(&arg, &num); - if (num >= 0) - show_lapic = num; - } - - return 1; -} -__setup("show_lapic=", setup_show_lapic); - -__apicdebuginit(int) print_ICs(void) -{ - if (apic_verbosity == APIC_QUIET) - return 0; - - print_PIC(); - - /* don't print out if apic is not there */ - if (!cpu_has_apic && !apic_from_smp_config()) - return 0; - - print_local_APICs(show_lapic); - print_IO_APICs(); - - return 0; -} - -late_initcall(print_ICs); - - /* Where if anywhere is the i8259 connect in external int mode */ static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; @@ -2244,26 +1801,12 @@ static unsigned int startup_ioapic_irq(struct irq_data *data) if (legacy_pic->irq_pending(irq)) was_pending = 1; } - __unmask_ioapic(data->chip_data); + __unmask_ioapic(irqd_cfg(data)); raw_spin_unlock_irqrestore(&ioapic_lock, flags); return was_pending; } -static int ioapic_retrigger_irq(struct irq_data *data) -{ - struct irq_cfg *cfg = data->chip_data; - unsigned long flags; - int cpu; - - raw_spin_lock_irqsave(&vector_lock, flags); - cpu = cpumask_first_and(cfg->domain, cpu_online_mask); - apic->send_IPI_mask(cpumask_of(cpu), cfg->vector); - raw_spin_unlock_irqrestore(&vector_lock, flags); - - return 1; -} - /* * Level and edge triggered IO-APIC interrupts need different handling, * so we use two separate IRQ descriptors. Edge triggered IRQs can be @@ -2273,113 +1816,6 @@ static int ioapic_retrigger_irq(struct irq_data *data) * races. */ -#ifdef CONFIG_SMP -void send_cleanup_vector(struct irq_cfg *cfg) -{ - cpumask_var_t cleanup_mask; - - if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) { - unsigned int i; - for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) - apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR); - } else { - cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask); - apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); - free_cpumask_var(cleanup_mask); - } - cfg->move_in_progress = 0; -} - -asmlinkage __visible void smp_irq_move_cleanup_interrupt(void) -{ - unsigned vector, me; - - ack_APIC_irq(); - irq_enter(); - exit_idle(); - - me = smp_processor_id(); - for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { - int irq; - unsigned int irr; - struct irq_desc *desc; - struct irq_cfg *cfg; - irq = __this_cpu_read(vector_irq[vector]); - - if (irq <= VECTOR_UNDEFINED) - continue; - - desc = irq_to_desc(irq); - if (!desc) - continue; - - cfg = irq_cfg(irq); - if (!cfg) - continue; - - raw_spin_lock(&desc->lock); - - /* - * Check if the irq migration is in progress. If so, we - * haven't received the cleanup request yet for this irq. - */ - if (cfg->move_in_progress) - goto unlock; - - if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) - goto unlock; - - irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); - /* - * Check if the vector that needs to be cleanedup is - * registered at the cpu's IRR. If so, then this is not - * the best time to clean it up. Lets clean it up in the - * next attempt by sending another IRQ_MOVE_CLEANUP_VECTOR - * to myself. 
- */ - if (irr & (1 << (vector % 32))) { - apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR); - goto unlock; - } - __this_cpu_write(vector_irq[vector], VECTOR_UNDEFINED); -unlock: - raw_spin_unlock(&desc->lock); - } - - irq_exit(); -} - -static void __irq_complete_move(struct irq_cfg *cfg, unsigned vector) -{ - unsigned me; - - if (likely(!cfg->move_in_progress)) - return; - - me = smp_processor_id(); - - if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) - send_cleanup_vector(cfg); -} - -static void irq_complete_move(struct irq_cfg *cfg) -{ - __irq_complete_move(cfg, ~get_irq_regs()->orig_ax); -} - -void irq_force_complete_move(int irq) -{ - struct irq_cfg *cfg = irq_cfg(irq); - - if (!cfg) - return; - - __irq_complete_move(cfg, cfg->vector); -} -#else -static inline void irq_complete_move(struct irq_cfg *cfg) { } -#endif - static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg) { int apic, pin; @@ -2400,41 +1836,6 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq } } -/* - * Either sets data->affinity to a valid value, and returns - * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and - * leaves data->affinity untouched. - */ -int __ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, - unsigned int *dest_id) -{ - struct irq_cfg *cfg = data->chip_data; - unsigned int irq = data->irq; - int err; - - if (!config_enabled(CONFIG_SMP)) - return -EPERM; - - if (!cpumask_intersects(mask, cpu_online_mask)) - return -EINVAL; - - err = assign_irq_vector(irq, cfg, mask); - if (err) - return err; - - err = apic->cpu_mask_to_apicid_and(mask, cfg->domain, dest_id); - if (err) { - if (assign_irq_vector(irq, cfg, data->affinity)) - pr_err("Failed to recover vector for irq %d\n", irq); - return err; - } - - cpumask_copy(data->affinity, mask); - - return 0; -} - - int native_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) @@ -2447,24 +1848,17 @@ int native_ioapic_set_affinity(struct irq_data *data, return -EPERM; raw_spin_lock_irqsave(&ioapic_lock, flags); - ret = __ioapic_set_affinity(data, mask, &dest); + ret = apic_set_affinity(data, mask, &dest); if (!ret) { /* Only the high 8 bits are valid. 
*/ dest = SET_APIC_LOGICAL_ID(dest); - __target_IO_APIC_irq(irq, dest, data->chip_data); + __target_IO_APIC_irq(irq, dest, irqd_cfg(data)); ret = IRQ_SET_MASK_OK_NOCOPY; } raw_spin_unlock_irqrestore(&ioapic_lock, flags); return ret; } -static void ack_apic_edge(struct irq_data *data) -{ - irq_complete_move(data->chip_data); - irq_move_irq(data); - ack_APIC_irq(); -} - atomic_t irq_mis_count; #ifdef CONFIG_GENERIC_PENDING_IRQ @@ -2547,9 +1941,9 @@ static inline void ioapic_irqd_unmask(struct irq_data *data, } #endif -static void ack_apic_level(struct irq_data *data) +static void ack_ioapic_level(struct irq_data *data) { - struct irq_cfg *cfg = data->chip_data; + struct irq_cfg *cfg = irqd_cfg(data); int i, irq = data->irq; unsigned long v; bool masked; @@ -2619,10 +2013,10 @@ static struct irq_chip ioapic_chip __read_mostly = { .irq_startup = startup_ioapic_irq, .irq_mask = mask_ioapic_irq, .irq_unmask = unmask_ioapic_irq, - .irq_ack = ack_apic_edge, - .irq_eoi = ack_apic_level, + .irq_ack = apic_ack_edge, + .irq_eoi = ack_ioapic_level, .irq_set_affinity = native_ioapic_set_affinity, - .irq_retrigger = ioapic_retrigger_irq, + .irq_retrigger = apic_retrigger_irq, .flags = IRQCHIP_SKIP_SET_WAKE, }; @@ -2965,6 +2359,16 @@ static int mp_irqdomain_create(int ioapic) return 0; } +static void ioapic_destroy_irqdomain(int idx) +{ + if (ioapics[idx].irqdomain) { + irq_domain_remove(ioapics[idx].irqdomain); + ioapics[idx].irqdomain = NULL; + } + kfree(ioapics[idx].pin_info); + ioapics[idx].pin_info = NULL; +} + void __init setup_IO_APIC(void) { int ioapic; @@ -3044,399 +2448,6 @@ static int __init ioapic_init_ops(void) device_initcall(ioapic_init_ops); -/* - * Dynamic irq allocate and deallocation. Should be replaced by irq domains! - */ -int arch_setup_hwirq(unsigned int irq, int node) -{ - struct irq_cfg *cfg; - unsigned long flags; - int ret; - - cfg = alloc_irq_cfg(irq, node); - if (!cfg) - return -ENOMEM; - - raw_spin_lock_irqsave(&vector_lock, flags); - ret = __assign_irq_vector(irq, cfg, apic->target_cpus()); - raw_spin_unlock_irqrestore(&vector_lock, flags); - - if (!ret) - irq_set_chip_data(irq, cfg); - else - free_irq_cfg(irq, cfg); - return ret; -} - -void arch_teardown_hwirq(unsigned int irq) -{ - struct irq_cfg *cfg = irq_cfg(irq); - unsigned long flags; - - free_remapped_irq(irq); - raw_spin_lock_irqsave(&vector_lock, flags); - __clear_irq_vector(irq, cfg); - raw_spin_unlock_irqrestore(&vector_lock, flags); - free_irq_cfg(irq, cfg); -} - -/* - * MSI message composition - */ -void native_compose_msi_msg(struct pci_dev *pdev, - unsigned int irq, unsigned int dest, - struct msi_msg *msg, u8 hpet_id) -{ - struct irq_cfg *cfg = irq_cfg(irq); - - msg->address_hi = MSI_ADDR_BASE_HI; - - if (x2apic_enabled()) - msg->address_hi |= MSI_ADDR_EXT_DEST_ID(dest); - - msg->address_lo = - MSI_ADDR_BASE_LO | - ((apic->irq_dest_mode == 0) ? - MSI_ADDR_DEST_MODE_PHYSICAL: - MSI_ADDR_DEST_MODE_LOGICAL) | - ((apic->irq_delivery_mode != dest_LowestPrio) ? - MSI_ADDR_REDIRECTION_CPU: - MSI_ADDR_REDIRECTION_LOWPRI) | - MSI_ADDR_DEST_ID(dest); - - msg->data = - MSI_DATA_TRIGGER_EDGE | - MSI_DATA_LEVEL_ASSERT | - ((apic->irq_delivery_mode != dest_LowestPrio) ? 
- MSI_DATA_DELIVERY_FIXED: - MSI_DATA_DELIVERY_LOWPRI) | - MSI_DATA_VECTOR(cfg->vector); -} - -#ifdef CONFIG_PCI_MSI -static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, - struct msi_msg *msg, u8 hpet_id) -{ - struct irq_cfg *cfg; - int err; - unsigned dest; - - if (disable_apic) - return -ENXIO; - - cfg = irq_cfg(irq); - err = assign_irq_vector(irq, cfg, apic->target_cpus()); - if (err) - return err; - - err = apic->cpu_mask_to_apicid_and(cfg->domain, - apic->target_cpus(), &dest); - if (err) - return err; - - x86_msi.compose_msi_msg(pdev, irq, dest, msg, hpet_id); - - return 0; -} - -static int -msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) -{ - struct irq_cfg *cfg = data->chip_data; - struct msi_msg msg; - unsigned int dest; - int ret; - - ret = __ioapic_set_affinity(data, mask, &dest); - if (ret) - return ret; - - __get_cached_msi_msg(data->msi_desc, &msg); - - msg.data &= ~MSI_DATA_VECTOR_MASK; - msg.data |= MSI_DATA_VECTOR(cfg->vector); - msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; - msg.address_lo |= MSI_ADDR_DEST_ID(dest); - - __pci_write_msi_msg(data->msi_desc, &msg); - - return IRQ_SET_MASK_OK_NOCOPY; -} - -/* - * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, - * which implement the MSI or MSI-X Capability Structure. - */ -static struct irq_chip msi_chip = { - .name = "PCI-MSI", - .irq_unmask = pci_msi_unmask_irq, - .irq_mask = pci_msi_mask_irq, - .irq_ack = ack_apic_edge, - .irq_set_affinity = msi_set_affinity, - .irq_retrigger = ioapic_retrigger_irq, - .flags = IRQCHIP_SKIP_SET_WAKE, -}; - -int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, - unsigned int irq_base, unsigned int irq_offset) -{ - struct irq_chip *chip = &msi_chip; - struct msi_msg msg; - unsigned int irq = irq_base + irq_offset; - int ret; - - ret = msi_compose_msg(dev, irq, &msg, -1); - if (ret < 0) - return ret; - - irq_set_msi_desc_off(irq_base, irq_offset, msidesc); - - /* - * MSI-X message is written per-IRQ, the offset is always 0. - * MSI message denotes a contiguous group of IRQs, written for 0th IRQ. 
- */ - if (!irq_offset) - pci_write_msi_msg(irq, &msg); - - setup_remapped_irq(irq, irq_cfg(irq), chip); - - irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge"); - - dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq); - - return 0; -} - -int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) -{ - struct msi_desc *msidesc; - unsigned int irq; - int node, ret; - - /* Multiple MSI vectors only supported with interrupt remapping */ - if (type == PCI_CAP_ID_MSI && nvec > 1) - return 1; - - node = dev_to_node(&dev->dev); - - list_for_each_entry(msidesc, &dev->msi_list, list) { - irq = irq_alloc_hwirq(node); - if (!irq) - return -ENOSPC; - - ret = setup_msi_irq(dev, msidesc, irq, 0); - if (ret < 0) { - irq_free_hwirq(irq); - return ret; - } - - } - return 0; -} - -void native_teardown_msi_irq(unsigned int irq) -{ - irq_free_hwirq(irq); -} - -#ifdef CONFIG_DMAR_TABLE -static int -dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask, - bool force) -{ - struct irq_cfg *cfg = data->chip_data; - unsigned int dest, irq = data->irq; - struct msi_msg msg; - int ret; - - ret = __ioapic_set_affinity(data, mask, &dest); - if (ret) - return ret; - - dmar_msi_read(irq, &msg); - - msg.data &= ~MSI_DATA_VECTOR_MASK; - msg.data |= MSI_DATA_VECTOR(cfg->vector); - msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; - msg.address_lo |= MSI_ADDR_DEST_ID(dest); - msg.address_hi = MSI_ADDR_BASE_HI | MSI_ADDR_EXT_DEST_ID(dest); - - dmar_msi_write(irq, &msg); - - return IRQ_SET_MASK_OK_NOCOPY; -} - -static struct irq_chip dmar_msi_type = { - .name = "DMAR_MSI", - .irq_unmask = dmar_msi_unmask, - .irq_mask = dmar_msi_mask, - .irq_ack = ack_apic_edge, - .irq_set_affinity = dmar_msi_set_affinity, - .irq_retrigger = ioapic_retrigger_irq, - .flags = IRQCHIP_SKIP_SET_WAKE, -}; - -int arch_setup_dmar_msi(unsigned int irq) -{ - int ret; - struct msi_msg msg; - - ret = msi_compose_msg(NULL, irq, &msg, -1); - if (ret < 0) - return ret; - dmar_msi_write(irq, &msg); - irq_set_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq, - "edge"); - return 0; -} -#endif - -#ifdef CONFIG_HPET_TIMER - -static int hpet_msi_set_affinity(struct irq_data *data, - const struct cpumask *mask, bool force) -{ - struct irq_cfg *cfg = data->chip_data; - struct msi_msg msg; - unsigned int dest; - int ret; - - ret = __ioapic_set_affinity(data, mask, &dest); - if (ret) - return ret; - - hpet_msi_read(data->handler_data, &msg); - - msg.data &= ~MSI_DATA_VECTOR_MASK; - msg.data |= MSI_DATA_VECTOR(cfg->vector); - msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; - msg.address_lo |= MSI_ADDR_DEST_ID(dest); - - hpet_msi_write(data->handler_data, &msg); - - return IRQ_SET_MASK_OK_NOCOPY; -} - -static struct irq_chip hpet_msi_type = { - .name = "HPET_MSI", - .irq_unmask = hpet_msi_unmask, - .irq_mask = hpet_msi_mask, - .irq_ack = ack_apic_edge, - .irq_set_affinity = hpet_msi_set_affinity, - .irq_retrigger = ioapic_retrigger_irq, - .flags = IRQCHIP_SKIP_SET_WAKE, -}; - -int default_setup_hpet_msi(unsigned int irq, unsigned int id) -{ - struct irq_chip *chip = &hpet_msi_type; - struct msi_msg msg; - int ret; - - ret = msi_compose_msg(NULL, irq, &msg, id); - if (ret < 0) - return ret; - - hpet_msi_write(irq_get_handler_data(irq), &msg); - irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); - setup_remapped_irq(irq, irq_cfg(irq), chip); - - irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge"); - return 0; -} -#endif - -#endif /* CONFIG_PCI_MSI */ -/* - * Hypertransport interrupt support - */ -#ifdef CONFIG_HT_IRQ 
- -static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector) -{ - struct ht_irq_msg msg; - fetch_ht_irq_msg(irq, &msg); - - msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK | HT_IRQ_LOW_DEST_ID_MASK); - msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK); - - msg.address_lo |= HT_IRQ_LOW_VECTOR(vector) | HT_IRQ_LOW_DEST_ID(dest); - msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest); - - write_ht_irq_msg(irq, &msg); -} - -static int -ht_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) -{ - struct irq_cfg *cfg = data->chip_data; - unsigned int dest; - int ret; - - ret = __ioapic_set_affinity(data, mask, &dest); - if (ret) - return ret; - - target_ht_irq(data->irq, dest, cfg->vector); - return IRQ_SET_MASK_OK_NOCOPY; -} - -static struct irq_chip ht_irq_chip = { - .name = "PCI-HT", - .irq_mask = mask_ht_irq, - .irq_unmask = unmask_ht_irq, - .irq_ack = ack_apic_edge, - .irq_set_affinity = ht_set_affinity, - .irq_retrigger = ioapic_retrigger_irq, - .flags = IRQCHIP_SKIP_SET_WAKE, -}; - -int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) -{ - struct irq_cfg *cfg; - struct ht_irq_msg msg; - unsigned dest; - int err; - - if (disable_apic) - return -ENXIO; - - cfg = irq_cfg(irq); - err = assign_irq_vector(irq, cfg, apic->target_cpus()); - if (err) - return err; - - err = apic->cpu_mask_to_apicid_and(cfg->domain, - apic->target_cpus(), &dest); - if (err) - return err; - - msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); - - msg.address_lo = - HT_IRQ_LOW_BASE | - HT_IRQ_LOW_DEST_ID(dest) | - HT_IRQ_LOW_VECTOR(cfg->vector) | - ((apic->irq_dest_mode == 0) ? - HT_IRQ_LOW_DM_PHYSICAL : - HT_IRQ_LOW_DM_LOGICAL) | - HT_IRQ_LOW_RQEOI_EDGE | - ((apic->irq_delivery_mode != dest_LowestPrio) ? - HT_IRQ_LOW_MT_FIXED : - HT_IRQ_LOW_MT_ARBITRATED) | - HT_IRQ_LOW_IRQ_MASKED; - - write_ht_irq_msg(irq, &msg); - - irq_set_chip_and_handler_name(irq, &ht_irq_chip, - handle_edge_irq, "edge"); - - dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq); - - return 0; -} -#endif /* CONFIG_HT_IRQ */ - static int io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr) { @@ -3451,7 +2462,7 @@ io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr) return ret; } -static int __init io_apic_get_redir_entries(int ioapic) +static int io_apic_get_redir_entries(int ioapic) { union IO_APIC_reg_01 reg_01; unsigned long flags; @@ -3476,28 +2487,8 @@ unsigned int arch_dynirq_lower_bound(unsigned int from) return ioapic_initialized ? 
ioapic_dynirq_base : gsi_top; } -int __init arch_probe_nr_irqs(void) -{ - int nr; - - if (nr_irqs > (NR_VECTORS * nr_cpu_ids)) - nr_irqs = NR_VECTORS * nr_cpu_ids; - - nr = (gsi_top + nr_legacy_irqs()) + 8 * nr_cpu_ids; -#if defined(CONFIG_PCI_MSI) || defined(CONFIG_HT_IRQ) - /* - * for MSI and HT dyn irq - */ - nr += gsi_top * 16; -#endif - if (nr < nr_irqs) - nr_irqs = nr; - - return 0; -} - #ifdef CONFIG_X86_32 -static int __init io_apic_get_unique_id(int ioapic, int apic_id) +static int io_apic_get_unique_id(int ioapic, int apic_id) { union IO_APIC_reg_00 reg_00; static physid_mask_t apic_id_map = PHYSID_MASK_NONE; @@ -3572,30 +2563,63 @@ static int __init io_apic_get_unique_id(int ioapic, int apic_id) return apic_id; } -static u8 __init io_apic_unique_id(u8 id) +static u8 io_apic_unique_id(int idx, u8 id) { if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && !APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) - return io_apic_get_unique_id(nr_ioapics, id); + return io_apic_get_unique_id(idx, id); else return id; } #else -static u8 __init io_apic_unique_id(u8 id) +static u8 io_apic_unique_id(int idx, u8 id) { - int i; + union IO_APIC_reg_00 reg_00; DECLARE_BITMAP(used, 256); + unsigned long flags; + u8 new_id; + int i; bitmap_zero(used, 256); for_each_ioapic(i) __set_bit(mpc_ioapic_id(i), used); + + /* Hand out the requested id if available */ if (!test_bit(id, used)) return id; - return find_first_zero_bit(used, 256); + + /* + * Read the current id from the ioapic and keep it if + * available. + */ + raw_spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(idx, 0); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + new_id = reg_00.bits.ID; + if (!test_bit(new_id, used)) { + apic_printk(APIC_VERBOSE, KERN_INFO + "IOAPIC[%d]: Using reg apic_id %d instead of %d\n", + idx, new_id, id); + return new_id; + } + + /* + * Get the next free id and write it to the ioapic. 
+ */ + new_id = find_first_zero_bit(used, 256); + reg_00.bits.ID = new_id; + raw_spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(idx, 0, reg_00.raw); + reg_00.raw = io_apic_read(idx, 0); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + /* Sanity check */ + BUG_ON(reg_00.bits.ID != new_id); + + return new_id; } #endif -static int __init io_apic_get_version(int ioapic) +static int io_apic_get_version(int ioapic) { union IO_APIC_reg_01 reg_01; unsigned long flags; @@ -3702,6 +2726,7 @@ static struct resource * __init ioapic_setup_resources(void) snprintf(mem, IOAPIC_RESOURCE_NAME_SIZE, "IOAPIC %u", i); mem += IOAPIC_RESOURCE_NAME_SIZE; num++; + ioapics[i].iomem_res = res; } ioapic_resources = res; @@ -3799,21 +2824,7 @@ int mp_find_ioapic_pin(int ioapic, u32 gsi) return gsi - gsi_cfg->gsi_base; } -static __init int bad_ioapic(unsigned long address) -{ - if (nr_ioapics >= MAX_IO_APICS) { - pr_warn("WARNING: Max # of I/O APICs (%d) exceeded (found %d), skipping\n", - MAX_IO_APICS, nr_ioapics); - return 1; - } - if (!address) { - pr_warn("WARNING: Bogus (zero) I/O APIC address found in table, skipping!\n"); - return 1; - } - return 0; -} - -static __init int bad_ioapic_register(int idx) +static int bad_ioapic_register(int idx) { union IO_APIC_reg_00 reg_00; union IO_APIC_reg_01 reg_01; @@ -3832,32 +2843,61 @@ static __init int bad_ioapic_register(int idx) return 0; } -void __init mp_register_ioapic(int id, u32 address, u32 gsi_base, - struct ioapic_domain_cfg *cfg) +static int find_free_ioapic_entry(void) { - int idx = 0; - int entries; + int idx; + + for (idx = 0; idx < MAX_IO_APICS; idx++) + if (ioapics[idx].nr_registers == 0) + return idx; + + return MAX_IO_APICS; +} + +/** + * mp_register_ioapic - Register an IOAPIC device + * @id: hardware IOAPIC ID + * @address: physical address of IOAPIC register area + * @gsi_base: base of GSI associated with the IOAPIC + * @cfg: configuration information for the IOAPIC + */ +int mp_register_ioapic(int id, u32 address, u32 gsi_base, + struct ioapic_domain_cfg *cfg) +{ + bool hotplug = !!ioapic_initialized; struct mp_ioapic_gsi *gsi_cfg; + int idx, ioapic, entries; + u32 gsi_end; - if (bad_ioapic(address)) - return; + if (!address) { + pr_warn("Bogus (zero) I/O APIC address found, skipping!\n"); + return -EINVAL; + } + for_each_ioapic(ioapic) + if (ioapics[ioapic].mp_config.apicaddr == address) { + pr_warn("address 0x%x conflicts with IOAPIC%d\n", + address, ioapic); + return -EEXIST; + } - idx = nr_ioapics; + idx = find_free_ioapic_entry(); + if (idx >= MAX_IO_APICS) { + pr_warn("Max # of I/O APICs (%d) exceeded (found %d), skipping\n", + MAX_IO_APICS, idx); + return -ENOSPC; + } ioapics[idx].mp_config.type = MP_IOAPIC; ioapics[idx].mp_config.flags = MPC_APIC_USABLE; ioapics[idx].mp_config.apicaddr = address; - ioapics[idx].irqdomain = NULL; - ioapics[idx].irqdomain_cfg = *cfg; set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); - if (bad_ioapic_register(idx)) { clear_fixmap(FIX_IO_APIC_BASE_0 + idx); - return; + return -ENODEV; } - ioapics[idx].mp_config.apicid = io_apic_unique_id(id); + ioapics[idx].mp_config.apicid = io_apic_unique_id(idx, id); ioapics[idx].mp_config.apicver = io_apic_get_version(idx); /* @@ -3865,24 +2905,112 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base, * and to prevent reprogramming of IOAPIC pins (PCI GSIs). 
*/ entries = io_apic_get_redir_entries(idx); + gsi_end = gsi_base + entries - 1; + for_each_ioapic(ioapic) { + gsi_cfg = mp_ioapic_gsi_routing(ioapic); + if ((gsi_base >= gsi_cfg->gsi_base && + gsi_base <= gsi_cfg->gsi_end) || + (gsi_end >= gsi_cfg->gsi_base && + gsi_end <= gsi_cfg->gsi_end)) { + pr_warn("GSI range [%u-%u] for new IOAPIC conflicts with GSI[%u-%u]\n", + gsi_base, gsi_end, + gsi_cfg->gsi_base, gsi_cfg->gsi_end); + clear_fixmap(FIX_IO_APIC_BASE_0 + idx); + return -ENOSPC; + } + } gsi_cfg = mp_ioapic_gsi_routing(idx); gsi_cfg->gsi_base = gsi_base; - gsi_cfg->gsi_end = gsi_base + entries - 1; + gsi_cfg->gsi_end = gsi_end; + + ioapics[idx].irqdomain = NULL; + ioapics[idx].irqdomain_cfg = *cfg; /* - * The number of IO-APIC IRQ registers (== #pins): + * If mp_register_ioapic() is called during early boot stage when + * walking ACPI/SFI/DT tables, it's too early to create irqdomain, + * we are still using bootmem allocator. So delay it to setup_IO_APIC(). */ - ioapics[idx].nr_registers = entries; + if (hotplug) { + if (mp_irqdomain_create(idx)) { + clear_fixmap(FIX_IO_APIC_BASE_0 + idx); + return -ENOMEM; + } + alloc_ioapic_saved_registers(idx); + } if (gsi_cfg->gsi_end >= gsi_top) gsi_top = gsi_cfg->gsi_end + 1; + if (nr_ioapics <= idx) + nr_ioapics = idx + 1; + + /* Set nr_registers to mark entry present */ + ioapics[idx].nr_registers = entries; pr_info("IOAPIC[%d]: apic_id %d, version %d, address 0x%x, GSI %d-%d\n", idx, mpc_ioapic_id(idx), mpc_ioapic_ver(idx), mpc_ioapic_addr(idx), gsi_cfg->gsi_base, gsi_cfg->gsi_end); - nr_ioapics++; + return 0; +} + +int mp_unregister_ioapic(u32 gsi_base) +{ + int ioapic, pin; + int found = 0; + struct mp_pin_info *pin_info; + + for_each_ioapic(ioapic) + if (ioapics[ioapic].gsi_config.gsi_base == gsi_base) { + found = 1; + break; + } + if (!found) { + pr_warn("can't find IOAPIC for GSI %d\n", gsi_base); + return -ENODEV; + } + + for_each_pin(ioapic, pin) { + pin_info = mp_pin_info(ioapic, pin); + if (pin_info->count) { + pr_warn("pin%d on IOAPIC%d is still in use.\n", + pin, ioapic); + return -EBUSY; + } + } + + /* Mark entry not present */ + ioapics[ioapic].nr_registers = 0; + ioapic_destroy_irqdomain(ioapic); + free_ioapic_saved_registers(ioapic); + if (ioapics[ioapic].iomem_res) + release_resource(ioapics[ioapic].iomem_res); + clear_fixmap(FIX_IO_APIC_BASE_0 + ioapic); + memset(&ioapics[ioapic], 0, sizeof(ioapics[ioapic])); + + return 0; +} + +int mp_ioapic_registered(u32 gsi_base) +{ + int ioapic; + + for_each_ioapic(ioapic) + if (ioapics[ioapic].gsi_config.gsi_base == gsi_base) + return 1; + + return 0; +} + +static inline void set_io_apic_irq_attr(struct io_apic_irq_attr *irq_attr, + int ioapic, int ioapic_pin, + int trigger, int polarity) +{ + irq_attr->ioapic = ioapic; + irq_attr->ioapic_pin = ioapic_pin; + irq_attr->trigger = trigger; + irq_attr->polarity = polarity; } int mp_irqdomain_map(struct irq_domain *domain, unsigned int virq, @@ -3931,7 +3059,7 @@ void mp_irqdomain_unmap(struct irq_domain *domain, unsigned int virq) ioapic_mask_entry(ioapic, pin); __remove_pin_from_irq(cfg, ioapic, pin); - WARN_ON(cfg->irq_2_pin != NULL); + WARN_ON(!list_empty(&cfg->irq_2_pin)); arch_teardown_hwirq(virq); } @@ -3964,18 +3092,6 @@ int mp_set_gsi_attr(u32 gsi, int trigger, int polarity, int node) return ret; } -bool mp_should_keep_irq(struct device *dev) -{ - if (dev->power.is_prepared) - return true; -#ifdef CONFIG_PM_RUNTIME - if (dev->power.runtime_status == RPM_SUSPENDING) - return true; -#endif - - return false; -} - /* Enable IOAPIC 
early just for system timer */ void __init pre_init_apic_IRQ0(void) { diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c new file mode 100644 index 000000000000..d6ba2d660dc5 --- /dev/null +++ b/arch/x86/kernel/apic/msi.c @@ -0,0 +1,286 @@ +/* + * Support of MSI, HPET and DMAR interrupts. + * + * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo + * Moved from arch/x86/kernel/apic/io_apic.c. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/mm.h> +#include <linux/interrupt.h> +#include <linux/pci.h> +#include <linux/dmar.h> +#include <linux/hpet.h> +#include <linux/msi.h> +#include <asm/msidef.h> +#include <asm/hpet.h> +#include <asm/hw_irq.h> +#include <asm/apic.h> +#include <asm/irq_remapping.h> + +void native_compose_msi_msg(struct pci_dev *pdev, + unsigned int irq, unsigned int dest, + struct msi_msg *msg, u8 hpet_id) +{ + struct irq_cfg *cfg = irq_cfg(irq); + + msg->address_hi = MSI_ADDR_BASE_HI; + + if (x2apic_enabled()) + msg->address_hi |= MSI_ADDR_EXT_DEST_ID(dest); + + msg->address_lo = + MSI_ADDR_BASE_LO | + ((apic->irq_dest_mode == 0) ? + MSI_ADDR_DEST_MODE_PHYSICAL : + MSI_ADDR_DEST_MODE_LOGICAL) | + ((apic->irq_delivery_mode != dest_LowestPrio) ? + MSI_ADDR_REDIRECTION_CPU : + MSI_ADDR_REDIRECTION_LOWPRI) | + MSI_ADDR_DEST_ID(dest); + + msg->data = + MSI_DATA_TRIGGER_EDGE | + MSI_DATA_LEVEL_ASSERT | + ((apic->irq_delivery_mode != dest_LowestPrio) ? + MSI_DATA_DELIVERY_FIXED : + MSI_DATA_DELIVERY_LOWPRI) | + MSI_DATA_VECTOR(cfg->vector); +} + +static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, + struct msi_msg *msg, u8 hpet_id) +{ + struct irq_cfg *cfg; + int err; + unsigned dest; + + if (disable_apic) + return -ENXIO; + + cfg = irq_cfg(irq); + err = assign_irq_vector(irq, cfg, apic->target_cpus()); + if (err) + return err; + + err = apic->cpu_mask_to_apicid_and(cfg->domain, + apic->target_cpus(), &dest); + if (err) + return err; + + x86_msi.compose_msi_msg(pdev, irq, dest, msg, hpet_id); + + return 0; +} + +static int +msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) +{ + struct irq_cfg *cfg = irqd_cfg(data); + struct msi_msg msg; + unsigned int dest; + int ret; + + ret = apic_set_affinity(data, mask, &dest); + if (ret) + return ret; + + __get_cached_msi_msg(data->msi_desc, &msg); + + msg.data &= ~MSI_DATA_VECTOR_MASK; + msg.data |= MSI_DATA_VECTOR(cfg->vector); + msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; + msg.address_lo |= MSI_ADDR_DEST_ID(dest); + + __pci_write_msi_msg(data->msi_desc, &msg); + + return IRQ_SET_MASK_OK_NOCOPY; +} + +/* + * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, + * which implement the MSI or MSI-X Capability Structure. 
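native_compose_msi_msg() above packs the destination APIC ID into the MSI address word and the vector plus delivery mode into the data word. As a rough illustration of that layout outside the kernel (constants follow the xAPIC MSI format; physical destination, fixed delivery and edge trigger are assumed for simplicity):

#include <stdint.h>
#include <stdio.h>

#define MSI_ADDR_BASE           0xfee00000u             /* low 32 bits of the MSI target window */
#define MSI_ADDR_DEST_ID(id)    (((uint32_t)(id) & 0xff) << 12)
#define MSI_DATA_VECTOR(v)      ((uint32_t)(v) & 0xff)
#define MSI_DATA_DELIVERY_FIXED (0u << 8)
#define MSI_DATA_TRIGGER_EDGE   (0u << 15)

/* Build a simplified MSI address/data pair for one CPU and one vector. */
static void compose_msi(uint8_t apic_id, uint8_t vector,
                        uint32_t *address_lo, uint32_t *data)
{
        *address_lo = MSI_ADDR_BASE | MSI_ADDR_DEST_ID(apic_id);
        *data = MSI_DATA_TRIGGER_EDGE | MSI_DATA_DELIVERY_FIXED |
                MSI_DATA_VECTOR(vector);
}

int main(void)
{
        uint32_t addr, data;

        compose_msi(0x03, 0x41, &addr, &data);
        printf("address_lo=0x%08x data=0x%04x\n", addr, data);  /* 0xfee03000 / 0x0041 */
        return 0;
}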
+ */ +static struct irq_chip msi_chip = { + .name = "PCI-MSI", + .irq_unmask = pci_msi_unmask_irq, + .irq_mask = pci_msi_mask_irq, + .irq_ack = apic_ack_edge, + .irq_set_affinity = msi_set_affinity, + .irq_retrigger = apic_retrigger_irq, + .flags = IRQCHIP_SKIP_SET_WAKE, +}; + +int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, + unsigned int irq_base, unsigned int irq_offset) +{ + struct irq_chip *chip = &msi_chip; + struct msi_msg msg; + unsigned int irq = irq_base + irq_offset; + int ret; + + ret = msi_compose_msg(dev, irq, &msg, -1); + if (ret < 0) + return ret; + + irq_set_msi_desc_off(irq_base, irq_offset, msidesc); + + /* + * MSI-X message is written per-IRQ, the offset is always 0. + * MSI message denotes a contiguous group of IRQs, written for 0th IRQ. + */ + if (!irq_offset) + pci_write_msi_msg(irq, &msg); + + setup_remapped_irq(irq, irq_cfg(irq), chip); + + irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge"); + + dev_dbg(&dev->dev, "irq %d for MSI/MSI-X\n", irq); + + return 0; +} + +int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) +{ + struct msi_desc *msidesc; + unsigned int irq; + int node, ret; + + /* Multiple MSI vectors only supported with interrupt remapping */ + if (type == PCI_CAP_ID_MSI && nvec > 1) + return 1; + + node = dev_to_node(&dev->dev); + + list_for_each_entry(msidesc, &dev->msi_list, list) { + irq = irq_alloc_hwirq(node); + if (!irq) + return -ENOSPC; + + ret = setup_msi_irq(dev, msidesc, irq, 0); + if (ret < 0) { + irq_free_hwirq(irq); + return ret; + } + + } + return 0; +} + +void native_teardown_msi_irq(unsigned int irq) +{ + irq_free_hwirq(irq); +} + +#ifdef CONFIG_DMAR_TABLE +static int +dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask, + bool force) +{ + struct irq_cfg *cfg = irqd_cfg(data); + unsigned int dest, irq = data->irq; + struct msi_msg msg; + int ret; + + ret = apic_set_affinity(data, mask, &dest); + if (ret) + return ret; + + dmar_msi_read(irq, &msg); + + msg.data &= ~MSI_DATA_VECTOR_MASK; + msg.data |= MSI_DATA_VECTOR(cfg->vector); + msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; + msg.address_lo |= MSI_ADDR_DEST_ID(dest); + msg.address_hi = MSI_ADDR_BASE_HI | MSI_ADDR_EXT_DEST_ID(dest); + + dmar_msi_write(irq, &msg); + + return IRQ_SET_MASK_OK_NOCOPY; +} + +static struct irq_chip dmar_msi_type = { + .name = "DMAR_MSI", + .irq_unmask = dmar_msi_unmask, + .irq_mask = dmar_msi_mask, + .irq_ack = apic_ack_edge, + .irq_set_affinity = dmar_msi_set_affinity, + .irq_retrigger = apic_retrigger_irq, + .flags = IRQCHIP_SKIP_SET_WAKE, +}; + +int arch_setup_dmar_msi(unsigned int irq) +{ + int ret; + struct msi_msg msg; + + ret = msi_compose_msg(NULL, irq, &msg, -1); + if (ret < 0) + return ret; + dmar_msi_write(irq, &msg); + irq_set_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq, + "edge"); + return 0; +} +#endif + +/* + * MSI message composition + */ +#ifdef CONFIG_HPET_TIMER + +static int hpet_msi_set_affinity(struct irq_data *data, + const struct cpumask *mask, bool force) +{ + struct irq_cfg *cfg = irqd_cfg(data); + struct msi_msg msg; + unsigned int dest; + int ret; + + ret = apic_set_affinity(data, mask, &dest); + if (ret) + return ret; + + hpet_msi_read(data->handler_data, &msg); + + msg.data &= ~MSI_DATA_VECTOR_MASK; + msg.data |= MSI_DATA_VECTOR(cfg->vector); + msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; + msg.address_lo |= MSI_ADDR_DEST_ID(dest); + + hpet_msi_write(data->handler_data, &msg); + + return IRQ_SET_MASK_OK_NOCOPY; +} + +static struct irq_chip hpet_msi_type = { 
+ .name = "HPET_MSI", + .irq_unmask = hpet_msi_unmask, + .irq_mask = hpet_msi_mask, + .irq_ack = apic_ack_edge, + .irq_set_affinity = hpet_msi_set_affinity, + .irq_retrigger = apic_retrigger_irq, + .flags = IRQCHIP_SKIP_SET_WAKE, +}; + +int default_setup_hpet_msi(unsigned int irq, unsigned int id) +{ + struct irq_chip *chip = &hpet_msi_type; + struct msi_msg msg; + int ret; + + ret = msi_compose_msg(NULL, irq, &msg, id); + if (ret < 0) + return ret; + + hpet_msi_write(irq_get_handler_data(irq), &msg); + irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); + setup_remapped_irq(irq, irq_cfg(irq), chip); + + irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge"); + return 0; +} +#endif diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c new file mode 100644 index 000000000000..6cedd7914581 --- /dev/null +++ b/arch/x86/kernel/apic/vector.c @@ -0,0 +1,719 @@ +/* + * Local APIC related interfaces to support IOAPIC, MSI, HT_IRQ etc. + * + * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo + * Moved from arch/x86/kernel/apic/io_apic.c. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/interrupt.h> +#include <linux/init.h> +#include <linux/compiler.h> +#include <linux/irqdomain.h> +#include <linux/slab.h> +#include <asm/hw_irq.h> +#include <asm/apic.h> +#include <asm/i8259.h> +#include <asm/desc.h> +#include <asm/irq_remapping.h> + +static DEFINE_RAW_SPINLOCK(vector_lock); + +void lock_vector_lock(void) +{ + /* Used to the online set of cpus does not change + * during assign_irq_vector. + */ + raw_spin_lock(&vector_lock); +} + +void unlock_vector_lock(void) +{ + raw_spin_unlock(&vector_lock); +} + +struct irq_cfg *irq_cfg(unsigned int irq) +{ + return irq_get_chip_data(irq); +} + +struct irq_cfg *irqd_cfg(struct irq_data *irq_data) +{ + return irq_data->chip_data; +} + +static struct irq_cfg *alloc_irq_cfg(unsigned int irq, int node) +{ + struct irq_cfg *cfg; + + cfg = kzalloc_node(sizeof(*cfg), GFP_KERNEL, node); + if (!cfg) + return NULL; + if (!zalloc_cpumask_var_node(&cfg->domain, GFP_KERNEL, node)) + goto out_cfg; + if (!zalloc_cpumask_var_node(&cfg->old_domain, GFP_KERNEL, node)) + goto out_domain; +#ifdef CONFIG_X86_IO_APIC + INIT_LIST_HEAD(&cfg->irq_2_pin); +#endif + return cfg; +out_domain: + free_cpumask_var(cfg->domain); +out_cfg: + kfree(cfg); + return NULL; +} + +struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node) +{ + int res = irq_alloc_desc_at(at, node); + struct irq_cfg *cfg; + + if (res < 0) { + if (res != -EEXIST) + return NULL; + cfg = irq_cfg(at); + if (cfg) + return cfg; + } + + cfg = alloc_irq_cfg(at, node); + if (cfg) + irq_set_chip_data(at, cfg); + else + irq_free_desc(at); + return cfg; +} + +static void free_irq_cfg(unsigned int at, struct irq_cfg *cfg) +{ + if (!cfg) + return; + irq_set_chip_data(at, NULL); + free_cpumask_var(cfg->domain); + free_cpumask_var(cfg->old_domain); + kfree(cfg); +} + +static int +__assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) +{ + /* + * NOTE! The local APIC isn't very good at handling + * multiple interrupts at the same interrupt level. + * As the interrupt level is determined by taking the + * vector number and shifting that right by 4, we + * want to spread these out a bit so that they don't + * all fall in the same interrupt level. 
+ * + * Also, we've got to be careful not to trash gate + * 0x80, because int 0x80 is hm, kind of importantish. ;) + */ + static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START; + static int current_offset = VECTOR_OFFSET_START % 16; + int cpu, err; + cpumask_var_t tmp_mask; + + if (cfg->move_in_progress) + return -EBUSY; + + if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC)) + return -ENOMEM; + + /* Only try and allocate irqs on cpus that are present */ + err = -ENOSPC; + cpumask_clear(cfg->old_domain); + cpu = cpumask_first_and(mask, cpu_online_mask); + while (cpu < nr_cpu_ids) { + int new_cpu, vector, offset; + + apic->vector_allocation_domain(cpu, tmp_mask, mask); + + if (cpumask_subset(tmp_mask, cfg->domain)) { + err = 0; + if (cpumask_equal(tmp_mask, cfg->domain)) + break; + /* + * New cpumask using the vector is a proper subset of + * the current in use mask. So cleanup the vector + * allocation for the members that are not used anymore. + */ + cpumask_andnot(cfg->old_domain, cfg->domain, tmp_mask); + cfg->move_in_progress = + cpumask_intersects(cfg->old_domain, cpu_online_mask); + cpumask_and(cfg->domain, cfg->domain, tmp_mask); + break; + } + + vector = current_vector; + offset = current_offset; +next: + vector += 16; + if (vector >= first_system_vector) { + offset = (offset + 1) % 16; + vector = FIRST_EXTERNAL_VECTOR + offset; + } + + if (unlikely(current_vector == vector)) { + cpumask_or(cfg->old_domain, cfg->old_domain, tmp_mask); + cpumask_andnot(tmp_mask, mask, cfg->old_domain); + cpu = cpumask_first_and(tmp_mask, cpu_online_mask); + continue; + } + + if (test_bit(vector, used_vectors)) + goto next; + + for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) { + if (per_cpu(vector_irq, new_cpu)[vector] > + VECTOR_UNDEFINED) + goto next; + } + /* Found one! 
*/ + current_vector = vector; + current_offset = offset; + if (cfg->vector) { + cpumask_copy(cfg->old_domain, cfg->domain); + cfg->move_in_progress = + cpumask_intersects(cfg->old_domain, cpu_online_mask); + } + for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) + per_cpu(vector_irq, new_cpu)[vector] = irq; + cfg->vector = vector; + cpumask_copy(cfg->domain, tmp_mask); + err = 0; + break; + } + free_cpumask_var(tmp_mask); + + return err; +} + +int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) +{ + int err; + unsigned long flags; + + raw_spin_lock_irqsave(&vector_lock, flags); + err = __assign_irq_vector(irq, cfg, mask); + raw_spin_unlock_irqrestore(&vector_lock, flags); + return err; +} + +void clear_irq_vector(int irq, struct irq_cfg *cfg) +{ + int cpu, vector; + unsigned long flags; + + raw_spin_lock_irqsave(&vector_lock, flags); + BUG_ON(!cfg->vector); + + vector = cfg->vector; + for_each_cpu_and(cpu, cfg->domain, cpu_online_mask) + per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED; + + cfg->vector = 0; + cpumask_clear(cfg->domain); + + if (likely(!cfg->move_in_progress)) { + raw_spin_unlock_irqrestore(&vector_lock, flags); + return; + } + + for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) { + for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; + vector++) { + if (per_cpu(vector_irq, cpu)[vector] != irq) + continue; + per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED; + break; + } + } + cfg->move_in_progress = 0; + raw_spin_unlock_irqrestore(&vector_lock, flags); +} + +int __init arch_probe_nr_irqs(void) +{ + int nr; + + if (nr_irqs > (NR_VECTORS * nr_cpu_ids)) + nr_irqs = NR_VECTORS * nr_cpu_ids; + + nr = (gsi_top + nr_legacy_irqs()) + 8 * nr_cpu_ids; +#if defined(CONFIG_PCI_MSI) || defined(CONFIG_HT_IRQ) + /* + * for MSI and HT dyn irq + */ + if (gsi_top <= NR_IRQS_LEGACY) + nr += 8 * nr_cpu_ids; + else + nr += gsi_top * 16; +#endif + if (nr < nr_irqs) + nr_irqs = nr; + + return nr_legacy_irqs(); +} + +int __init arch_early_irq_init(void) +{ + return arch_early_ioapic_init(); +} + +static void __setup_vector_irq(int cpu) +{ + /* Initialize vector_irq on a new cpu */ + int irq, vector; + struct irq_cfg *cfg; + + /* + * vector_lock will make sure that we don't run into irq vector + * assignments that might be happening on another cpu in parallel, + * while we setup our initial vector to irq mappings. + */ + raw_spin_lock(&vector_lock); + /* Mark the inuse vectors */ + for_each_active_irq(irq) { + cfg = irq_cfg(irq); + if (!cfg) + continue; + + if (!cpumask_test_cpu(cpu, cfg->domain)) + continue; + vector = cfg->vector; + per_cpu(vector_irq, cpu)[vector] = irq; + } + /* Mark the free vectors */ + for (vector = 0; vector < NR_VECTORS; ++vector) { + irq = per_cpu(vector_irq, cpu)[vector]; + if (irq <= VECTOR_UNDEFINED) + continue; + + cfg = irq_cfg(irq); + if (!cpumask_test_cpu(cpu, cfg->domain)) + per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED; + } + raw_spin_unlock(&vector_lock); +} + +/* + * Setup the vector to irq mappings. + */ +void setup_vector_irq(int cpu) +{ + int irq; + + /* + * On most of the platforms, legacy PIC delivers the interrupts on the + * boot cpu. But there are certain platforms where PIC interrupts are + * delivered to multiple cpu's. 
If the legacy IRQ is handled by the + * legacy PIC, for the new cpu that is coming online, setup the static + * legacy vector to irq mapping: + */ + for (irq = 0; irq < nr_legacy_irqs(); irq++) + per_cpu(vector_irq, cpu)[IRQ0_VECTOR + irq] = irq; + + __setup_vector_irq(cpu); +} + +int apic_retrigger_irq(struct irq_data *data) +{ + struct irq_cfg *cfg = irqd_cfg(data); + unsigned long flags; + int cpu; + + raw_spin_lock_irqsave(&vector_lock, flags); + cpu = cpumask_first_and(cfg->domain, cpu_online_mask); + apic->send_IPI_mask(cpumask_of(cpu), cfg->vector); + raw_spin_unlock_irqrestore(&vector_lock, flags); + + return 1; +} + +void apic_ack_edge(struct irq_data *data) +{ + irq_complete_move(irqd_cfg(data)); + irq_move_irq(data); + ack_APIC_irq(); +} + +/* + * Either sets data->affinity to a valid value, and returns + * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and + * leaves data->affinity untouched. + */ +int apic_set_affinity(struct irq_data *data, const struct cpumask *mask, + unsigned int *dest_id) +{ + struct irq_cfg *cfg = irqd_cfg(data); + unsigned int irq = data->irq; + int err; + + if (!config_enabled(CONFIG_SMP)) + return -EPERM; + + if (!cpumask_intersects(mask, cpu_online_mask)) + return -EINVAL; + + err = assign_irq_vector(irq, cfg, mask); + if (err) + return err; + + err = apic->cpu_mask_to_apicid_and(mask, cfg->domain, dest_id); + if (err) { + if (assign_irq_vector(irq, cfg, data->affinity)) + pr_err("Failed to recover vector for irq %d\n", irq); + return err; + } + + cpumask_copy(data->affinity, mask); + + return 0; +} + +#ifdef CONFIG_SMP +void send_cleanup_vector(struct irq_cfg *cfg) +{ + cpumask_var_t cleanup_mask; + + if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) { + unsigned int i; + + for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) + apic->send_IPI_mask(cpumask_of(i), + IRQ_MOVE_CLEANUP_VECTOR); + } else { + cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask); + apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); + free_cpumask_var(cleanup_mask); + } + cfg->move_in_progress = 0; +} + +asmlinkage __visible void smp_irq_move_cleanup_interrupt(void) +{ + unsigned vector, me; + + ack_APIC_irq(); + irq_enter(); + exit_idle(); + + me = smp_processor_id(); + for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { + int irq; + unsigned int irr; + struct irq_desc *desc; + struct irq_cfg *cfg; + + irq = __this_cpu_read(vector_irq[vector]); + + if (irq <= VECTOR_UNDEFINED) + continue; + + desc = irq_to_desc(irq); + if (!desc) + continue; + + cfg = irq_cfg(irq); + if (!cfg) + continue; + + raw_spin_lock(&desc->lock); + + /* + * Check if the irq migration is in progress. If so, we + * haven't received the cleanup request yet for this irq. + */ + if (cfg->move_in_progress) + goto unlock; + + if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) + goto unlock; + + irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); + /* + * Check if the vector that needs to be cleanedup is + * registered at the cpu's IRR. If so, then this is not + * the best time to clean it up. Lets clean it up in the + * next attempt by sending another IRQ_MOVE_CLEANUP_VECTOR + * to myself. 
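The IRR test in the statement that follows relies on the 256-bit IRR (like ISR and TMR) being exposed as eight 32-bit registers spaced 0x10 apart: the register for a vector is APIC_IRR + (vector / 32) * 0x10 and the bit within it is vector % 32. A small standalone sketch of the index math, with a dummy register file standing in for apic_read():

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define APIC_IRR        0x200   /* offset of the first of eight IRR registers */

static uint32_t fake_apic[0x400 / 4];   /* stand-in for the MMIO register page */

static uint32_t apic_read(uint32_t reg)
{
        return fake_apic[reg / 4];
}

/* Is 'vector' currently pending in the local APIC's IRR? */
static bool vector_pending_in_irr(unsigned int vector)
{
        uint32_t reg = APIC_IRR + (vector / 32) * 0x10;

        return apic_read(reg) & (1u << (vector % 32));
}

int main(void)
{
        /* Pretend vector 0x31 is pending: register 0x210, bit 17. */
        fake_apic[(APIC_IRR + (0x31 / 32) * 0x10) / 4] = 1u << (0x31 % 32);
        printf("vector 0x31 pending: %d\n", vector_pending_in_irr(0x31));
        return 0;
}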
+ */ + if (irr & (1 << (vector % 32))) { + apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR); + goto unlock; + } + __this_cpu_write(vector_irq[vector], VECTOR_UNDEFINED); +unlock: + raw_spin_unlock(&desc->lock); + } + + irq_exit(); +} + +static void __irq_complete_move(struct irq_cfg *cfg, unsigned vector) +{ + unsigned me; + + if (likely(!cfg->move_in_progress)) + return; + + me = smp_processor_id(); + + if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) + send_cleanup_vector(cfg); +} + +void irq_complete_move(struct irq_cfg *cfg) +{ + __irq_complete_move(cfg, ~get_irq_regs()->orig_ax); +} + +void irq_force_complete_move(int irq) +{ + struct irq_cfg *cfg = irq_cfg(irq); + + if (!cfg) + return; + + __irq_complete_move(cfg, cfg->vector); +} +#endif + +/* + * Dynamic irq allocate and deallocation. Should be replaced by irq domains! + */ +int arch_setup_hwirq(unsigned int irq, int node) +{ + struct irq_cfg *cfg; + unsigned long flags; + int ret; + + cfg = alloc_irq_cfg(irq, node); + if (!cfg) + return -ENOMEM; + + raw_spin_lock_irqsave(&vector_lock, flags); + ret = __assign_irq_vector(irq, cfg, apic->target_cpus()); + raw_spin_unlock_irqrestore(&vector_lock, flags); + + if (!ret) + irq_set_chip_data(irq, cfg); + else + free_irq_cfg(irq, cfg); + return ret; +} + +void arch_teardown_hwirq(unsigned int irq) +{ + struct irq_cfg *cfg = irq_cfg(irq); + + free_remapped_irq(irq); + clear_irq_vector(irq, cfg); + free_irq_cfg(irq, cfg); +} + +static void __init print_APIC_field(int base) +{ + int i; + + printk(KERN_DEBUG); + + for (i = 0; i < 8; i++) + pr_cont("%08x", apic_read(base + i*0x10)); + + pr_cont("\n"); +} + +static void __init print_local_APIC(void *dummy) +{ + unsigned int i, v, ver, maxlvt; + u64 icr; + + pr_debug("printing local APIC contents on CPU#%d/%d:\n", + smp_processor_id(), hard_smp_processor_id()); + v = apic_read(APIC_ID); + pr_info("... APIC ID: %08x (%01x)\n", v, read_apic_id()); + v = apic_read(APIC_LVR); + pr_info("... APIC VERSION: %08x\n", v); + ver = GET_APIC_VERSION(v); + maxlvt = lapic_get_maxlvt(); + + v = apic_read(APIC_TASKPRI); + pr_debug("... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK); + + /* !82489DX */ + if (APIC_INTEGRATED(ver)) { + if (!APIC_XAPIC(ver)) { + v = apic_read(APIC_ARBPRI); + pr_debug("... APIC ARBPRI: %08x (%02x)\n", + v, v & APIC_ARBPRI_MASK); + } + v = apic_read(APIC_PROCPRI); + pr_debug("... APIC PROCPRI: %08x\n", v); + } + + /* + * Remote read supported only in the 82489DX and local APIC for + * Pentium processors. + */ + if (!APIC_INTEGRATED(ver) || maxlvt == 3) { + v = apic_read(APIC_RRR); + pr_debug("... APIC RRR: %08x\n", v); + } + + v = apic_read(APIC_LDR); + pr_debug("... APIC LDR: %08x\n", v); + if (!x2apic_enabled()) { + v = apic_read(APIC_DFR); + pr_debug("... APIC DFR: %08x\n", v); + } + v = apic_read(APIC_SPIV); + pr_debug("... APIC SPIV: %08x\n", v); + + pr_debug("... APIC ISR field:\n"); + print_APIC_field(APIC_ISR); + pr_debug("... APIC TMR field:\n"); + print_APIC_field(APIC_TMR); + pr_debug("... APIC IRR field:\n"); + print_APIC_field(APIC_IRR); + + /* !82489DX */ + if (APIC_INTEGRATED(ver)) { + /* Due to the Pentium erratum 3AP. */ + if (maxlvt > 3) + apic_write(APIC_ESR, 0); + + v = apic_read(APIC_ESR); + pr_debug("... APIC ESR: %08x\n", v); + } + + icr = apic_icr_read(); + pr_debug("... APIC ICR: %08x\n", (u32)icr); + pr_debug("... APIC ICR2: %08x\n", (u32)(icr >> 32)); + + v = apic_read(APIC_LVTT); + pr_debug("... APIC LVTT: %08x\n", v); + + if (maxlvt > 3) { + /* PC is LVT#4. 
*/ + v = apic_read(APIC_LVTPC); + pr_debug("... APIC LVTPC: %08x\n", v); + } + v = apic_read(APIC_LVT0); + pr_debug("... APIC LVT0: %08x\n", v); + v = apic_read(APIC_LVT1); + pr_debug("... APIC LVT1: %08x\n", v); + + if (maxlvt > 2) { + /* ERR is LVT#3. */ + v = apic_read(APIC_LVTERR); + pr_debug("... APIC LVTERR: %08x\n", v); + } + + v = apic_read(APIC_TMICT); + pr_debug("... APIC TMICT: %08x\n", v); + v = apic_read(APIC_TMCCT); + pr_debug("... APIC TMCCT: %08x\n", v); + v = apic_read(APIC_TDCR); + pr_debug("... APIC TDCR: %08x\n", v); + + if (boot_cpu_has(X86_FEATURE_EXTAPIC)) { + v = apic_read(APIC_EFEAT); + maxlvt = (v >> 16) & 0xff; + pr_debug("... APIC EFEAT: %08x\n", v); + v = apic_read(APIC_ECTRL); + pr_debug("... APIC ECTRL: %08x\n", v); + for (i = 0; i < maxlvt; i++) { + v = apic_read(APIC_EILVTn(i)); + pr_debug("... APIC EILVT%d: %08x\n", i, v); + } + } + pr_cont("\n"); +} + +static void __init print_local_APICs(int maxcpu) +{ + int cpu; + + if (!maxcpu) + return; + + preempt_disable(); + for_each_online_cpu(cpu) { + if (cpu >= maxcpu) + break; + smp_call_function_single(cpu, print_local_APIC, NULL, 1); + } + preempt_enable(); +} + +static void __init print_PIC(void) +{ + unsigned int v; + unsigned long flags; + + if (!nr_legacy_irqs()) + return; + + pr_debug("\nprinting PIC contents\n"); + + raw_spin_lock_irqsave(&i8259A_lock, flags); + + v = inb(0xa1) << 8 | inb(0x21); + pr_debug("... PIC IMR: %04x\n", v); + + v = inb(0xa0) << 8 | inb(0x20); + pr_debug("... PIC IRR: %04x\n", v); + + outb(0x0b, 0xa0); + outb(0x0b, 0x20); + v = inb(0xa0) << 8 | inb(0x20); + outb(0x0a, 0xa0); + outb(0x0a, 0x20); + + raw_spin_unlock_irqrestore(&i8259A_lock, flags); + + pr_debug("... PIC ISR: %04x\n", v); + + v = inb(0x4d1) << 8 | inb(0x4d0); + pr_debug("... 
PIC ELCR: %04x\n", v); +} + +static int show_lapic __initdata = 1; +static __init int setup_show_lapic(char *arg) +{ + int num = -1; + + if (strcmp(arg, "all") == 0) { + show_lapic = CONFIG_NR_CPUS; + } else { + get_option(&arg, &num); + if (num >= 0) + show_lapic = num; + } + + return 1; +} +__setup("show_lapic=", setup_show_lapic); + +static int __init print_ICs(void) +{ + if (apic_verbosity == APIC_QUIET) + return 0; + + print_PIC(); + + /* don't print out if apic is not there */ + if (!cpu_has_apic && !apic_from_smp_config()) + return 0; + + print_local_APICs(show_lapic); + print_IO_APICs(); + + return 0; +} + +late_initcall(print_ICs); diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 584874451414..927ec9235947 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -378,7 +378,6 @@ static struct cpuidle_driver apm_idle_driver = { { /* entry 1 is for APM idle */ .name = "APM", .desc = "APM idle", - .flags = CPUIDLE_FLAG_TIME_VALID, .exit_latency = 250, /* WAG */ .target_residency = 500, /* WAG */ .enter = &apm_cpu_idle diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index d67c4be3e8b1..3b3b9d33ac1d 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -1,3 +1,7 @@ +#ifndef __LINUX_KBUILD_H +# error "Please do not build this file directly, build asm-offsets.c instead" +#endif + #include <asm/ucontext.h> #include <linux/lguest.h> diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 4f9359f36bb7..fdcbb4d27c9f 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -1,3 +1,7 @@ +#ifndef __LINUX_KBUILD_H +# error "Please do not build this file directly, build asm-offsets.c instead" +#endif + #include <asm/ia32.h> #define __SYSCALL_64(nr, sym, compat) [nr] = 1, diff --git a/arch/x86/kernel/audit_64.c b/arch/x86/kernel/audit_64.c index 06d3e5a14d9d..f3672508b249 100644 --- a/arch/x86/kernel/audit_64.c +++ b/arch/x86/kernel/audit_64.c @@ -50,6 +50,7 @@ int audit_classify_syscall(int abi, unsigned syscall) case __NR_openat: return 3; case __NR_execve: + case __NR_execveat: return 5; default: return 0; diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index e27b49d7c922..80091ae54c2b 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -66,3 +66,4 @@ targets += capflags.c $(obj)/capflags.c: $(cpufeature) $(src)/mkcapflags.sh FORCE $(call if_changed,mkcapflags) endif +clean-files += capflags.c diff --git a/arch/x86/kernel/cpu/mkcapflags.sh b/arch/x86/kernel/cpu/mkcapflags.sh index e2b22df964cd..36d99a337b49 100644 --- a/arch/x86/kernel/cpu/mkcapflags.sh +++ b/arch/x86/kernel/cpu/mkcapflags.sh @@ -28,7 +28,7 @@ function dump_array() # If the /* comment */ starts with a quote string, grab that. 
VALUE="$(echo "$i" | sed -n 's@.*/\* *\("[^"]*"\).*\*/@\1@p')" [ -z "$VALUE" ] && VALUE="\"$NAME\"" - [ "$VALUE" == '""' ] && continue + [ "$VALUE" = '""' ] && continue # Name is uppercase, VALUE is all lowercase VALUE="$(echo "$VALUE" | tr A-Z a-z)" diff --git a/arch/x86/kernel/cpu/perf_event_amd_iommu.c b/arch/x86/kernel/cpu/perf_event_amd_iommu.c index 639d1289b1ba..97242a9242bd 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_iommu.c +++ b/arch/x86/kernel/cpu/perf_event_amd_iommu.c @@ -130,10 +130,7 @@ static ssize_t _iommu_cpumask_show(struct device *dev, struct device_attribute *attr, char *buf) { - int n = cpulist_scnprintf(buf, PAGE_SIZE - 2, &iommu_cpumask); - buf[n++] = '\n'; - buf[n] = '\0'; - return n; + return cpumap_print_to_pagebuf(true, buf, &iommu_cpumask); } static DEVICE_ATTR(cpumask, S_IRUGO, _iommu_cpumask_show, NULL); diff --git a/arch/x86/kernel/cpu/perf_event_amd_uncore.c b/arch/x86/kernel/cpu/perf_event_amd_uncore.c index 30790d798e6b..cc6cedb8f25d 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_amd_uncore.c @@ -219,7 +219,6 @@ static ssize_t amd_uncore_attr_show_cpumask(struct device *dev, struct device_attribute *attr, char *buf) { - int n; cpumask_t *active_mask; struct pmu *pmu = dev_get_drvdata(dev); @@ -230,10 +229,7 @@ static ssize_t amd_uncore_attr_show_cpumask(struct device *dev, else return 0; - n = cpulist_scnprintf(buf, PAGE_SIZE - 2, active_mask); - buf[n++] = '\n'; - buf[n] = '\0'; - return n; + return cpumap_print_to_pagebuf(true, buf, active_mask); } static DEVICE_ATTR(cpumask, S_IRUGO, amd_uncore_attr_show_cpumask, NULL); diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 3c895d480cd7..073983398364 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -568,8 +568,8 @@ struct event_constraint intel_atom_pebs_event_constraints[] = { }; struct event_constraint intel_slm_pebs_event_constraints[] = { - /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */ - INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf), + /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). 
*/ + INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x1), /* Allow all events as PEBS with no flags */ INTEL_ALL_EVENT_CONSTRAINT(0, 0x1), EVENT_CONSTRAINT_END diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c index d64f275fe274..6e434f8e5fc8 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c +++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c @@ -103,6 +103,13 @@ static struct kobj_attribute format_attr_##_var = \ #define RAPL_CNTR_WIDTH 32 /* 32-bit rapl counters */ +#define RAPL_EVENT_ATTR_STR(_name, v, str) \ +static struct perf_pmu_events_attr event_attr_##v = { \ + .attr = __ATTR(_name, 0444, rapl_sysfs_show, NULL), \ + .id = 0, \ + .event_str = str, \ +}; + struct rapl_pmu { spinlock_t lock; int hw_unit; /* 1/2^hw_unit Joule */ @@ -365,11 +372,7 @@ static void rapl_pmu_event_read(struct perf_event *event) static ssize_t rapl_get_attr_cpumask(struct device *dev, struct device_attribute *attr, char *buf) { - int n = cpulist_scnprintf(buf, PAGE_SIZE - 2, &rapl_cpu_mask); - - buf[n++] = '\n'; - buf[n] = '\0'; - return n; + return cpumap_print_to_pagebuf(true, buf, &rapl_cpu_mask); } static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL); @@ -383,23 +386,36 @@ static struct attribute_group rapl_pmu_attr_group = { .attrs = rapl_pmu_attrs, }; -EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01"); -EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02"); -EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03"); -EVENT_ATTR_STR(energy-gpu , rapl_gpu, "event=0x04"); +static ssize_t rapl_sysfs_show(struct device *dev, + struct device_attribute *attr, + char *page) +{ + struct perf_pmu_events_attr *pmu_attr = \ + container_of(attr, struct perf_pmu_events_attr, attr); + + if (pmu_attr->event_str) + return sprintf(page, "%s", pmu_attr->event_str); + + return 0; +} + +RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01"); +RAPL_EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02"); +RAPL_EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03"); +RAPL_EVENT_ATTR_STR(energy-gpu , rapl_gpu, "event=0x04"); -EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules"); -EVENT_ATTR_STR(energy-pkg.unit , rapl_pkg_unit, "Joules"); -EVENT_ATTR_STR(energy-ram.unit , rapl_ram_unit, "Joules"); -EVENT_ATTR_STR(energy-gpu.unit , rapl_gpu_unit, "Joules"); +RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules"); +RAPL_EVENT_ATTR_STR(energy-pkg.unit , rapl_pkg_unit, "Joules"); +RAPL_EVENT_ATTR_STR(energy-ram.unit , rapl_ram_unit, "Joules"); +RAPL_EVENT_ATTR_STR(energy-gpu.unit , rapl_gpu_unit, "Joules"); /* * we compute in 0.23 nJ increments regardless of MSR */ -EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10"); -EVENT_ATTR_STR(energy-pkg.scale, rapl_pkg_scale, "2.3283064365386962890625e-10"); -EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10"); -EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10"); +RAPL_EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10"); +RAPL_EVENT_ATTR_STR(energy-pkg.scale, rapl_pkg_scale, "2.3283064365386962890625e-10"); +RAPL_EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10"); +RAPL_EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10"); static struct attribute *rapl_events_srv_attr[] = { EVENT_PTR(rapl_cores), diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index 
9762dbd9f3f7..10b8d3eaaf15 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -276,6 +276,17 @@ static struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, return box; } +/* + * Using uncore_pmu_event_init pmu event_init callback + * as a detection point for uncore events. + */ +static int uncore_pmu_event_init(struct perf_event *event); + +static bool is_uncore_event(struct perf_event *event) +{ + return event->pmu->event_init == uncore_pmu_event_init; +} + static int uncore_collect_events(struct intel_uncore_box *box, struct perf_event *leader, bool dogrp) { @@ -290,13 +301,18 @@ uncore_collect_events(struct intel_uncore_box *box, struct perf_event *leader, b return -EINVAL; n = box->n_events; - box->event_list[n] = leader; - n++; + + if (is_uncore_event(leader)) { + box->event_list[n] = leader; + n++; + } + if (!dogrp) return n; list_for_each_entry(event, &leader->sibling_list, group_entry) { - if (event->state <= PERF_EVENT_STATE_OFF) + if (!is_uncore_event(event) || + event->state <= PERF_EVENT_STATE_OFF) continue; if (n >= max_count) @@ -647,11 +663,7 @@ static int uncore_pmu_event_init(struct perf_event *event) static ssize_t uncore_get_attr_cpumask(struct device *dev, struct device_attribute *attr, char *buf) { - int n = cpulist_scnprintf(buf, PAGE_SIZE - 2, &uncore_cpu_mask); - - buf[n++] = '\n'; - buf[n] = '\0'; - return n; + return cpumap_print_to_pagebuf(true, buf, &uncore_cpu_mask); } static DEVICE_ATTR(cpumask, S_IRUGO, uncore_get_attr_cpumask, NULL); diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h index 18eb78bbdd10..863d9b02563e 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h @@ -17,7 +17,7 @@ #define UNCORE_PCI_DEV_TYPE(data) ((data >> 8) & 0xff) #define UNCORE_PCI_DEV_IDX(data) (data & 0xff) #define UNCORE_EXTRA_PCI_DEV 0xff -#define UNCORE_EXTRA_PCI_DEV_MAX 2 +#define UNCORE_EXTRA_PCI_DEV_MAX 3 /* support up to 8 sockets */ #define UNCORE_SOCKET_MAX 8 diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c index 745b158e9a65..21af6149edf2 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c @@ -891,6 +891,7 @@ void snbep_uncore_cpu_init(void) enum { SNBEP_PCI_QPI_PORT0_FILTER, SNBEP_PCI_QPI_PORT1_FILTER, + HSWEP_PCI_PCU_3, }; static int snbep_qpi_hw_config(struct intel_uncore_box *box, struct perf_event *event) @@ -2026,6 +2027,17 @@ void hswep_uncore_cpu_init(void) { if (hswep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) hswep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; + + /* Detect 6-8 core systems with only two SBOXes */ + if (uncore_extra_pci_dev[0][HSWEP_PCI_PCU_3]) { + u32 capid4; + + pci_read_config_dword(uncore_extra_pci_dev[0][HSWEP_PCI_PCU_3], + 0x94, &capid4); + if (((capid4 >> 6) & 0x3) == 0) + hswep_uncore_sbox.num_boxes = 2; + } + uncore_msr_uncores = hswep_msr_uncores; } @@ -2287,6 +2299,11 @@ static DEFINE_PCI_DEVICE_TABLE(hswep_uncore_pci_ids) = { .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, SNBEP_PCI_QPI_PORT1_FILTER), }, + { /* PCU.3 (for Capability registers) */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fc0), + .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, + HSWEP_PCI_PCU_3), + }, { /* end: all zeroes */ } }; diff --git a/arch/x86/kernel/cpu/scattered.c 
b/arch/x86/kernel/cpu/scattered.c index 4a8013d55947..60639093d536 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -36,6 +36,11 @@ void init_scattered_cpuid_features(struct cpuinfo_x86 *c) { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 }, { X86_FEATURE_PLN, CR_EAX, 4, 0x00000006, 0 }, { X86_FEATURE_PTS, CR_EAX, 6, 0x00000006, 0 }, + { X86_FEATURE_HWP, CR_EAX, 7, 0x00000006, 0 }, + { X86_FEATURE_HWP_NOITFY, CR_EAX, 8, 0x00000006, 0 }, + { X86_FEATURE_HWP_ACT_WINDOW, CR_EAX, 9, 0x00000006, 0 }, + { X86_FEATURE_HWP_EPP, CR_EAX,10, 0x00000006, 0 }, + { X86_FEATURE_HWP_PKG_REQ, CR_EAX,11, 0x00000006, 0 }, { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 }, { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 }, { X86_FEATURE_HW_PSTATE, CR_EDX, 7, 0x80000007, 0 }, diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index f5ab56d14287..aceb2f90c716 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -28,6 +28,7 @@ #include <asm/nmi.h> #include <asm/hw_irq.h> #include <asm/apic.h> +#include <asm/io_apic.h> #include <asm/hpet.h> #include <linux/kdebug.h> #include <asm/cpu.h> diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 49f886481615..dd2f07ae9d0c 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1114,8 +1114,8 @@ void __init memblock_find_dma_reserve(void) * at first, and assume boot_mem will not take below MAX_DMA_PFN */ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { - start_pfn = min_t(unsigned long, start_pfn, MAX_DMA_PFN); - end_pfn = min_t(unsigned long, end_pfn, MAX_DMA_PFN); + start_pfn = min(start_pfn, MAX_DMA_PFN); + end_pfn = min(end_pfn, MAX_DMA_PFN); nr_pages += end_pfn - start_pfn; } diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 2e1a6853e00c..fe9f0b79a18b 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -455,6 +455,23 @@ struct intel_stolen_funcs { u32 (*base)(int num, int slot, int func, size_t size); }; +static size_t __init gen9_stolen_size(int num, int slot, int func) +{ + u16 gmch_ctrl; + + gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL); + gmch_ctrl >>= BDW_GMCH_GMS_SHIFT; + gmch_ctrl &= BDW_GMCH_GMS_MASK; + + if (gmch_ctrl < 0xf0) + return gmch_ctrl << 25; /* 32 MB units */ + else + /* 4MB increments starting at 0xf0 for 4MB */ + return (gmch_ctrl - 0xf0 + 1) << 22; +} + +typedef size_t (*stolen_size_fn)(int num, int slot, int func); + static const struct intel_stolen_funcs i830_stolen_funcs __initconst = { .base = i830_stolen_base, .size = i830_stolen_size, @@ -490,6 +507,11 @@ static const struct intel_stolen_funcs gen8_stolen_funcs __initconst = { .size = gen8_stolen_size, }; +static const struct intel_stolen_funcs gen9_stolen_funcs __initconst = { + .base = intel_stolen_base, + .size = gen9_stolen_size, +}; + static const struct intel_stolen_funcs chv_stolen_funcs __initconst = { .base = intel_stolen_base, .size = chv_stolen_size, @@ -523,6 +545,7 @@ static const struct pci_device_id intel_stolen_ids[] __initconst = { INTEL_BDW_M_IDS(&gen8_stolen_funcs), INTEL_BDW_D_IDS(&gen8_stolen_funcs), INTEL_CHV_IDS(&chv_stolen_funcs), + INTEL_SKL_IDS(&gen9_stolen_funcs), }; static void __init intel_graphics_stolen(int num, int slot, int func) diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 344b63f18d14..000d4199b03e 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -732,10 +732,10 @@ ENTRY(interrupt) ENTRY(irq_entries_start) 
RING0_INT_FRAME vector=FIRST_EXTERNAL_VECTOR -.rept (NR_VECTORS-FIRST_EXTERNAL_VECTOR+6)/7 +.rept (FIRST_SYSTEM_VECTOR-FIRST_EXTERNAL_VECTOR+6)/7 .balign 32 .rept 7 - .if vector < NR_VECTORS + .if vector < FIRST_SYSTEM_VECTOR .if vector <> FIRST_EXTERNAL_VECTOR CFI_ADJUST_CFA_OFFSET -4 .endif @@ -1191,10 +1191,10 @@ ENTRY(ftrace_graph_caller) pushl %eax pushl %ecx pushl %edx - movl 0xc(%esp), %edx - lea 0x4(%ebp), %eax + movl 0xc(%esp), %eax + lea 0x4(%ebp), %edx movl (%ebp), %ecx - subl $MCOUNT_INSN_SIZE, %edx + subl $MCOUNT_INSN_SIZE, %eax call prepare_ftrace_return popl %edx popl %ecx diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index c0226ab54106..9ebaf63ba182 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -652,6 +652,20 @@ ENTRY(stub_execve) CFI_ENDPROC END(stub_execve) +ENTRY(stub_execveat) + CFI_STARTPROC + addq $8, %rsp + PARTIAL_FRAME 0 + SAVE_REST + FIXUP_TOP_OF_STACK %r11 + call sys_execveat + RESTORE_TOP_OF_STACK %r11 + movq %rax,RAX(%rsp) + RESTORE_REST + jmp int_ret_from_sys_call + CFI_ENDPROC +END(stub_execveat) + /* * sigreturn is special because it needs to restore all registers on return. * This cannot be done with SYSRET, so use the IRET return path instead. @@ -697,6 +711,20 @@ ENTRY(stub_x32_execve) CFI_ENDPROC END(stub_x32_execve) +ENTRY(stub_x32_execveat) + CFI_STARTPROC + addq $8, %rsp + PARTIAL_FRAME 0 + SAVE_REST + FIXUP_TOP_OF_STACK %r11 + call compat_sys_execveat + RESTORE_TOP_OF_STACK %r11 + movq %rax,RAX(%rsp) + RESTORE_REST + jmp int_ret_from_sys_call + CFI_ENDPROC +END(stub_x32_execveat) + #endif /* @@ -712,10 +740,10 @@ ENTRY(interrupt) ENTRY(irq_entries_start) INTR_FRAME vector=FIRST_EXTERNAL_VECTOR -.rept (NR_VECTORS-FIRST_EXTERNAL_VECTOR+6)/7 +.rept (FIRST_SYSTEM_VECTOR-FIRST_EXTERNAL_VECTOR+6)/7 .balign 32 .rept 7 - .if vector < NR_VECTORS + .if vector < FIRST_SYSTEM_VECTOR .if vector <> FIRST_EXTERNAL_VECTOR CFI_ADJUST_CFA_OFFSET -8 .endif diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 3386dc9aa333..8b7b0a51e742 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -17,6 +17,7 @@ #include <linux/ftrace.h> #include <linux/percpu.h> #include <linux/sched.h> +#include <linux/slab.h> #include <linux/init.h> #include <linux/list.h> #include <linux/module.h> @@ -47,7 +48,7 @@ int ftrace_arch_code_modify_post_process(void) union ftrace_code_union { char code[MCOUNT_INSN_SIZE]; struct { - char e8; + unsigned char e8; int offset; } __attribute__((packed)); }; @@ -582,7 +583,7 @@ void ftrace_replace_code(int enable) remove_breakpoints: pr_warn("Failed on %s (%d):\n", report, count); - ftrace_bug(ret, rec ? 
rec->ip : 0); + ftrace_bug(ret, rec); for_ftrace_rec_iter(iter) { rec = ftrace_rec_iter_record(iter); /* @@ -644,13 +645,8 @@ int __init ftrace_dyn_arch_init(void) { return 0; } -#endif - -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - -#ifdef CONFIG_DYNAMIC_FTRACE -extern void ftrace_graph_call(void); +#if defined(CONFIG_X86_64) || defined(CONFIG_FUNCTION_GRAPH_TRACER) static unsigned char *ftrace_jmp_replace(unsigned long ip, unsigned long addr) { static union ftrace_code_union calc; @@ -664,6 +660,280 @@ static unsigned char *ftrace_jmp_replace(unsigned long ip, unsigned long addr) */ return calc.code; } +#endif + +/* Currently only x86_64 supports dynamic trampolines */ +#ifdef CONFIG_X86_64 + +#ifdef CONFIG_MODULES +#include <linux/moduleloader.h> +/* Module allocation simplifies allocating memory for code */ +static inline void *alloc_tramp(unsigned long size) +{ + return module_alloc(size); +} +static inline void tramp_free(void *tramp) +{ + module_memfree(tramp); +} +#else +/* Trampolines can only be created if modules are supported */ +static inline void *alloc_tramp(unsigned long size) +{ + return NULL; +} +static inline void tramp_free(void *tramp) { } +#endif + +/* Defined as markers to the end of the ftrace default trampolines */ +extern void ftrace_caller_end(void); +extern void ftrace_regs_caller_end(void); +extern void ftrace_return(void); +extern void ftrace_caller_op_ptr(void); +extern void ftrace_regs_caller_op_ptr(void); + +/* movq function_trace_op(%rip), %rdx */ +/* 0x48 0x8b 0x15 <offset-to-ftrace_trace_op (4 bytes)> */ +#define OP_REF_SIZE 7 + +/* + * The ftrace_ops is passed to the function callback. Since the + * trampoline only services a single ftrace_ops, we can pass in + * that ops directly. + * + * The ftrace_op_code_union is used to create a pointer to the + * ftrace_ops that will be passed to the callback function. + */ +union ftrace_op_code_union { + char code[OP_REF_SIZE]; + struct { + char op[3]; + int offset; + } __attribute__((packed)); +}; + +static unsigned long +create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) +{ + unsigned const char *jmp; + unsigned long start_offset; + unsigned long end_offset; + unsigned long op_offset; + unsigned long offset; + unsigned long size; + unsigned long ip; + unsigned long *ptr; + void *trampoline; + /* 48 8b 15 <offset> is movq <offset>(%rip), %rdx */ + unsigned const char op_ref[] = { 0x48, 0x8b, 0x15 }; + union ftrace_op_code_union op_ptr; + int ret; + + if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) { + start_offset = (unsigned long)ftrace_regs_caller; + end_offset = (unsigned long)ftrace_regs_caller_end; + op_offset = (unsigned long)ftrace_regs_caller_op_ptr; + } else { + start_offset = (unsigned long)ftrace_caller; + end_offset = (unsigned long)ftrace_caller_end; + op_offset = (unsigned long)ftrace_caller_op_ptr; + } + + size = end_offset - start_offset; + + /* + * Allocate enough size to store the ftrace_caller code, + * the jmp to ftrace_return, as well as the address of + * the ftrace_ops this trampoline is used for. 
+ */ + trampoline = alloc_tramp(size + MCOUNT_INSN_SIZE + sizeof(void *)); + if (!trampoline) + return 0; + + *tramp_size = size + MCOUNT_INSN_SIZE + sizeof(void *); + + /* Copy ftrace_caller onto the trampoline memory */ + ret = probe_kernel_read(trampoline, (void *)start_offset, size); + if (WARN_ON(ret < 0)) { + tramp_free(trampoline); + return 0; + } + + ip = (unsigned long)trampoline + size; + + /* The trampoline ends with a jmp to ftrace_return */ + jmp = ftrace_jmp_replace(ip, (unsigned long)ftrace_return); + memcpy(trampoline + size, jmp, MCOUNT_INSN_SIZE); + + /* + * The address of the ftrace_ops that is used for this trampoline + * is stored at the end of the trampoline. This will be used to + * load the third parameter for the callback. Basically, that + * location at the end of the trampoline takes the place of + * the global function_trace_op variable. + */ + + ptr = (unsigned long *)(trampoline + size + MCOUNT_INSN_SIZE); + *ptr = (unsigned long)ops; + + op_offset -= start_offset; + memcpy(&op_ptr, trampoline + op_offset, OP_REF_SIZE); + + /* Are we pointing to the reference? */ + if (WARN_ON(memcmp(op_ptr.op, op_ref, 3) != 0)) { + tramp_free(trampoline); + return 0; + } + + /* Load the contents of ptr into the callback parameter */ + offset = (unsigned long)ptr; + offset -= (unsigned long)trampoline + op_offset + OP_REF_SIZE; + + op_ptr.offset = offset; + + /* put in the new offset to the ftrace_ops */ + memcpy(trampoline + op_offset, &op_ptr, OP_REF_SIZE); + + /* ALLOC_TRAMP flags lets us know we created it */ + ops->flags |= FTRACE_OPS_FL_ALLOC_TRAMP; + + return (unsigned long)trampoline; +} + +static unsigned long calc_trampoline_call_offset(bool save_regs) +{ + unsigned long start_offset; + unsigned long call_offset; + + if (save_regs) { + start_offset = (unsigned long)ftrace_regs_caller; + call_offset = (unsigned long)ftrace_regs_call; + } else { + start_offset = (unsigned long)ftrace_caller; + call_offset = (unsigned long)ftrace_call; + } + + return call_offset - start_offset; +} + +void arch_ftrace_update_trampoline(struct ftrace_ops *ops) +{ + ftrace_func_t func; + unsigned char *new; + unsigned long offset; + unsigned long ip; + unsigned int size; + int ret; + + if (ops->trampoline) { + /* + * The ftrace_ops caller may set up its own trampoline. + * In such a case, this code must not modify it. 
+ */ + if (!(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) + return; + } else { + ops->trampoline = create_trampoline(ops, &size); + if (!ops->trampoline) + return; + ops->trampoline_size = size; + } + + offset = calc_trampoline_call_offset(ops->flags & FTRACE_OPS_FL_SAVE_REGS); + ip = ops->trampoline + offset; + + func = ftrace_ops_get_func(ops); + + /* Do a safe modify in case the trampoline is executing */ + new = ftrace_call_replace(ip, (unsigned long)func); + ret = update_ftrace_func(ip, new); + + /* The update should never fail */ + WARN_ON(ret); +} + +/* Return the address of the function the trampoline calls */ +static void *addr_from_call(void *ptr) +{ + union ftrace_code_union calc; + int ret; + + ret = probe_kernel_read(&calc, ptr, MCOUNT_INSN_SIZE); + if (WARN_ON_ONCE(ret < 0)) + return NULL; + + /* Make sure this is a call */ + if (WARN_ON_ONCE(calc.e8 != 0xe8)) { + pr_warn("Expected e8, got %x\n", calc.e8); + return NULL; + } + + return ptr + MCOUNT_INSN_SIZE + calc.offset; +} + +void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent, + unsigned long frame_pointer); + +/* + * If the ops->trampoline was not allocated, then it probably + * has a static trampoline func, or is the ftrace caller itself. + */ +static void *static_tramp_func(struct ftrace_ops *ops, struct dyn_ftrace *rec) +{ + unsigned long offset; + bool save_regs = rec->flags & FTRACE_FL_REGS_EN; + void *ptr; + + if (ops && ops->trampoline) { +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + /* + * We only know about function graph tracer setting as static + * trampoline. + */ + if (ops->trampoline == FTRACE_GRAPH_ADDR) + return (void *)prepare_ftrace_return; +#endif + return NULL; + } + + offset = calc_trampoline_call_offset(save_regs); + + if (save_regs) + ptr = (void *)FTRACE_REGS_ADDR + offset; + else + ptr = (void *)FTRACE_ADDR + offset; + + return addr_from_call(ptr); +} + +void *arch_ftrace_trampoline_func(struct ftrace_ops *ops, struct dyn_ftrace *rec) +{ + unsigned long offset; + + /* If we didn't allocate this trampoline, consider it static */ + if (!ops || !(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) + return static_tramp_func(ops, rec); + + offset = calc_trampoline_call_offset(ops->flags & FTRACE_OPS_FL_SAVE_REGS); + return addr_from_call((void *)ops->trampoline + offset); +} + +void arch_ftrace_trampoline_free(struct ftrace_ops *ops) +{ + if (!ops || !(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) + return; + + tramp_free((void *)ops->trampoline); + ops->trampoline = 0; +} + +#endif /* CONFIG_X86_64 */ +#endif /* CONFIG_DYNAMIC_FTRACE */ + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + +#ifdef CONFIG_DYNAMIC_FTRACE +extern void ftrace_graph_call(void); static int ftrace_mod_jmp(unsigned long ip, void *func) { @@ -694,7 +964,7 @@ int ftrace_disable_ftrace_graph_caller(void) * Hook the return address and push it in the stack of return addrs * in current thread info. */ -void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, +void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent, unsigned long frame_pointer) { unsigned long old; diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 4de73ee78361..70e181ea1eac 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -99,32 +99,9 @@ void __init init_IRQ(void) x86_init.irqs.intr_init(); } -/* - * Setup the vector to irq mappings. 
- */ -void setup_vector_irq(int cpu) -{ -#ifndef CONFIG_X86_IO_APIC - int irq; - - /* - * On most of the platforms, legacy PIC delivers the interrupts on the - * boot cpu. But there are certain platforms where PIC interrupts are - * delivered to multiple cpu's. If the legacy IRQ is handled by the - * legacy PIC, for the new cpu that is coming online, setup the static - * legacy vector to irq mapping: - */ - for (irq = 0; irq < nr_legacy_irqs(); irq++) - per_cpu(vector_irq, cpu)[IRQ0_VECTOR + irq] = irq; -#endif - - __setup_vector_irq(cpu); -} - static void __init smp_intr_init(void) { #ifdef CONFIG_SMP -#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) /* * The reschedule interrupt is a CPU-to-CPU reschedule-helper * IPI, driven by wakeup. @@ -144,7 +121,6 @@ static void __init smp_intr_init(void) /* IPI used for rebooting/stopping */ alloc_intr_gate(REBOOT_VECTOR, reboot_interrupt); -#endif #endif /* CONFIG_SMP */ } @@ -159,7 +135,7 @@ static void __init apic_intr_init(void) alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); #endif -#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) +#ifdef CONFIG_X86_LOCAL_APIC /* self generated IPI for local APIC timer */ alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); @@ -197,10 +173,17 @@ void __init native_init_IRQ(void) * 'special' SMP interrupts) */ i = FIRST_EXTERNAL_VECTOR; - for_each_clear_bit_from(i, used_vectors, NR_VECTORS) { +#ifndef CONFIG_X86_LOCAL_APIC +#define first_system_vector NR_VECTORS +#endif + for_each_clear_bit_from(i, used_vectors, first_system_vector) { /* IA32_SYSCALL_VECTOR could be used in trap_init already. */ set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]); } +#ifdef CONFIG_X86_LOCAL_APIC + for_each_clear_bit_from(i, used_vectors, NR_VECTORS) + set_intr_gate(i, spurious_interrupt); +#endif if (!acpi_ioapic && !of_ioapic && nr_legacy_irqs()) setup_irq(2, &irq2); diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index f7e3cd50ece0..98f654d466e5 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -1020,6 +1020,15 @@ int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) regs->flags &= ~X86_EFLAGS_IF; trace_hardirqs_off(); regs->ip = (unsigned long)(jp->entry); + + /* + * jprobes use jprobe_return() which skips the normal return + * path of the function, and this messes up the accounting of the + * function graph tracer to get messed up. + * + * Pause function graph tracing while performing the jprobe function. 
+ */ + pause_graph_tracing(); return 1; } NOKPROBE_SYMBOL(setjmp_pre_handler); @@ -1048,24 +1057,25 @@ int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); u8 *addr = (u8 *) (regs->ip - 1); struct jprobe *jp = container_of(p, struct jprobe, kp); + void *saved_sp = kcb->jprobe_saved_sp; if ((addr > (u8 *) jprobe_return) && (addr < (u8 *) jprobe_return_end)) { - if (stack_addr(regs) != kcb->jprobe_saved_sp) { + if (stack_addr(regs) != saved_sp) { struct pt_regs *saved_regs = &kcb->jprobe_saved_regs; printk(KERN_ERR "current sp %p does not match saved sp %p\n", - stack_addr(regs), kcb->jprobe_saved_sp); + stack_addr(regs), saved_sp); printk(KERN_ERR "Saved registers for jprobe %p\n", jp); show_regs(saved_regs); printk(KERN_ERR "Current registers\n"); show_regs(regs); BUG(); } + /* It's OK to start function graph tracing again */ + unpause_graph_tracing(); *regs = kcb->jprobe_saved_regs; - memcpy((kprobe_opcode_t *)(kcb->jprobe_saved_sp), - kcb->jprobes_stack, - MIN_STACK_SIZE(kcb->jprobe_saved_sp)); + memcpy(saved_sp, kcb->jprobes_stack, MIN_STACK_SIZE(saved_sp)); preempt_enable_no_resched(); return 1; } diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c index 717b02a22e67..5f8f0b3cc674 100644 --- a/arch/x86/kernel/kprobes/ftrace.c +++ b/arch/x86/kernel/kprobes/ftrace.c @@ -27,7 +27,7 @@ static nokprobe_inline int __skip_singlestep(struct kprobe *p, struct pt_regs *regs, - struct kprobe_ctlblk *kcb) + struct kprobe_ctlblk *kcb, unsigned long orig_ip) { /* * Emulate singlestep (and also recover regs->ip) @@ -39,6 +39,8 @@ int __skip_singlestep(struct kprobe *p, struct pt_regs *regs, p->post_handler(p, regs, 0); } __this_cpu_write(current_kprobe, NULL); + if (orig_ip) + regs->ip = orig_ip; return 1; } @@ -46,7 +48,7 @@ int skip_singlestep(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb) { if (kprobe_ftrace(p)) - return __skip_singlestep(p, regs, kcb); + return __skip_singlestep(p, regs, kcb, 0); else return 0; } @@ -71,13 +73,14 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, if (kprobe_running()) { kprobes_inc_nmissed_count(p); } else { + unsigned long orig_ip = regs->ip; /* Kprobe handler expects regs->ip = ip + 1 as breakpoint hit */ regs->ip = ip + sizeof(kprobe_opcode_t); __this_cpu_write(current_kprobe, p); kcb->kprobe_status = KPROBE_HIT_ACTIVE; if (!p->pre_handler || !p->pre_handler(p, regs)) - __skip_singlestep(p, regs, kcb); + __skip_singlestep(p, regs, kcb, orig_ip); /* * If pre_handler returns !0, it sets regs->ip and * resets current kprobe. diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index f6945bef2cd1..94f643484300 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -283,7 +283,14 @@ NOKPROBE_SYMBOL(do_async_page_fault); static void __init paravirt_ops_setup(void) { pv_info.name = "KVM"; - pv_info.paravirt_enabled = 1; + + /* + * KVM isn't paravirt in the sense of paravirt_enabled. A KVM + * guest kernel works like a bare metal kernel with additional + * features, and paravirt_enabled is about features that are + * missing. 
+ */ + pv_info.paravirt_enabled = 0; if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY)) pv_cpu_ops.io_delay = kvm_io_delay; diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index d9156ceecdff..42caaef897c8 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -59,13 +59,12 @@ static void kvm_get_wallclock(struct timespec *now) native_write_msr(msr_kvm_wall_clock, low, high); - preempt_disable(); - cpu = smp_processor_id(); + cpu = get_cpu(); vcpu_time = &hv_clock[cpu].pvti; pvclock_read_wallclock(&wall_clock, vcpu_time, now); - preempt_enable(); + put_cpu(); } static int kvm_set_wallclock(const struct timespec *now) @@ -107,11 +106,10 @@ static unsigned long kvm_get_tsc_khz(void) int cpu; unsigned long tsc_khz; - preempt_disable(); - cpu = smp_processor_id(); + cpu = get_cpu(); src = &hv_clock[cpu].pvti; tsc_khz = pvclock_tsc_khz(src); - preempt_enable(); + put_cpu(); return tsc_khz; } @@ -263,7 +261,6 @@ void __init kvmclock_init(void) #endif kvm_get_preset_lpj(); clocksource_register_hz(&kvm_clock, NSEC_PER_SEC); - pv_info.paravirt_enabled = 1; pv_info.name = "KVM"; if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) @@ -284,23 +281,22 @@ int __init kvm_setup_vsyscall_timeinfo(void) size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS); - preempt_disable(); - cpu = smp_processor_id(); + cpu = get_cpu(); vcpu_time = &hv_clock[cpu].pvti; flags = pvclock_read_flags(vcpu_time); if (!(flags & PVCLOCK_TSC_STABLE_BIT)) { - preempt_enable(); + put_cpu(); return 1; } if ((ret = pvclock_init_vsyscall(hv_clock, size))) { - preempt_enable(); + put_cpu(); return ret; } - preempt_enable(); + put_cpu(); kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK; #endif diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 72e8e310258d..469b23d6acc2 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -20,6 +20,7 @@ #include <asm/tlbflush.h> #include <asm/mmu_context.h> #include <asm/apic.h> +#include <asm/io_apic.h> #include <asm/cpufeature.h> #include <asm/desc.h> #include <asm/cacheflush.h> diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 485981059a40..415480d3ea84 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -22,6 +22,7 @@ #include <asm/pgtable.h> #include <asm/tlbflush.h> #include <asm/mmu_context.h> +#include <asm/io_apic.h> #include <asm/debugreg.h> #include <asm/kexec-bzimage64.h> diff --git a/arch/x86/kernel/mcount_64.S b/arch/x86/kernel/mcount_64.S index c73aecf10d34..94ea120fa21f 100644 --- a/arch/x86/kernel/mcount_64.S +++ b/arch/x86/kernel/mcount_64.S @@ -21,40 +21,159 @@ # define function_hook mcount #endif +/* All cases save the original rbp (8 bytes) */ +#ifdef CONFIG_FRAME_POINTER +# ifdef CC_USING_FENTRY +/* Save parent and function stack frames (rip and rbp) */ +# define MCOUNT_FRAME_SIZE (8+16*2) +# else +/* Save just function stack frame (rip and rbp) */ +# define MCOUNT_FRAME_SIZE (8+16) +# endif +#else +/* No need to save a stack frame */ +# define MCOUNT_FRAME_SIZE 8 +#endif /* CONFIG_FRAME_POINTER */ + +/* Size of stack used to save mcount regs in save_mcount_regs */ +#define MCOUNT_REG_SIZE (SS+8 + MCOUNT_FRAME_SIZE) + +/* + * gcc -pg option adds a call to 'mcount' in most functions. + * When -mfentry is used, the call is to 'fentry' and not 'mcount' + * and is done before the function's stack frame is set up. 
+ * They both require a set of regs to be saved before calling + * any C code and restored before returning back to the function. + * + * On boot up, all these calls are converted into nops. When tracing + * is enabled, the call can jump to either ftrace_caller or + * ftrace_regs_caller. Callbacks (tracing functions) that require + * ftrace_regs_caller (like kprobes) need to have pt_regs passed to + * it. For this reason, the size of the pt_regs structure will be + * allocated on the stack and the required mcount registers will + * be saved in the locations that pt_regs has them in. + */ + +/* + * @added: the amount of stack added before calling this + * + * After this is called, the following registers contain: + * + * %rdi - holds the address that called the trampoline + * %rsi - holds the parent function (traced function's return address) + * %rdx - holds the original %rbp + */ +.macro save_mcount_regs added=0 + + /* Always save the original rbp */ + pushq %rbp + +#ifdef CONFIG_FRAME_POINTER + /* + * Stack traces will stop at the ftrace trampoline if the frame pointer + * is not set up properly. If fentry is used, we need to save a frame + * pointer for the parent as well as the function traced, because the + * fentry is called before the stack frame is set up, where as mcount + * is called afterward. + */ +#ifdef CC_USING_FENTRY + /* Save the parent pointer (skip orig rbp and our return address) */ + pushq \added+8*2(%rsp) + pushq %rbp + movq %rsp, %rbp + /* Save the return address (now skip orig rbp, rbp and parent) */ + pushq \added+8*3(%rsp) +#else + /* Can't assume that rip is before this (unless added was zero) */ + pushq \added+8(%rsp) +#endif + pushq %rbp + movq %rsp, %rbp +#endif /* CONFIG_FRAME_POINTER */ + + /* + * We add enough stack to save all regs. + */ + subq $(MCOUNT_REG_SIZE - MCOUNT_FRAME_SIZE), %rsp + movq %rax, RAX(%rsp) + movq %rcx, RCX(%rsp) + movq %rdx, RDX(%rsp) + movq %rsi, RSI(%rsp) + movq %rdi, RDI(%rsp) + movq %r8, R8(%rsp) + movq %r9, R9(%rsp) + /* + * Save the original RBP. Even though the mcount ABI does not + * require this, it helps out callers. + */ + movq MCOUNT_REG_SIZE-8(%rsp), %rdx + movq %rdx, RBP(%rsp) + + /* Copy the parent address into %rsi (second parameter) */ +#ifdef CC_USING_FENTRY + movq MCOUNT_REG_SIZE+8+\added(%rsp), %rsi +#else + /* %rdx contains original %rbp */ + movq 8(%rdx), %rsi +#endif + + /* Move RIP to its proper location */ + movq MCOUNT_REG_SIZE+\added(%rsp), %rdi + movq %rdi, RIP(%rsp) + + /* + * Now %rdi (the first parameter) has the return address of + * where ftrace_call returns. But the callbacks expect the + * address of the call itself. 
+ */ + subq $MCOUNT_INSN_SIZE, %rdi + .endm + +.macro restore_mcount_regs + movq R9(%rsp), %r9 + movq R8(%rsp), %r8 + movq RDI(%rsp), %rdi + movq RSI(%rsp), %rsi + movq RDX(%rsp), %rdx + movq RCX(%rsp), %rcx + movq RAX(%rsp), %rax + + /* ftrace_regs_caller can modify %rbp */ + movq RBP(%rsp), %rbp + + addq $MCOUNT_REG_SIZE, %rsp + + .endm + #ifdef CONFIG_DYNAMIC_FTRACE ENTRY(function_hook) retq END(function_hook) -/* skip is set if stack has been adjusted */ -.macro ftrace_caller_setup skip=0 - MCOUNT_SAVE_FRAME \skip +ENTRY(ftrace_caller) + /* save_mcount_regs fills in first two parameters */ + save_mcount_regs +GLOBAL(ftrace_caller_op_ptr) /* Load the ftrace_ops into the 3rd parameter */ movq function_trace_op(%rip), %rdx - /* Load ip into the first parameter */ - movq RIP(%rsp), %rdi - subq $MCOUNT_INSN_SIZE, %rdi - /* Load the parent_ip into the second parameter */ -#ifdef CC_USING_FENTRY - movq SS+16(%rsp), %rsi -#else - movq 8(%rbp), %rsi -#endif -.endm - -ENTRY(ftrace_caller) - ftrace_caller_setup /* regs go into 4th parameter (but make it NULL) */ movq $0, %rcx GLOBAL(ftrace_call) call ftrace_stub - MCOUNT_RESTORE_FRAME -ftrace_return: + restore_mcount_regs + + /* + * The copied trampoline must call ftrace_return as it + * still may need to call the function graph tracer. + */ +GLOBAL(ftrace_caller_end) + +GLOBAL(ftrace_return) #ifdef CONFIG_FUNCTION_GRAPH_TRACER GLOBAL(ftrace_graph_call) @@ -66,11 +185,16 @@ GLOBAL(ftrace_stub) END(ftrace_caller) ENTRY(ftrace_regs_caller) - /* Save the current flags before compare (in SS location)*/ + /* Save the current flags before any operations that can change them */ pushfq - /* skip=8 to skip flags saved in SS */ - ftrace_caller_setup 8 + /* added 8 bytes to save flags */ + save_mcount_regs 8 + /* save_mcount_regs fills in first two parameters */ + +GLOBAL(ftrace_regs_caller_op_ptr) + /* Load the ftrace_ops into the 3rd parameter */ + movq function_trace_op(%rip), %rdx /* Save the rest of pt_regs */ movq %r15, R15(%rsp) @@ -79,18 +203,17 @@ ENTRY(ftrace_regs_caller) movq %r12, R12(%rsp) movq %r11, R11(%rsp) movq %r10, R10(%rsp) - movq %rbp, RBP(%rsp) movq %rbx, RBX(%rsp) /* Copy saved flags */ - movq SS(%rsp), %rcx + movq MCOUNT_REG_SIZE(%rsp), %rcx movq %rcx, EFLAGS(%rsp) /* Kernel segments */ movq $__KERNEL_DS, %rcx movq %rcx, SS(%rsp) movq $__KERNEL_CS, %rcx movq %rcx, CS(%rsp) - /* Stack - skipping return address */ - leaq SS+16(%rsp), %rcx + /* Stack - skipping return address and flags */ + leaq MCOUNT_REG_SIZE+8*2(%rsp), %rcx movq %rcx, RSP(%rsp) /* regs go into 4th parameter */ @@ -101,11 +224,11 @@ GLOBAL(ftrace_regs_call) /* Copy flags back to SS, to restore them */ movq EFLAGS(%rsp), %rax - movq %rax, SS(%rsp) + movq %rax, MCOUNT_REG_SIZE(%rsp) /* Handlers can change the RIP */ movq RIP(%rsp), %rax - movq %rax, SS+8(%rsp) + movq %rax, MCOUNT_REG_SIZE+8(%rsp) /* restore the rest of pt_regs */ movq R15(%rsp), %r15 @@ -113,19 +236,22 @@ GLOBAL(ftrace_regs_call) movq R13(%rsp), %r13 movq R12(%rsp), %r12 movq R10(%rsp), %r10 - movq RBP(%rsp), %rbp movq RBX(%rsp), %rbx - /* skip=8 to skip flags saved in SS */ - MCOUNT_RESTORE_FRAME 8 + restore_mcount_regs /* Restore flags */ popfq - jmp ftrace_return + /* + * As this jmp to ftrace_return can be a short jump + * it must not be copied into the trampoline. + * The trampoline will add the code to jump + * to the return. 
+ */ +GLOBAL(ftrace_regs_caller_end) - popfq - jmp ftrace_stub + jmp ftrace_return END(ftrace_regs_caller) @@ -136,6 +262,7 @@ ENTRY(function_hook) cmpq $ftrace_stub, ftrace_trace_function jnz trace +fgraph_trace: #ifdef CONFIG_FUNCTION_GRAPH_TRACER cmpq $ftrace_stub, ftrace_graph_return jnz ftrace_graph_caller @@ -148,42 +275,35 @@ GLOBAL(ftrace_stub) retq trace: - MCOUNT_SAVE_FRAME - - movq RIP(%rsp), %rdi -#ifdef CC_USING_FENTRY - movq SS+16(%rsp), %rsi -#else - movq 8(%rbp), %rsi -#endif - subq $MCOUNT_INSN_SIZE, %rdi + /* save_mcount_regs fills in first two parameters */ + save_mcount_regs call *ftrace_trace_function - MCOUNT_RESTORE_FRAME + restore_mcount_regs - jmp ftrace_stub + jmp fgraph_trace END(function_hook) #endif /* CONFIG_DYNAMIC_FTRACE */ #endif /* CONFIG_FUNCTION_TRACER */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER ENTRY(ftrace_graph_caller) - MCOUNT_SAVE_FRAME + /* Saves rbp into %rdx and fills first parameter */ + save_mcount_regs #ifdef CC_USING_FENTRY - leaq SS+16(%rsp), %rdi + leaq MCOUNT_REG_SIZE+8(%rsp), %rsi movq $0, %rdx /* No framepointers needed */ #else - leaq 8(%rbp), %rdi - movq (%rbp), %rdx + /* Save address of the return address of traced function */ + leaq 8(%rdx), %rsi + /* ftrace does sanity checks against frame pointers */ + movq (%rdx), %rdx #endif - movq RIP(%rsp), %rsi - subq $MCOUNT_INSN_SIZE, %rsi - call prepare_ftrace_return - MCOUNT_RESTORE_FRAME + restore_mcount_regs retq END(ftrace_graph_caller) diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c index e309cc5c276e..781861cc5ee8 100644 --- a/arch/x86/kernel/perf_regs.c +++ b/arch/x86/kernel/perf_regs.c @@ -78,6 +78,14 @@ u64 perf_reg_abi(struct task_struct *task) { return PERF_SAMPLE_REGS_ABI_32; } + +void perf_get_regs_user(struct perf_regs *regs_user, + struct pt_regs *regs, + struct pt_regs *regs_user_copy) +{ + regs_user->regs = task_pt_regs(current); + regs_user->abi = perf_reg_abi(current); +} #else /* CONFIG_X86_64 */ #define REG_NOSUPPORT ((1ULL << PERF_REG_X86_DS) | \ (1ULL << PERF_REG_X86_ES) | \ @@ -102,4 +110,86 @@ u64 perf_reg_abi(struct task_struct *task) else return PERF_SAMPLE_REGS_ABI_64; } + +void perf_get_regs_user(struct perf_regs *regs_user, + struct pt_regs *regs, + struct pt_regs *regs_user_copy) +{ + struct pt_regs *user_regs = task_pt_regs(current); + + /* + * If we're in an NMI that interrupted task_pt_regs setup, then + * we can't sample user regs at all. This check isn't really + * sufficient, though, as we could be in an NMI inside an interrupt + * that happened during task_pt_regs setup. + */ + if (regs->sp > (unsigned long)&user_regs->r11 && + regs->sp <= (unsigned long)(user_regs + 1)) { + regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE; + regs_user->regs = NULL; + return; + } + + /* + * RIP, flags, and the argument registers are usually saved. + * orig_ax is probably okay, too. + */ + regs_user_copy->ip = user_regs->ip; + regs_user_copy->cx = user_regs->cx; + regs_user_copy->dx = user_regs->dx; + regs_user_copy->si = user_regs->si; + regs_user_copy->di = user_regs->di; + regs_user_copy->r8 = user_regs->r8; + regs_user_copy->r9 = user_regs->r9; + regs_user_copy->r10 = user_regs->r10; + regs_user_copy->r11 = user_regs->r11; + regs_user_copy->orig_ax = user_regs->orig_ax; + regs_user_copy->flags = user_regs->flags; + + /* + * Don't even try to report the "rest" regs. 
+ */ + regs_user_copy->bx = -1; + regs_user_copy->bp = -1; + regs_user_copy->r12 = -1; + regs_user_copy->r13 = -1; + regs_user_copy->r14 = -1; + regs_user_copy->r15 = -1; + + /* + * For this to be at all useful, we need a reasonable guess for + * sp and the ABI. Be careful: we're in NMI context, and we're + * considering current to be the current task, so we should + * be careful not to look at any other percpu variables that might + * change during context switches. + */ + if (IS_ENABLED(CONFIG_IA32_EMULATION) && + task_thread_info(current)->status & TS_COMPAT) { + /* Easy case: we're in a compat syscall. */ + regs_user->abi = PERF_SAMPLE_REGS_ABI_32; + regs_user_copy->sp = user_regs->sp; + regs_user_copy->cs = user_regs->cs; + regs_user_copy->ss = user_regs->ss; + } else if (user_regs->orig_ax != -1) { + /* + * We're probably in a 64-bit syscall. + * Warning: this code is severely racy. At least it's better + * than just blindly copying user_regs. + */ + regs_user->abi = PERF_SAMPLE_REGS_ABI_64; + regs_user_copy->sp = this_cpu_read(old_rsp); + regs_user_copy->cs = __USER_CS; + regs_user_copy->ss = __USER_DS; + regs_user_copy->cx = -1; /* usually contains garbage */ + } else { + /* We're probably in an interrupt or exception. */ + regs_user->abi = user_64bit_mode(user_regs) ? + PERF_SAMPLE_REGS_ABI_64 : PERF_SAMPLE_REGS_ABI_32; + regs_user_copy->sp = user_regs->sp; + regs_user_copy->cs = user_regs->cs; + regs_user_copy->ss = user_regs->ss; + } + + regs_user->regs = regs_user_copy; +} #endif /* CONFIG_X86_32 */ diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 3ed4a68d4013..5a2c02913af3 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -283,24 +283,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) fpu = switch_fpu_prepare(prev_p, next_p, cpu); - /* - * Reload esp0, LDT and the page table pointer: - */ + /* Reload esp0 and ss1. */ load_sp0(tss, next); - /* - * Switch DS and ES. - * This won't pick up thread selector changes, but I guess that is ok. - */ - savesegment(es, prev->es); - if (unlikely(next->es | prev->es)) - loadsegment(es, next->es); - - savesegment(ds, prev->ds); - if (unlikely(next->ds | prev->ds)) - loadsegment(ds, next->ds); - - /* We must save %fs and %gs before load_TLS() because * %fs and %gs may be cleared by load_TLS(). * @@ -309,41 +294,101 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) savesegment(fs, fsindex); savesegment(gs, gsindex); + /* + * Load TLS before restoring any segments so that segment loads + * reference the correct GDT entries. + */ load_TLS(next, cpu); /* - * Leave lazy mode, flushing any hypercalls made here. - * This must be done before restoring TLS segments so - * the GDT and LDT are properly updated, and must be - * done before math_state_restore, so the TS bit is up - * to date. + * Leave lazy mode, flushing any hypercalls made here. This + * must be done after loading TLS entries in the GDT but before + * loading segments that might reference them, and it must + * be done before math_state_restore, so the TS bit is up to + * date. */ arch_end_context_switch(next_p); + /* Switch DS and ES. + * + * Reading them only returns the selectors, but writing them (if + * nonzero) loads the full descriptor from the GDT or LDT. The + * LDT for next is loaded in switch_mm, and the GDT is loaded + * above.
+ * + * We therefore need to write new values to the segment + * registers on every context switch unless both the new and old + * values are zero. + * + * Note that we don't need to do anything for CS and SS, as + * those are saved and restored as part of pt_regs. + */ + savesegment(es, prev->es); + if (unlikely(next->es | prev->es)) + loadsegment(es, next->es); + + savesegment(ds, prev->ds); + if (unlikely(next->ds | prev->ds)) + loadsegment(ds, next->ds); + /* * Switch FS and GS. * - * Segment register != 0 always requires a reload. Also - * reload when it has changed. When prev process used 64bit - * base always reload to avoid an information leak. + * These are even more complicated than DS and ES: they have + * 64-bit bases that are controlled by arch_prctl. Those bases + * only differ from the values in the GDT or LDT if the selector + * is 0. + * + * Loading the segment register resets the hidden base part of + * the register to 0 or the value from the GDT / LDT. If the + * next base address is zero, writing 0 to the segment register is + * much faster than using wrmsr to explicitly zero the base. + * + * The thread_struct.fs and thread_struct.gs values are 0 + * if the fs and gs bases respectively are not overridden + * from the values implied by fsindex and gsindex. They + * are nonzero, and store the nonzero base addresses, if + * the bases are overridden. + * + * (fs != 0 && fsindex != 0) || (gs != 0 && gsindex != 0) should + * be impossible. + * + * Therefore we need to reload the segment registers if either + * the old or new selector is nonzero, and we need to override + * the base address if the next thread expects it to be overridden. + * + * This code is unnecessarily slow in the case where the old and + * new indexes are zero and the new base is nonzero -- it will + * unnecessarily write 0 to the selector before writing the new + * base address. + * + * Note: This all depends on arch_prctl being the only way that + * user code can override the segment base. Once wrfsbase and + * wrgsbase are enabled, most of this code will need to change. */ if (unlikely(fsindex | next->fsindex | prev->fs)) { loadsegment(fs, next->fsindex); + /* - * Check if the user used a selector != 0; if yes - * clear 64bit base, since overloaded base is always - * mapped to the Null selector + * If user code wrote a nonzero value to FS, then it also + * cleared the overridden base address. + * + * XXX: if user code wrote 0 to FS and cleared the base + * address itself, we won't notice and we'll incorrectly + * restore the prior base address next time we reschedule + * the process. */ if (fsindex) prev->fs = 0; } - /* when next process has a 64bit base use it */ if (next->fs) wrmsrl(MSR_FS_BASE, next->fs); prev->fsindex = fsindex; if (unlikely(gsindex | next->gsindex | prev->gs)) { load_gs_index(next->gsindex); + + /* This works (and fails) the same way as fsindex above.
*/ if (gsindex) prev->gs = 0; } diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 17962e667a91..bae6c609888e 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -12,6 +12,7 @@ #include <acpi/reboot.h> #include <asm/io.h> #include <asm/apic.h> +#include <asm/io_apic.h> #include <asm/desc.h> #include <asm/hpet.h> #include <asm/pgtable.h> diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 7a8f5845e8eb..6d7022c683e3 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1084,7 +1084,6 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) { unsigned int i; - preempt_disable(); smp_cpu_index_default(); /* @@ -1102,22 +1101,19 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) } set_cpu_sibling_map(0); - if (smp_sanity_check(max_cpus) < 0) { pr_info("SMP disabled\n"); disable_smp(); - goto out; + return; } default_setup_apic_routing(); - preempt_disable(); if (read_apic_id() != boot_cpu_physical_apicid) { panic("Boot APIC ID in local APIC unexpected (%d vs %d)", read_apic_id(), boot_cpu_physical_apicid); /* Or can we switch back to PIC here? */ } - preempt_enable(); connect_bsp_APIC(); @@ -1151,8 +1147,6 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) uv_system_init(); set_mtrr_aps_delayed_init(); -out: - preempt_enable(); } void arch_enable_nonboot_cpus_begin(void) diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c index f7fec09e3e3a..4e942f31b1a7 100644 --- a/arch/x86/kernel/tls.c +++ b/arch/x86/kernel/tls.c @@ -27,6 +27,37 @@ static int get_free_idx(void) return -ESRCH; } +static bool tls_desc_okay(const struct user_desc *info) +{ + if (LDT_empty(info)) + return true; + + /* + * espfix is required for 16-bit data segments, but espfix + * only works for LDT segments. + */ + if (!info->seg_32bit) + return false; + + /* Only allow data segments in the TLS array. */ + if (info->contents > 1) + return false; + + /* + * Non-present segments with DPL 3 present an interesting attack + * surface. The kernel should handle such segments correctly, + * but TLS is very difficult to protect in a sandbox, so prevent + * such segments from being created. + * + * If userspace needs to remove a TLS entry, it can still delete + * it outright. 
+ */ + if (info->seg_not_present) + return false; + + return true; +} + static void set_tls_desc(struct task_struct *p, int idx, const struct user_desc *info, int n) { @@ -66,6 +97,9 @@ int do_set_thread_area(struct task_struct *p, int idx, if (copy_from_user(&info, u_info, sizeof(info))) return -EFAULT; + if (!tls_desc_okay(&info)) + return -EINVAL; + if (idx == -1) idx = info.entry_number; @@ -192,6 +226,7 @@ int regset_tls_set(struct task_struct *target, const struct user_regset *regset, { struct user_desc infobuf[GDT_ENTRY_TLS_ENTRIES]; const struct user_desc *info; + int i; if (pos >= GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) || (pos % sizeof(struct user_desc)) != 0 || @@ -205,6 +240,10 @@ int regset_tls_set(struct task_struct *target, const struct user_regset *regset, else info = infobuf; + for (i = 0; i < count / sizeof(struct user_desc); i++) + if (!tls_desc_okay(info + i)) + return -EINVAL; + set_tls_desc(target, GDT_ENTRY_TLS_MIN + (pos / sizeof(struct user_desc)), info, count / sizeof(struct user_desc)); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index a9ae20579895..88900e288021 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -331,7 +331,7 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) break; /* Success, it was handled */ case 1: /* Bound violation. */ info = mpx_generate_siginfo(regs, xsave_buf); - if (PTR_ERR(info)) { + if (IS_ERR(info)) { /* * We failed to decode the MPX instruction. Act as if * the exception was not caused by MPX. diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 4c540c4719d8..0de1fae2bdf0 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -738,3 +738,4 @@ void *get_xsave_addr(struct xsave_struct *xsave, int xstate) return (void *)xsave + xstate_comp_offsets[feature]; } +EXPORT_SYMBOL_GPL(get_xsave_addr); |
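The EXPORT_SYMBOL_GPL(get_xsave_addr) added above makes the helper reachable from modular code (KVM being the obvious consumer). A minimal sketch of such a caller follows, assuming the 3.19-era declaration void *get_xsave_addr(struct xsave_struct *xsave, int xstate) and the XSTATE_BNDCSR bit from <asm/xsave.h>; the helper name read_bndcfgu() is hypothetical and not part of the patch:

#include <linux/types.h>
#include <asm/xsave.h>

/*
 * Hypothetical helper: read the MPX BNDCFGU word out of a saved xsave area.
 * get_xsave_addr() returns NULL when the corresponding bit is clear in the
 * area's xstate_bv, i.e. that state was never saved for the task.
 */
static u64 read_bndcfgu(struct xsave_struct *xsave)
{
	void *bndcsr = get_xsave_addr(xsave, XSTATE_BNDCSR);

	return bndcsr ? *(u64 *)bndcsr : 0;
}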
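The tls_desc_okay() filter added to tls.c above is visible from user space: a TLS descriptor that is 16-bit, not a data segment, or marked not-present is now refused by set_thread_area() with -EINVAL instead of being installed. A rough user-space probe illustrating this, written against the standard struct user_desc ABI (an illustration only, not part of the patch):

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <asm/ldt.h>			/* struct user_desc */

int main(void)
{
	struct user_desc desc;

	memset(&desc, 0, sizeof(desc));
	desc.entry_number = -1;		/* ask the kernel for a free TLS slot */
	desc.limit = 0xffff;
	desc.seg_32bit = 0;		/* 16-bit segment ... */
	desc.seg_not_present = 1;	/* ... and not present: both now rejected */

	if (syscall(SYS_set_thread_area, &desc) == -1)
		printf("rejected as expected: %s\n", strerror(errno));
	else
		printf("installed at GDT entry %u\n", desc.entry_number);
	return 0;
}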