diff options
Diffstat (limited to 'arch/powerpc/platforms/pseries')
-rw-r--r-- | arch/powerpc/platforms/pseries/Kconfig | 3 | ||||
-rw-r--r-- | arch/powerpc/platforms/pseries/Makefile | 4 | ||||
-rw-r--r-- | arch/powerpc/platforms/pseries/cmm.c | 29 | ||||
-rw-r--r-- | arch/powerpc/platforms/pseries/dlpar.c | 558 | ||||
-rw-r--r-- | arch/powerpc/platforms/pseries/eeh_driver.c | 18 | ||||
-rw-r--r-- | arch/powerpc/platforms/pseries/hotplug-cpu.c | 182 | ||||
-rw-r--r-- | arch/powerpc/platforms/pseries/hvCall.S | 132 | ||||
-rw-r--r-- | arch/powerpc/platforms/pseries/hvCall_inst.c | 38 | ||||
-rw-r--r-- | arch/powerpc/platforms/pseries/lpar.c | 33 | ||||
-rw-r--r-- | arch/powerpc/platforms/pseries/offline_states.h | 18 | ||||
-rw-r--r-- | arch/powerpc/platforms/pseries/plpar_wrappers.h | 22 | ||||
-rw-r--r-- | arch/powerpc/platforms/pseries/reconfig.c | 8 | ||||
-rw-r--r-- | arch/powerpc/platforms/pseries/rtasd.c | 519 | ||||
-rw-r--r-- | arch/powerpc/platforms/pseries/scanlog.c | 4 | ||||
-rw-r--r-- | arch/powerpc/platforms/pseries/smp.c | 19 | ||||
-rw-r--r-- | arch/powerpc/platforms/pseries/xics.c | 70 |
16 files changed, 1028 insertions, 629 deletions
diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig index f0e6f28427bd..27554c807fd5 100644 --- a/arch/powerpc/platforms/pseries/Kconfig +++ b/arch/powerpc/platforms/pseries/Kconfig @@ -4,6 +4,7 @@ config PPC_PSERIES select MPIC select PPC_I8259 select PPC_RTAS + select PPC_RTAS_DAEMON select RTAS_ERROR_LOGGING select PPC_UDBG_16550 select PPC_NATIVE @@ -59,7 +60,7 @@ config PPC_SMLPAR config CMM tristate "Collaborative memory management" - depends on PPC_SMLPAR && !CRASH_DUMP + depends on PPC_SMLPAR default y help Select this option, if you want to enable the kernel interface diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile index 790c0b872d4f..0ff5174ae4f5 100644 --- a/arch/powerpc/platforms/pseries/Makefile +++ b/arch/powerpc/platforms/pseries/Makefile @@ -7,8 +7,8 @@ EXTRA_CFLAGS += -DDEBUG endif obj-y := lpar.o hvCall.o nvram.o reconfig.o \ - setup.o iommu.o ras.o rtasd.o \ - firmware.o power.o + setup.o iommu.o ras.o \ + firmware.o power.o dlpar.o obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_XICS) += xics.o obj-$(CONFIG_SCANLOG) += scanlog.o diff --git a/arch/powerpc/platforms/pseries/cmm.c b/arch/powerpc/platforms/pseries/cmm.c index 6567439fe78d..bcdcf0ccc8d7 100644 --- a/arch/powerpc/platforms/pseries/cmm.c +++ b/arch/powerpc/platforms/pseries/cmm.c @@ -229,8 +229,9 @@ static void cmm_get_mpp(void) { int rc; struct hvcall_mpp_data mpp_data; - unsigned long active_pages_target; - signed long page_loan_request; + signed long active_pages_target, page_loan_request, target; + signed long total_pages = totalram_pages + loaned_pages; + signed long min_mem_pages = (min_mem_mb * 1024 * 1024) / PAGE_SIZE; rc = h_get_mpp(&mpp_data); @@ -238,17 +239,25 @@ static void cmm_get_mpp(void) return; page_loan_request = div_s64((s64)mpp_data.loan_request, PAGE_SIZE); - loaned_pages_target = page_loan_request + loaned_pages; - if (loaned_pages_target > oom_freed_pages) - loaned_pages_target -= oom_freed_pages; + target = page_loan_request + (signed long)loaned_pages; + + if (target < 0 || total_pages < min_mem_pages) + target = 0; + + if (target > oom_freed_pages) + target -= oom_freed_pages; else - loaned_pages_target = 0; + target = 0; + + active_pages_target = total_pages - target; + + if (min_mem_pages > active_pages_target) + target = total_pages - min_mem_pages; - active_pages_target = totalram_pages + loaned_pages - loaned_pages_target; + if (target < 0) + target = 0; - if ((min_mem_mb * 1024 * 1024) > (active_pages_target * PAGE_SIZE)) - loaned_pages_target = totalram_pages + loaned_pages - - ((min_mem_mb * 1024 * 1024) / PAGE_SIZE); + loaned_pages_target = target; cmm_dbg("delta = %ld, loaned = %lu, target = %lu, oom = %lu, totalram = %lu\n", page_loan_request, loaned_pages, loaned_pages_target, diff --git a/arch/powerpc/platforms/pseries/dlpar.c b/arch/powerpc/platforms/pseries/dlpar.c new file mode 100644 index 000000000000..12df9e8812a9 --- /dev/null +++ b/arch/powerpc/platforms/pseries/dlpar.c @@ -0,0 +1,558 @@ +/* + * Support for dynamic reconfiguration for PCI, Memory, and CPU + * Hotplug and Dynamic Logical Partitioning on RPA platforms. + * + * Copyright (C) 2009 Nathan Fontenot + * Copyright (C) 2009 IBM Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + */ + +#include <linux/kernel.h> +#include <linux/kref.h> +#include <linux/notifier.h> +#include <linux/proc_fs.h> +#include <linux/spinlock.h> +#include <linux/cpu.h> +#include "offline_states.h" + +#include <asm/prom.h> +#include <asm/machdep.h> +#include <asm/uaccess.h> +#include <asm/rtas.h> +#include <asm/pSeries_reconfig.h> + +struct cc_workarea { + u32 drc_index; + u32 zero; + u32 name_offset; + u32 prop_length; + u32 prop_offset; +}; + +static void dlpar_free_cc_property(struct property *prop) +{ + kfree(prop->name); + kfree(prop->value); + kfree(prop); +} + +static struct property *dlpar_parse_cc_property(struct cc_workarea *ccwa) +{ + struct property *prop; + char *name; + char *value; + + prop = kzalloc(sizeof(*prop), GFP_KERNEL); + if (!prop) + return NULL; + + name = (char *)ccwa + ccwa->name_offset; + prop->name = kstrdup(name, GFP_KERNEL); + + prop->length = ccwa->prop_length; + value = (char *)ccwa + ccwa->prop_offset; + prop->value = kzalloc(prop->length, GFP_KERNEL); + if (!prop->value) { + dlpar_free_cc_property(prop); + return NULL; + } + + memcpy(prop->value, value, prop->length); + return prop; +} + +static struct device_node *dlpar_parse_cc_node(struct cc_workarea *ccwa) +{ + struct device_node *dn; + char *name; + + dn = kzalloc(sizeof(*dn), GFP_KERNEL); + if (!dn) + return NULL; + + /* The configure connector reported name does not contain a + * preceeding '/', so we allocate a buffer large enough to + * prepend this to the full_name. + */ + name = (char *)ccwa + ccwa->name_offset; + dn->full_name = kmalloc(strlen(name) + 2, GFP_KERNEL); + if (!dn->full_name) { + kfree(dn); + return NULL; + } + + sprintf(dn->full_name, "/%s", name); + return dn; +} + +static void dlpar_free_one_cc_node(struct device_node *dn) +{ + struct property *prop; + + while (dn->properties) { + prop = dn->properties; + dn->properties = prop->next; + dlpar_free_cc_property(prop); + } + + kfree(dn->full_name); + kfree(dn); +} + +static void dlpar_free_cc_nodes(struct device_node *dn) +{ + if (dn->child) + dlpar_free_cc_nodes(dn->child); + + if (dn->sibling) + dlpar_free_cc_nodes(dn->sibling); + + dlpar_free_one_cc_node(dn); +} + +#define NEXT_SIBLING 1 +#define NEXT_CHILD 2 +#define NEXT_PROPERTY 3 +#define PREV_PARENT 4 +#define MORE_MEMORY 5 +#define CALL_AGAIN -2 +#define ERR_CFG_USE -9003 + +struct device_node *dlpar_configure_connector(u32 drc_index) +{ + struct device_node *dn; + struct device_node *first_dn = NULL; + struct device_node *last_dn = NULL; + struct property *property; + struct property *last_property = NULL; + struct cc_workarea *ccwa; + int cc_token; + int rc; + + cc_token = rtas_token("ibm,configure-connector"); + if (cc_token == RTAS_UNKNOWN_SERVICE) + return NULL; + + spin_lock(&rtas_data_buf_lock); + ccwa = (struct cc_workarea *)&rtas_data_buf[0]; + ccwa->drc_index = drc_index; + ccwa->zero = 0; + + rc = rtas_call(cc_token, 2, 1, NULL, rtas_data_buf, NULL); + while (rc) { + switch (rc) { + case NEXT_SIBLING: + dn = dlpar_parse_cc_node(ccwa); + if (!dn) + goto cc_error; + + dn->parent = last_dn->parent; + last_dn->sibling = dn; + last_dn = dn; + break; + + case NEXT_CHILD: + dn = dlpar_parse_cc_node(ccwa); + if (!dn) + goto cc_error; + + if (!first_dn) + first_dn = dn; + else { + dn->parent = last_dn; + if (last_dn) + last_dn->child = dn; + } + + last_dn = dn; + break; + + case NEXT_PROPERTY: + property = dlpar_parse_cc_property(ccwa); + if (!property) + goto cc_error; + + if (!last_dn->properties) + last_dn->properties = property; + else + last_property->next = property; + + last_property = property; + break; + + case PREV_PARENT: + last_dn = last_dn->parent; + break; + + case CALL_AGAIN: + break; + + case MORE_MEMORY: + case ERR_CFG_USE: + default: + printk(KERN_ERR "Unexpected Error (%d) " + "returned from configure-connector\n", rc); + goto cc_error; + } + + rc = rtas_call(cc_token, 2, 1, NULL, rtas_data_buf, NULL); + } + + spin_unlock(&rtas_data_buf_lock); + return first_dn; + +cc_error: + if (first_dn) + dlpar_free_cc_nodes(first_dn); + spin_unlock(&rtas_data_buf_lock); + return NULL; +} + +static struct device_node *derive_parent(const char *path) +{ + struct device_node *parent; + char *last_slash; + + last_slash = strrchr(path, '/'); + if (last_slash == path) { + parent = of_find_node_by_path("/"); + } else { + char *parent_path; + int parent_path_len = last_slash - path + 1; + parent_path = kmalloc(parent_path_len, GFP_KERNEL); + if (!parent_path) + return NULL; + + strlcpy(parent_path, path, parent_path_len); + parent = of_find_node_by_path(parent_path); + kfree(parent_path); + } + + return parent; +} + +int dlpar_attach_node(struct device_node *dn) +{ + struct proc_dir_entry *ent; + int rc; + + of_node_set_flag(dn, OF_DYNAMIC); + kref_init(&dn->kref); + dn->parent = derive_parent(dn->full_name); + if (!dn->parent) + return -ENOMEM; + + rc = blocking_notifier_call_chain(&pSeries_reconfig_chain, + PSERIES_RECONFIG_ADD, dn); + if (rc == NOTIFY_BAD) { + printk(KERN_ERR "Failed to add device node %s\n", + dn->full_name); + return -ENOMEM; /* For now, safe to assume kmalloc failure */ + } + + of_attach_node(dn); + +#ifdef CONFIG_PROC_DEVICETREE + ent = proc_mkdir(strrchr(dn->full_name, '/') + 1, dn->parent->pde); + if (ent) + proc_device_tree_add_node(dn, ent); +#endif + + of_node_put(dn->parent); + return 0; +} + +int dlpar_detach_node(struct device_node *dn) +{ + struct device_node *parent = dn->parent; + struct property *prop = dn->properties; + +#ifdef CONFIG_PROC_DEVICETREE + while (prop) { + remove_proc_entry(prop->name, dn->pde); + prop = prop->next; + } + + if (dn->pde) + remove_proc_entry(dn->pde->name, parent->pde); +#endif + + blocking_notifier_call_chain(&pSeries_reconfig_chain, + PSERIES_RECONFIG_REMOVE, dn); + of_detach_node(dn); + of_node_put(dn); /* Must decrement the refcount */ + + return 0; +} + +#define DR_ENTITY_SENSE 9003 +#define DR_ENTITY_PRESENT 1 +#define DR_ENTITY_UNUSABLE 2 +#define ALLOCATION_STATE 9003 +#define ALLOC_UNUSABLE 0 +#define ALLOC_USABLE 1 +#define ISOLATION_STATE 9001 +#define ISOLATE 0 +#define UNISOLATE 1 + +int dlpar_acquire_drc(u32 drc_index) +{ + int dr_status, rc; + + rc = rtas_call(rtas_token("get-sensor-state"), 2, 2, &dr_status, + DR_ENTITY_SENSE, drc_index); + if (rc || dr_status != DR_ENTITY_UNUSABLE) + return -1; + + rc = rtas_set_indicator(ALLOCATION_STATE, drc_index, ALLOC_USABLE); + if (rc) + return rc; + + rc = rtas_set_indicator(ISOLATION_STATE, drc_index, UNISOLATE); + if (rc) { + rtas_set_indicator(ALLOCATION_STATE, drc_index, ALLOC_UNUSABLE); + return rc; + } + + return 0; +} + +int dlpar_release_drc(u32 drc_index) +{ + int dr_status, rc; + + rc = rtas_call(rtas_token("get-sensor-state"), 2, 2, &dr_status, + DR_ENTITY_SENSE, drc_index); + if (rc || dr_status != DR_ENTITY_PRESENT) + return -1; + + rc = rtas_set_indicator(ISOLATION_STATE, drc_index, ISOLATE); + if (rc) + return rc; + + rc = rtas_set_indicator(ALLOCATION_STATE, drc_index, ALLOC_UNUSABLE); + if (rc) { + rtas_set_indicator(ISOLATION_STATE, drc_index, UNISOLATE); + return rc; + } + + return 0; +} + +#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE + +static DEFINE_MUTEX(pseries_cpu_hotplug_mutex); + +void cpu_hotplug_driver_lock() +{ + mutex_lock(&pseries_cpu_hotplug_mutex); +} + +void cpu_hotplug_driver_unlock() +{ + mutex_unlock(&pseries_cpu_hotplug_mutex); +} + +static int dlpar_online_cpu(struct device_node *dn) +{ + int rc = 0; + unsigned int cpu; + int len, nthreads, i; + const u32 *intserv; + + intserv = of_get_property(dn, "ibm,ppc-interrupt-server#s", &len); + if (!intserv) + return -EINVAL; + + nthreads = len / sizeof(u32); + + cpu_maps_update_begin(); + for (i = 0; i < nthreads; i++) { + for_each_present_cpu(cpu) { + if (get_hard_smp_processor_id(cpu) != intserv[i]) + continue; + BUG_ON(get_cpu_current_state(cpu) + != CPU_STATE_OFFLINE); + cpu_maps_update_done(); + rc = cpu_up(cpu); + if (rc) + goto out; + cpu_maps_update_begin(); + + break; + } + if (cpu == num_possible_cpus()) + printk(KERN_WARNING "Could not find cpu to online " + "with physical id 0x%x\n", intserv[i]); + } + cpu_maps_update_done(); + +out: + return rc; + +} + +static ssize_t dlpar_cpu_probe(const char *buf, size_t count) +{ + struct device_node *dn; + unsigned long drc_index; + char *cpu_name; + int rc; + + cpu_hotplug_driver_lock(); + rc = strict_strtoul(buf, 0, &drc_index); + if (rc) { + rc = -EINVAL; + goto out; + } + + dn = dlpar_configure_connector(drc_index); + if (!dn) { + rc = -EINVAL; + goto out; + } + + /* configure-connector reports cpus as living in the base + * directory of the device tree. CPUs actually live in the + * cpus directory so we need to fixup the full_name. + */ + cpu_name = kzalloc(strlen(dn->full_name) + strlen("/cpus") + 1, + GFP_KERNEL); + if (!cpu_name) { + dlpar_free_cc_nodes(dn); + rc = -ENOMEM; + goto out; + } + + sprintf(cpu_name, "/cpus%s", dn->full_name); + kfree(dn->full_name); + dn->full_name = cpu_name; + + rc = dlpar_acquire_drc(drc_index); + if (rc) { + dlpar_free_cc_nodes(dn); + rc = -EINVAL; + goto out; + } + + rc = dlpar_attach_node(dn); + if (rc) { + dlpar_release_drc(drc_index); + dlpar_free_cc_nodes(dn); + } + + rc = dlpar_online_cpu(dn); +out: + cpu_hotplug_driver_unlock(); + + return rc ? rc : count; +} + +static int dlpar_offline_cpu(struct device_node *dn) +{ + int rc = 0; + unsigned int cpu; + int len, nthreads, i; + const u32 *intserv; + + intserv = of_get_property(dn, "ibm,ppc-interrupt-server#s", &len); + if (!intserv) + return -EINVAL; + + nthreads = len / sizeof(u32); + + cpu_maps_update_begin(); + for (i = 0; i < nthreads; i++) { + for_each_present_cpu(cpu) { + if (get_hard_smp_processor_id(cpu) != intserv[i]) + continue; + + if (get_cpu_current_state(cpu) == CPU_STATE_OFFLINE) + break; + + if (get_cpu_current_state(cpu) == CPU_STATE_ONLINE) { + cpu_maps_update_done(); + rc = cpu_down(cpu); + if (rc) + goto out; + cpu_maps_update_begin(); + break; + + } + + /* + * The cpu is in CPU_STATE_INACTIVE. + * Upgrade it's state to CPU_STATE_OFFLINE. + */ + set_preferred_offline_state(cpu, CPU_STATE_OFFLINE); + BUG_ON(plpar_hcall_norets(H_PROD, intserv[i]) + != H_SUCCESS); + __cpu_die(cpu); + break; + } + if (cpu == num_possible_cpus()) + printk(KERN_WARNING "Could not find cpu to offline " + "with physical id 0x%x\n", intserv[i]); + } + cpu_maps_update_done(); + +out: + return rc; + +} + +static ssize_t dlpar_cpu_release(const char *buf, size_t count) +{ + struct device_node *dn; + const u32 *drc_index; + int rc; + + dn = of_find_node_by_path(buf); + if (!dn) + return -EINVAL; + + drc_index = of_get_property(dn, "ibm,my-drc-index", NULL); + if (!drc_index) { + of_node_put(dn); + return -EINVAL; + } + + cpu_hotplug_driver_lock(); + rc = dlpar_offline_cpu(dn); + if (rc) { + of_node_put(dn); + rc = -EINVAL; + goto out; + } + + rc = dlpar_release_drc(*drc_index); + if (rc) { + of_node_put(dn); + goto out; + } + + rc = dlpar_detach_node(dn); + if (rc) { + dlpar_acquire_drc(*drc_index); + goto out; + } + + of_node_put(dn); +out: + cpu_hotplug_driver_unlock(); + return rc ? rc : count; +} + +static int __init pseries_dlpar_init(void) +{ + ppc_md.cpu_probe = dlpar_cpu_probe; + ppc_md.cpu_release = dlpar_cpu_release; + + return 0; +} +machine_device_initcall(pseries, pseries_dlpar_init); + +#endif /* CONFIG_ARCH_CPU_PROBE_RELEASE */ diff --git a/arch/powerpc/platforms/pseries/eeh_driver.c b/arch/powerpc/platforms/pseries/eeh_driver.c index 0e8db6771252..ef8e45448480 100644 --- a/arch/powerpc/platforms/pseries/eeh_driver.c +++ b/arch/powerpc/platforms/pseries/eeh_driver.c @@ -63,22 +63,6 @@ static void print_device_node_tree(struct pci_dn *pdn, int dent) } #endif -/** - * irq_in_use - return true if this irq is being used - */ -static int irq_in_use(unsigned int irq) -{ - int rc = 0; - unsigned long flags; - struct irq_desc *desc = irq_desc + irq; - - spin_lock_irqsave(&desc->lock, flags); - if (desc->action) - rc = 1; - spin_unlock_irqrestore(&desc->lock, flags); - return rc; -} - /** * eeh_disable_irq - disable interrupt for the recovering device */ @@ -93,7 +77,7 @@ static void eeh_disable_irq(struct pci_dev *dev) if (dev->msi_enabled || dev->msix_enabled) return; - if (!irq_in_use(dev->irq)) + if (!irq_has_action(dev->irq)) return; PCI_DN(dn)->eeh_mode |= EEH_MODE_IRQ_DISABLED; diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c index ebff6d9a4e39..6ea4698d9176 100644 --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c @@ -30,6 +30,7 @@ #include <asm/pSeries_reconfig.h> #include "xics.h" #include "plpar_wrappers.h" +#include "offline_states.h" /* This version can't take the spinlock, because it never returns */ static struct rtas_args rtas_stop_self_args = { @@ -39,6 +40,55 @@ static struct rtas_args rtas_stop_self_args = { .rets = &rtas_stop_self_args.args[0], }; +static DEFINE_PER_CPU(enum cpu_state_vals, preferred_offline_state) = + CPU_STATE_OFFLINE; +static DEFINE_PER_CPU(enum cpu_state_vals, current_state) = CPU_STATE_OFFLINE; + +static enum cpu_state_vals default_offline_state = CPU_STATE_OFFLINE; + +static int cede_offline_enabled __read_mostly = 1; + +/* + * Enable/disable cede_offline when available. + */ +static int __init setup_cede_offline(char *str) +{ + if (!strcmp(str, "off")) + cede_offline_enabled = 0; + else if (!strcmp(str, "on")) + cede_offline_enabled = 1; + else + return 0; + return 1; +} + +__setup("cede_offline=", setup_cede_offline); + +enum cpu_state_vals get_cpu_current_state(int cpu) +{ + return per_cpu(current_state, cpu); +} + +void set_cpu_current_state(int cpu, enum cpu_state_vals state) +{ + per_cpu(current_state, cpu) = state; +} + +enum cpu_state_vals get_preferred_offline_state(int cpu) +{ + return per_cpu(preferred_offline_state, cpu); +} + +void set_preferred_offline_state(int cpu, enum cpu_state_vals state) +{ + per_cpu(preferred_offline_state, cpu) = state; +} + +void set_default_offline_state(int cpu) +{ + per_cpu(preferred_offline_state, cpu) = default_offline_state; +} + static void rtas_stop_self(void) { struct rtas_args *args = &rtas_stop_self_args; @@ -56,11 +106,61 @@ static void rtas_stop_self(void) static void pseries_mach_cpu_die(void) { + unsigned int cpu = smp_processor_id(); + unsigned int hwcpu = hard_smp_processor_id(); + u8 cede_latency_hint = 0; + local_irq_disable(); idle_task_exit(); xics_teardown_cpu(); - unregister_slb_shadow(hard_smp_processor_id(), __pa(get_slb_shadow())); - rtas_stop_self(); + + if (get_preferred_offline_state(cpu) == CPU_STATE_INACTIVE) { + set_cpu_current_state(cpu, CPU_STATE_INACTIVE); + cede_latency_hint = 2; + + get_lppaca()->idle = 1; + if (!get_lppaca()->shared_proc) + get_lppaca()->donate_dedicated_cpu = 1; + + printk(KERN_INFO + "cpu %u (hwid %u) ceding for offline with hint %d\n", + cpu, hwcpu, cede_latency_hint); + while (get_preferred_offline_state(cpu) == CPU_STATE_INACTIVE) { + extended_cede_processor(cede_latency_hint); + printk(KERN_INFO "cpu %u (hwid %u) returned from cede.\n", + cpu, hwcpu); + printk(KERN_INFO + "Decrementer value = %x Timebase value = %llx\n", + get_dec(), get_tb()); + } + + printk(KERN_INFO "cpu %u (hwid %u) got prodded to go online\n", + cpu, hwcpu); + + if (!get_lppaca()->shared_proc) + get_lppaca()->donate_dedicated_cpu = 0; + get_lppaca()->idle = 0; + } + + if (get_preferred_offline_state(cpu) == CPU_STATE_ONLINE) { + unregister_slb_shadow(hwcpu, __pa(get_slb_shadow())); + + /* + * NOTE: Calling start_secondary() here for now to + * start new context. + * However, need to do it cleanly by resetting the + * stack pointer. + */ + start_secondary(); + + } else if (get_preferred_offline_state(cpu) == CPU_STATE_OFFLINE) { + + set_cpu_current_state(cpu, CPU_STATE_OFFLINE); + unregister_slb_shadow(hard_smp_processor_id(), + __pa(get_slb_shadow())); + rtas_stop_self(); + } + /* Should never get here... */ BUG(); for(;;); @@ -106,18 +206,43 @@ static int pseries_cpu_disable(void) return 0; } +/* + * pseries_cpu_die: Wait for the cpu to die. + * @cpu: logical processor id of the CPU whose death we're awaiting. + * + * This function is called from the context of the thread which is performing + * the cpu-offline. Here we wait for long enough to allow the cpu in question + * to self-destroy so that the cpu-offline thread can send the CPU_DEAD + * notifications. + * + * OTOH, pseries_mach_cpu_die() is called by the @cpu when it wants to + * self-destruct. + */ static void pseries_cpu_die(unsigned int cpu) { int tries; - int cpu_status; + int cpu_status = 1; unsigned int pcpu = get_hard_smp_processor_id(cpu); - for (tries = 0; tries < 25; tries++) { - cpu_status = query_cpu_stopped(pcpu); - if (cpu_status == 0 || cpu_status == -1) - break; - cpu_relax(); + if (get_preferred_offline_state(cpu) == CPU_STATE_INACTIVE) { + cpu_status = 1; + for (tries = 0; tries < 1000; tries++) { + if (get_cpu_current_state(cpu) == CPU_STATE_INACTIVE) { + cpu_status = 0; + break; + } + cpu_relax(); + } + } else if (get_preferred_offline_state(cpu) == CPU_STATE_OFFLINE) { + + for (tries = 0; tries < 25; tries++) { + cpu_status = query_cpu_stopped(pcpu); + if (cpu_status == 0 || cpu_status == -1) + break; + cpu_relax(); + } } + if (cpu_status != 0) { printk("Querying DEAD? cpu %i (%i) shows %i\n", cpu, pcpu, cpu_status); @@ -252,10 +377,41 @@ static struct notifier_block pseries_smp_nb = { .notifier_call = pseries_smp_notifier, }; +#define MAX_CEDE_LATENCY_LEVELS 4 +#define CEDE_LATENCY_PARAM_LENGTH 10 +#define CEDE_LATENCY_PARAM_MAX_LENGTH \ + (MAX_CEDE_LATENCY_LEVELS * CEDE_LATENCY_PARAM_LENGTH * sizeof(char)) +#define CEDE_LATENCY_TOKEN 45 + +static char cede_parameters[CEDE_LATENCY_PARAM_MAX_LENGTH]; + +static int parse_cede_parameters(void) +{ + int call_status; + + memset(cede_parameters, 0, CEDE_LATENCY_PARAM_MAX_LENGTH); + call_status = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1, + NULL, + CEDE_LATENCY_TOKEN, + __pa(cede_parameters), + CEDE_LATENCY_PARAM_MAX_LENGTH); + + if (call_status != 0) + printk(KERN_INFO "CEDE_LATENCY: \ + %s %s Error calling get-system-parameter(0x%x)\n", + __FILE__, __func__, call_status); + else + printk(KERN_INFO "CEDE_LATENCY: \ + get-system-parameter successful.\n"); + + return call_status; +} + static int __init pseries_cpu_hotplug_init(void) { struct device_node *np; const char *typep; + int cpu; for_each_node_by_name(np, "interrupt-controller") { typep = of_get_property(np, "compatible", NULL); @@ -283,8 +439,16 @@ static int __init pseries_cpu_hotplug_init(void) smp_ops->cpu_die = pseries_cpu_die; /* Processors can be added/removed only on LPAR */ - if (firmware_has_feature(FW_FEATURE_LPAR)) + if (firmware_has_feature(FW_FEATURE_LPAR)) { pSeries_reconfig_notifier_register(&pseries_smp_nb); + cpu_maps_update_begin(); + if (cede_offline_enabled && parse_cede_parameters() == 0) { + default_offline_state = CPU_STATE_INACTIVE; + for_each_online_cpu(cpu) + set_default_offline_state(cpu); + } + cpu_maps_update_done(); + } return 0; } diff --git a/arch/powerpc/platforms/pseries/hvCall.S b/arch/powerpc/platforms/pseries/hvCall.S index c1427b3634ec..383a5d0e9818 100644 --- a/arch/powerpc/platforms/pseries/hvCall.S +++ b/arch/powerpc/platforms/pseries/hvCall.S @@ -14,68 +14,94 @@ #define STK_PARM(i) (48 + ((i)-3)*8) -#ifdef CONFIG_HCALL_STATS +#ifdef CONFIG_TRACEPOINTS + + .section ".toc","aw" + + .globl hcall_tracepoint_refcount +hcall_tracepoint_refcount: + .llong 0 + + .section ".text" + /* * precall must preserve all registers. use unused STK_PARM() - * areas to save snapshots and opcode. + * areas to save snapshots and opcode. We branch around this + * in early init (eg when populating the MMU hashtable) by using an + * unconditional cpu feature. */ -#define HCALL_INST_PRECALL \ - std r3,STK_PARM(r3)(r1); /* save opcode */ \ - mftb r0; /* get timebase and */ \ - std r0,STK_PARM(r5)(r1); /* save for later */ \ +#define HCALL_INST_PRECALL(FIRST_REG) \ BEGIN_FTR_SECTION; \ - mfspr r0,SPRN_PURR; /* get PURR and */ \ - std r0,STK_PARM(r6)(r1); /* save for later */ \ -END_FTR_SECTION_IFSET(CPU_FTR_PURR); - + b 1f; \ +END_FTR_SECTION(0, 1); \ + ld r12,hcall_tracepoint_refcount@toc(r2); \ + cmpdi r12,0; \ + beq+ 1f; \ + mflr r0; \ + std r3,STK_PARM(r3)(r1); \ + std r4,STK_PARM(r4)(r1); \ + std r5,STK_PARM(r5)(r1); \ + std r6,STK_PARM(r6)(r1); \ + std r7,STK_PARM(r7)(r1); \ + std r8,STK_PARM(r8)(r1); \ + std r9,STK_PARM(r9)(r1); \ + std r10,STK_PARM(r10)(r1); \ + std r0,16(r1); \ + addi r4,r1,STK_PARM(FIRST_REG); \ + stdu r1,-STACK_FRAME_OVERHEAD(r1); \ + bl .__trace_hcall_entry; \ + addi r1,r1,STACK_FRAME_OVERHEAD; \ + ld r0,16(r1); \ + ld r3,STK_PARM(r3)(r1); \ + ld r4,STK_PARM(r4)(r1); \ + ld r5,STK_PARM(r5)(r1); \ + ld r6,STK_PARM(r6)(r1); \ + ld r7,STK_PARM(r7)(r1); \ + ld r8,STK_PARM(r8)(r1); \ + ld r9,STK_PARM(r9)(r1); \ + ld r10,STK_PARM(r10)(r1); \ + mtlr r0; \ +1: + /* * postcall is performed immediately before function return which * allows liberal use of volatile registers. We branch around this * in early init (eg when populating the MMU hashtable) by using an * unconditional cpu feature. */ -#define HCALL_INST_POSTCALL \ +#define __HCALL_INST_POSTCALL \ BEGIN_FTR_SECTION; \ b 1f; \ END_FTR_SECTION(0, 1); \ - ld r4,STK_PARM(r3)(r1); /* validate opcode */ \ - cmpldi cr7,r4,MAX_HCALL_OPCODE; \ - bgt- cr7,1f; \ - \ - /* get time and PURR snapshots after hcall */ \ - mftb r7; /* timebase after */ \ -BEGIN_FTR_SECTION; \ - mfspr r8,SPRN_PURR; /* PURR after */ \ - ld r6,STK_PARM(r6)(r1); /* PURR before */ \ - subf r6,r6,r8; /* delta */ \ -END_FTR_SECTION_IFSET(CPU_FTR_PURR); \ - ld r5,STK_PARM(r5)(r1); /* timebase before */ \ - subf r5,r5,r7; /* time delta */ \ - \ - /* calculate address of stat structure r4 = opcode */ \ - srdi r4,r4,2; /* index into array */ \ - mulli r4,r4,HCALL_STAT_SIZE; \ - LOAD_REG_ADDR(r7, per_cpu__hcall_stats); \ - add r4,r4,r7; \ - ld r7,PACA_DATA_OFFSET(r13); /* per cpu offset */ \ - add r4,r4,r7; \ - \ - /* update stats */ \ - ld r7,HCALL_STAT_CALLS(r4); /* count */ \ - addi r7,r7,1; \ - std r7,HCALL_STAT_CALLS(r4); \ - ld r7,HCALL_STAT_TB(r4); /* timebase */ \ - add r7,r7,r5; \ - std r7,HCALL_STAT_TB(r4); \ -BEGIN_FTR_SECTION; \ - ld r7,HCALL_STAT_PURR(r4); /* PURR */ \ - add r7,r7,r6; \ - std r7,HCALL_STAT_PURR(r4); \ -END_FTR_SECTION_IFSET(CPU_FTR_PURR); \ + ld r12,hcall_tracepoint_refcount@toc(r2); \ + cmpdi r12,0; \ + beq+ 1f; \ + mflr r0; \ + ld r6,STK_PARM(r3)(r1); \ + std r3,STK_PARM(r3)(r1); \ + mr r4,r3; \ + mr r3,r6; \ + std r0,16(r1); \ + stdu r1,-STACK_FRAME_OVERHEAD(r1); \ + bl .__trace_hcall_exit; \ + addi r1,r1,STACK_FRAME_OVERHEAD; \ + ld r0,16(r1); \ + ld r3,STK_PARM(r3)(r1); \ + mtlr r0; \ 1: + +#define HCALL_INST_POSTCALL_NORETS \ + li r5,0; \ + __HCALL_INST_POSTCALL + +#define HCALL_INST_POSTCALL(BUFREG) \ + mr r5,BUFREG; \ + __HCALL_INST_POSTCALL + #else -#define HCALL_INST_PRECALL -#define HCALL_INST_POSTCALL +#define HCALL_INST_PRECALL(FIRST_ARG) +#define HCALL_INST_POSTCALL_NORETS +#define HCALL_INST_POSTCALL(BUFREG) #endif .text @@ -86,11 +112,11 @@ _GLOBAL(plpar_hcall_norets) mfcr r0 stw r0,8(r1) - HCALL_INST_PRECALL + HCALL_INST_PRECALL(r4) HVSC /* invoke the hypervisor */ - HCALL_INST_POSTCALL + HCALL_INST_POSTCALL_NORETS lwz r0,8(r1) mtcrf 0xff,r0 @@ -102,7 +128,7 @@ _GLOBAL(plpar_hcall) mfcr r0 stw r0,8(r1) - HCALL_INST_PRECALL + HCALL_INST_PRECALL(r5) std r4,STK_PARM(r4)(r1) /* Save ret buffer */ @@ -121,7 +147,7 @@ _GLOBAL(plpar_hcall) std r6, 16(r12) std r7, 24(r12) - HCALL_INST_POSTCALL + HCALL_INST_POSTCALL(r12) lwz r0,8(r1) mtcrf 0xff,r0 @@ -168,7 +194,7 @@ _GLOBAL(plpar_hcall9) mfcr r0 stw r0,8(r1) - HCALL_INST_PRECALL + HCALL_INST_PRECALL(r5) std r4,STK_PARM(r4)(r1) /* Save ret buffer */ @@ -196,7 +222,7 @@ _GLOBAL(plpar_hcall9) std r11,56(r12) std r0, 64(r12) - HCALL_INST_POSTCALL + HCALL_INST_POSTCALL(r12) lwz r0,8(r1) mtcrf 0xff,r0 diff --git a/arch/powerpc/platforms/pseries/hvCall_inst.c b/arch/powerpc/platforms/pseries/hvCall_inst.c index 3631a4f277eb..2f58c71b7259 100644 --- a/arch/powerpc/platforms/pseries/hvCall_inst.c +++ b/arch/powerpc/platforms/pseries/hvCall_inst.c @@ -26,6 +26,7 @@ #include <asm/hvcall.h> #include <asm/firmware.h> #include <asm/cputable.h> +#include <asm/trace.h> DEFINE_PER_CPU(struct hcall_stats[HCALL_STAT_ARRAY_SIZE], hcall_stats); @@ -100,6 +101,35 @@ static const struct file_operations hcall_inst_seq_fops = { #define HCALL_ROOT_DIR "hcall_inst" #define CPU_NAME_BUF_SIZE 32 + +static void probe_hcall_entry(unsigned long opcode, unsigned long *args) +{ + struct hcall_stats *h; + + if (opcode > MAX_HCALL_OPCODE) + return; + + h = &get_cpu_var(hcall_stats)[opcode / 4]; + h->tb_start = mftb(); + h->purr_start = mfspr(SPRN_PURR); +} + +static void probe_hcall_exit(unsigned long opcode, unsigned long retval, + unsigned long *retbuf) +{ + struct hcall_stats *h; + + if (opcode > MAX_HCALL_OPCODE) + return; + + h = &__get_cpu_var(hcall_stats)[opcode / 4]; + h->num_calls++; + h->tb_total = mftb() - h->tb_start; + h->purr_total = mfspr(SPRN_PURR) - h->purr_start; + + put_cpu_var(hcall_stats); +} + static int __init hcall_inst_init(void) { struct dentry *hcall_root; @@ -110,6 +140,14 @@ static int __init hcall_inst_init(void) if (!firmware_has_feature(FW_FEATURE_LPAR)) return 0; + if (register_trace_hcall_entry(probe_hcall_entry)) + return -EINVAL; + + if (register_trace_hcall_exit(probe_hcall_exit)) { + unregister_trace_hcall_entry(probe_hcall_entry); + return -EINVAL; + } + hcall_root = debugfs_create_dir(HCALL_ROOT_DIR, NULL); if (!hcall_root) return -ENOMEM; diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index 903eb9eec687..0707653612ba 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -39,6 +39,7 @@ #include <asm/cputable.h> #include <asm/udbg.h> #include <asm/smp.h> +#include <asm/trace.h> #include "plpar_wrappers.h" #include "pseries.h" @@ -661,3 +662,35 @@ void arch_free_page(struct page *page, int order) EXPORT_SYMBOL(arch_free_page); #endif + +#ifdef CONFIG_TRACEPOINTS +/* + * We optimise our hcall path by placing hcall_tracepoint_refcount + * directly in the TOC so we can check if the hcall tracepoints are + * enabled via a single load. + */ + +/* NB: reg/unreg are called while guarded with the tracepoints_mutex */ +extern long hcall_tracepoint_refcount; + +void hcall_tracepoint_regfunc(void) +{ + hcall_tracepoint_refcount++; +} + +void hcall_tracepoint_unregfunc(void) +{ + hcall_tracepoint_refcount--; +} + +void __trace_hcall_entry(unsigned long opcode, unsigned long *args) +{ + trace_hcall_entry(opcode, args); +} + +void __trace_hcall_exit(long opcode, unsigned long retval, + unsigned long *retbuf) +{ + trace_hcall_exit(opcode, retval, retbuf); +} +#endif diff --git a/arch/powerpc/platforms/pseries/offline_states.h b/arch/powerpc/platforms/pseries/offline_states.h new file mode 100644 index 000000000000..22574e0d9d91 --- /dev/null +++ b/arch/powerpc/platforms/pseries/offline_states.h @@ -0,0 +1,18 @@ +#ifndef _OFFLINE_STATES_H_ +#define _OFFLINE_STATES_H_ + +/* Cpu offline states go here */ +enum cpu_state_vals { + CPU_STATE_OFFLINE, + CPU_STATE_INACTIVE, + CPU_STATE_ONLINE, + CPU_MAX_OFFLINE_STATES +}; + +extern enum cpu_state_vals get_cpu_current_state(int cpu); +extern void set_cpu_current_state(int cpu, enum cpu_state_vals state); +extern enum cpu_state_vals get_preferred_offline_state(int cpu); +extern void set_preferred_offline_state(int cpu, enum cpu_state_vals state); +extern void set_default_offline_state(int cpu); +extern int start_secondary(void); +#endif diff --git a/arch/powerpc/platforms/pseries/plpar_wrappers.h b/arch/powerpc/platforms/pseries/plpar_wrappers.h index a24a6b2333b2..0603c91538ae 100644 --- a/arch/powerpc/platforms/pseries/plpar_wrappers.h +++ b/arch/powerpc/platforms/pseries/plpar_wrappers.h @@ -9,11 +9,33 @@ static inline long poll_pending(void) return plpar_hcall_norets(H_POLL_PENDING); } +static inline u8 get_cede_latency_hint(void) +{ + return get_lppaca()->gpr5_dword.fields.cede_latency_hint; +} + +static inline void set_cede_latency_hint(u8 latency_hint) +{ + get_lppaca()->gpr5_dword.fields.cede_latency_hint = latency_hint; +} + static inline long cede_processor(void) { return plpar_hcall_norets(H_CEDE); } +static inline long extended_cede_processor(unsigned long latency_hint) +{ + long rc; + u8 old_latency_hint = get_cede_latency_hint(); + + set_cede_latency_hint(latency_hint); + rc = cede_processor(); + set_cede_latency_hint(old_latency_hint); + + return rc; +} + static inline long vpa_call(unsigned long flags, unsigned long cpu, unsigned long vpa) { diff --git a/arch/powerpc/platforms/pseries/reconfig.c b/arch/powerpc/platforms/pseries/reconfig.c index 2e2bbe120b90..a2305d29bbbd 100644 --- a/arch/powerpc/platforms/pseries/reconfig.c +++ b/arch/powerpc/platforms/pseries/reconfig.c @@ -96,7 +96,7 @@ static struct device_node *derive_parent(const char *path) return parent; } -static BLOCKING_NOTIFIER_HEAD(pSeries_reconfig_chain); +BLOCKING_NOTIFIER_HEAD(pSeries_reconfig_chain); int pSeries_reconfig_notifier_register(struct notifier_block *nb) { @@ -184,7 +184,7 @@ static int pSeries_reconfig_remove_node(struct device_node *np) } /* - * /proc/ppc64/ofdt - yucky binary interface for adding and removing + * /proc/powerpc/ofdt - yucky binary interface for adding and removing * OF device nodes. Should be deprecated as soon as we get an * in-kernel wrapper for the RTAS ibm,configure-connector call. */ @@ -543,7 +543,7 @@ static const struct file_operations ofdt_fops = { .write = ofdt_write }; -/* create /proc/ppc64/ofdt write-only by root */ +/* create /proc/powerpc/ofdt write-only by root */ static int proc_ppc64_create_ofdt(void) { struct proc_dir_entry *ent; @@ -551,7 +551,7 @@ static int proc_ppc64_create_ofdt(void) if (!machine_is(pseries)) return 0; - ent = proc_create("ppc64/ofdt", S_IWUSR, NULL, &ofdt_fops); + ent = proc_create("powerpc/ofdt", S_IWUSR, NULL, &ofdt_fops); if (ent) ent->size = 0; diff --git a/arch/powerpc/platforms/pseries/rtasd.c b/arch/powerpc/platforms/pseries/rtasd.c deleted file mode 100644 index b3cbac855924..000000000000 --- a/arch/powerpc/platforms/pseries/rtasd.c +++ /dev/null @@ -1,519 +0,0 @@ -/* - * Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Communication to userspace based on kernel/printk.c - */ - -#include <linux/types.h> -#include <linux/errno.h> -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/poll.h> -#include <linux/proc_fs.h> -#include <linux/init.h> -#include <linux/vmalloc.h> -#include <linux/spinlock.h> -#include <linux/cpu.h> -#include <linux/workqueue.h> - -#include <asm/uaccess.h> -#include <asm/io.h> -#include <asm/rtas.h> -#include <asm/prom.h> -#include <asm/nvram.h> -#include <asm/atomic.h> -#include <asm/machdep.h> - - -static DEFINE_SPINLOCK(rtasd_log_lock); - -static DECLARE_WAIT_QUEUE_HEAD(rtas_log_wait); - -static char *rtas_log_buf; -static unsigned long rtas_log_start; -static unsigned long rtas_log_size; - -static int surveillance_timeout = -1; -static unsigned int rtas_error_log_max; -static unsigned int rtas_error_log_buffer_max; - -/* RTAS service tokens */ -static unsigned int event_scan; -static unsigned int rtas_event_scan_rate; - -static int full_rtas_msgs = 0; - -/* Stop logging to nvram after first fatal error */ -static int logging_enabled; /* Until we initialize everything, - * make sure we don't try logging - * anything */ -static int error_log_cnt; - -/* - * Since we use 32 bit RTAS, the physical address of this must be below - * 4G or else bad things happen. Allocate this in the kernel data and - * make it big enough. - */ -static unsigned char logdata[RTAS_ERROR_LOG_MAX]; - -static char *rtas_type[] = { - "Unknown", "Retry", "TCE Error", "Internal Device Failure", - "Timeout", "Data Parity", "Address Parity", "Cache Parity", - "Address Invalid", "ECC Uncorrected", "ECC Corrupted", -}; - -static char *rtas_event_type(int type) -{ - if ((type > 0) && (type < 11)) - return rtas_type[type]; - - switch (type) { - case RTAS_TYPE_EPOW: - return "EPOW"; - case RTAS_TYPE_PLATFORM: - return "Platform Error"; - case RTAS_TYPE_IO: - return "I/O Event"; - case RTAS_TYPE_INFO: - return "Platform Information Event"; - case RTAS_TYPE_DEALLOC: - return "Resource Deallocation Event"; - case RTAS_TYPE_DUMP: - return "Dump Notification Event"; - } - - return rtas_type[0]; -} - -/* To see this info, grep RTAS /var/log/messages and each entry - * will be collected together with obvious begin/end. - * There will be a unique identifier on the begin and end lines. - * This will persist across reboots. - * - * format of error logs returned from RTAS: - * bytes (size) : contents - * -------------------------------------------------------- - * 0-7 (8) : rtas_error_log - * 8-47 (40) : extended info - * 48-51 (4) : vendor id - * 52-1023 (vendor specific) : location code and debug data - */ -static void printk_log_rtas(char *buf, int len) -{ - - int i,j,n = 0; - int perline = 16; - char buffer[64]; - char * str = "RTAS event"; - - if (full_rtas_msgs) { - printk(RTAS_DEBUG "%d -------- %s begin --------\n", - error_log_cnt, str); - - /* - * Print perline bytes on each line, each line will start - * with RTAS and a changing number, so syslogd will - * print lines that are otherwise the same. Separate every - * 4 bytes with a space. - */ - for (i = 0; i < len; i++) { - j = i % perline; - if (j == 0) { - memset(buffer, 0, sizeof(buffer)); - n = sprintf(buffer, "RTAS %d:", i/perline); - } - - if ((i % 4) == 0) - n += sprintf(buffer+n, " "); - - n += sprintf(buffer+n, "%02x", (unsigned char)buf[i]); - - if (j == (perline-1)) - printk(KERN_DEBUG "%s\n", buffer); - } - if ((i % perline) != 0) - printk(KERN_DEBUG "%s\n", buffer); - - printk(RTAS_DEBUG "%d -------- %s end ----------\n", - error_log_cnt, str); - } else { - struct rtas_error_log *errlog = (struct rtas_error_log *)buf; - - printk(RTAS_DEBUG "event: %d, Type: %s, Severity: %d\n", - error_log_cnt, rtas_event_type(errlog->type), - errlog->severity); - } -} - -static int log_rtas_len(char * buf) -{ - int len; - struct rtas_error_log *err; - - /* rtas fixed header */ - len = 8; - err = (struct rtas_error_log *)buf; - if (err->extended_log_length) { - - /* extended header */ - len += err->extended_log_length; - } - - if (rtas_error_log_max == 0) - rtas_error_log_max = rtas_get_error_log_max(); - - if (len > rtas_error_log_max) - len = rtas_error_log_max; - - return len; -} - -/* - * First write to nvram, if fatal error, that is the only - * place we log the info. The error will be picked up - * on the next reboot by rtasd. If not fatal, run the - * method for the type of error. Currently, only RTAS - * errors have methods implemented, but in the future - * there might be a need to store data in nvram before a - * call to panic(). - * - * XXX We write to nvram periodically, to indicate error has - * been written and sync'd, but there is a possibility - * that if we don't shutdown correctly, a duplicate error - * record will be created on next reboot. - */ -void pSeries_log_error(char *buf, unsigned int err_type, int fatal) -{ - unsigned long offset; - unsigned long s; - int len = 0; - - pr_debug("rtasd: logging event\n"); - if (buf == NULL) - return; - - spin_lock_irqsave(&rtasd_log_lock, s); - - /* get length and increase count */ - switch (err_type & ERR_TYPE_MASK) { - case ERR_TYPE_RTAS_LOG: - len = log_rtas_len(buf); - if (!(err_type & ERR_FLAG_BOOT)) - error_log_cnt++; - break; - case ERR_TYPE_KERNEL_PANIC: - default: - WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */ - spin_unlock_irqrestore(&rtasd_log_lock, s); - return; - } - - /* Write error to NVRAM */ - if (logging_enabled && !(err_type & ERR_FLAG_BOOT)) - nvram_write_error_log(buf, len, err_type, error_log_cnt); - - /* - * rtas errors can occur during boot, and we do want to capture - * those somewhere, even if nvram isn't ready (why not?), and even - * if rtasd isn't ready. Put them into the boot log, at least. - */ - if ((err_type & ERR_TYPE_MASK) == ERR_TYPE_RTAS_LOG) - printk_log_rtas(buf, len); - - /* Check to see if we need to or have stopped logging */ - if (fatal || !logging_enabled) { - logging_enabled = 0; - WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */ - spin_unlock_irqrestore(&rtasd_log_lock, s); - return; - } - - /* call type specific method for error */ - switch (err_type & ERR_TYPE_MASK) { - case ERR_TYPE_RTAS_LOG: - offset = rtas_error_log_buffer_max * - ((rtas_log_start+rtas_log_size) & LOG_NUMBER_MASK); - - /* First copy over sequence number */ - memcpy(&rtas_log_buf[offset], (void *) &error_log_cnt, sizeof(int)); - - /* Second copy over error log data */ - offset += sizeof(int); - memcpy(&rtas_log_buf[offset], buf, len); - - if (rtas_log_size < LOG_NUMBER) - rtas_log_size += 1; - else - rtas_log_start += 1; - - WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */ - spin_unlock_irqrestore(&rtasd_log_lock, s); - wake_up_interruptible(&rtas_log_wait); - break; - case ERR_TYPE_KERNEL_PANIC: - default: - WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */ - spin_unlock_irqrestore(&rtasd_log_lock, s); - return; - } - -} - - -static int rtas_log_open(struct inode * inode, struct file * file) -{ - return 0; -} - -static int rtas_log_release(struct inode * inode, struct file * file) -{ - return 0; -} - -/* This will check if all events are logged, if they are then, we - * know that we can safely clear the events in NVRAM. - * Next we'll sit and wait for something else to log. - */ -static ssize_t rtas_log_read(struct file * file, char __user * buf, - size_t count, loff_t *ppos) -{ - int error; - char *tmp; - unsigned long s; - unsigned long offset; - - if (!buf || count < rtas_error_log_buffer_max) - return -EINVAL; - - count = rtas_error_log_buffer_max; - - if (!access_ok(VERIFY_WRITE, buf, count)) - return -EFAULT; - - tmp = kmalloc(count, GFP_KERNEL); - if (!tmp) - return -ENOMEM; - - spin_lock_irqsave(&rtasd_log_lock, s); - /* if it's 0, then we know we got the last one (the one in NVRAM) */ - while (rtas_log_size == 0) { - if (file->f_flags & O_NONBLOCK) { - spin_unlock_irqrestore(&rtasd_log_lock, s); - error = -EAGAIN; - goto out; - } - - if (!logging_enabled) { - spin_unlock_irqrestore(&rtasd_log_lock, s); - error = -ENODATA; - goto out; - } - nvram_clear_error_log(); - - spin_unlock_irqrestore(&rtasd_log_lock, s); - error = wait_event_interruptible(rtas_log_wait, rtas_log_size); - if (error) - goto out; - spin_lock_irqsave(&rtasd_log_lock, s); - } - - offset = rtas_error_log_buffer_max * (rtas_log_start & LOG_NUMBER_MASK); - memcpy(tmp, &rtas_log_buf[offset], count); - - rtas_log_start += 1; - rtas_log_size -= 1; - spin_unlock_irqrestore(&rtasd_log_lock, s); - - error = copy_to_user(buf, tmp, count) ? -EFAULT : count; -out: - kfree(tmp); - return error; -} - -static unsigned int rtas_log_poll(struct file *file, poll_table * wait) -{ - poll_wait(file, &rtas_log_wait, wait); - if (rtas_log_size) - return POLLIN | POLLRDNORM; - return 0; -} - -static const struct file_operations proc_rtas_log_operations = { - .read = rtas_log_read, - .poll = rtas_log_poll, - .open = rtas_log_open, - .release = rtas_log_release, -}; - -static int enable_surveillance(int timeout) -{ - int error; - - error = rtas_set_indicator(SURVEILLANCE_TOKEN, 0, timeout); - - if (error == 0) - return 0; - - if (error == -EINVAL) { - printk(KERN_DEBUG "rtasd: surveillance not supported\n"); - return 0; - } - - printk(KERN_ERR "rtasd: could not update surveillance\n"); - return -1; -} - -static void do_event_scan(void) -{ - int error; - do { - memset(logdata, 0, rtas_error_log_max); - error = rtas_call(event_scan, 4, 1, NULL, - RTAS_EVENT_SCAN_ALL_EVENTS, 0, - __pa(logdata), rtas_error_log_max); - if (error == -1) { - printk(KERN_ERR "event-scan failed\n"); - break; - } - - if (error == 0) - pSeries_log_error(logdata, ERR_TYPE_RTAS_LOG, 0); - - } while(error == 0); -} - -static void rtas_event_scan(struct work_struct *w); -DECLARE_DELAYED_WORK(event_scan_work, rtas_event_scan); - -/* - * Delay should be at least one second since some machines have problems if - * we call event-scan too quickly. - */ -static unsigned long event_scan_delay = 1*HZ; -static int first_pass = 1; - -static void rtas_event_scan(struct work_struct *w) -{ - unsigned int cpu; - - do_event_scan(); - - get_online_cpus(); - - cpu = next_cpu(smp_processor_id(), cpu_online_map); - if (cpu == NR_CPUS) { - cpu = first_cpu(cpu_online_map); - - if (first_pass) { - first_pass = 0; - event_scan_delay = 30*HZ/rtas_event_scan_rate; - - if (surveillance_timeout != -1) { - pr_debug("rtasd: enabling surveillance\n"); - enable_surveillance(surveillance_timeout); - pr_debug("rtasd: surveillance enabled\n"); - } - } - } - - schedule_delayed_work_on(cpu, &event_scan_work, - __round_jiffies_relative(event_scan_delay, cpu)); - - put_online_cpus(); -} - -static void start_event_scan(void) -{ - unsigned int err_type; - int rc; - - printk(KERN_DEBUG "RTAS daemon started\n"); - pr_debug("rtasd: will sleep for %d milliseconds\n", - (30000 / rtas_event_scan_rate)); - - /* See if we have any error stored in NVRAM */ - memset(logdata, 0, rtas_error_log_max); - rc = nvram_read_error_log(logdata, rtas_error_log_max, - &err_type, &error_log_cnt); - /* We can use rtas_log_buf now */ - logging_enabled = 1; - - if (!rc) { - if (err_type != ERR_FLAG_ALREADY_LOGGED) { - pSeries_log_error(logdata, err_type | ERR_FLAG_BOOT, 0); - } - } - - schedule_delayed_work_on(first_cpu(cpu_online_map), &event_scan_work, - event_scan_delay); -} - -static int __init rtas_init(void) -{ - struct proc_dir_entry *entry; - - if (!machine_is(pseries)) - return 0; - - /* No RTAS */ - event_scan = rtas_token("event-scan"); - if (event_scan == RTAS_UNKNOWN_SERVICE) { - printk(KERN_DEBUG "rtasd: no event-scan on system\n"); - return -ENODEV; - } - - rtas_event_scan_rate = rtas_token("rtas-event-scan-rate"); - if (rtas_event_scan_rate == RTAS_UNKNOWN_SERVICE) { - printk(KERN_ERR "rtasd: no rtas-event-scan-rate on system\n"); - return -ENODEV; - } - - /* Make room for the sequence number */ - rtas_error_log_max = rtas_get_error_log_max(); - rtas_error_log_buffer_max = rtas_error_log_max + sizeof(int); - - rtas_log_buf = vmalloc(rtas_error_log_buffer_max*LOG_NUMBER); - if (!rtas_log_buf) { - printk(KERN_ERR "rtasd: no memory\n"); - return -ENOMEM; - } - - entry = proc_create("ppc64/rtas/error_log", S_IRUSR, NULL, - &proc_rtas_log_operations); - if (!entry) - printk(KERN_ERR "Failed to create error_log proc entry\n"); - - start_event_scan(); - - return 0; -} - -static int __init surveillance_setup(char *str) -{ - int i; - - if (get_option(&str,&i)) { - if (i >= 0 && i <= 255) - surveillance_timeout = i; - } - - return 1; -} - -static int __init rtasmsgs_setup(char *str) -{ - if (strcmp(str, "on") == 0) - full_rtas_msgs = 1; - else if (strcmp(str, "off") == 0) - full_rtas_msgs = 0; - - return 1; -} -__initcall(rtas_init); -__setup("surveillance=", surveillance_setup); -__setup("rtasmsgs=", rtasmsgs_setup); diff --git a/arch/powerpc/platforms/pseries/scanlog.c b/arch/powerpc/platforms/pseries/scanlog.c index 417eca79df69..1b45c458f952 100644 --- a/arch/powerpc/platforms/pseries/scanlog.c +++ b/arch/powerpc/platforms/pseries/scanlog.c @@ -13,7 +13,7 @@ * of this data using this driver. A dump exists if the device-tree * /chosen/ibm,scan-log-data property exists. * - * This driver exports /proc/ppc64/scan-log-dump which can be read. + * This driver exports /proc/powerpc/scan-log-dump which can be read. * The driver supports only sequential reads. * * The driver looks at a write to the driver for the single word "reset". @@ -186,7 +186,7 @@ static int __init scanlog_init(void) if (!data) goto err; - ent = proc_create_data("ppc64/rtas/scan-log-dump", S_IRUSR, NULL, + ent = proc_create_data("powerpc/rtas/scan-log-dump", S_IRUSR, NULL, &scanlog_fops, data); if (!ent) goto err; diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c index 440000cc7130..8868c012268a 100644 --- a/arch/powerpc/platforms/pseries/smp.c +++ b/arch/powerpc/platforms/pseries/smp.c @@ -48,6 +48,7 @@ #include "plpar_wrappers.h" #include "pseries.h" #include "xics.h" +#include "offline_states.h" /* @@ -84,6 +85,9 @@ static inline int __devinit smp_startup_cpu(unsigned int lcpu) /* Fixup atomic count: it exited inside IRQ handler. */ task_thread_info(paca[lcpu].__current)->preempt_count = 0; + if (get_cpu_current_state(lcpu) == CPU_STATE_INACTIVE) + goto out; + /* * If the RTAS start-cpu token does not exist then presume the * cpu is already spinning. @@ -98,6 +102,7 @@ static inline int __devinit smp_startup_cpu(unsigned int lcpu) return 0; } +out: return 1; } @@ -111,12 +116,16 @@ static void __devinit smp_xics_setup_cpu(int cpu) vpa_init(cpu); cpu_clear(cpu, of_spin_map); + set_cpu_current_state(cpu, CPU_STATE_ONLINE); + set_default_offline_state(cpu); } #endif /* CONFIG_XICS */ static void __devinit smp_pSeries_kick_cpu(int nr) { + long rc; + unsigned long hcpuid; BUG_ON(nr < 0 || nr >= NR_CPUS); if (!smp_startup_cpu(nr)) @@ -128,6 +137,16 @@ static void __devinit smp_pSeries_kick_cpu(int nr) * the processor will continue on to secondary_start */ paca[nr].cpu_start = 1; + + set_preferred_offline_state(nr, CPU_STATE_ONLINE); + + if (get_cpu_current_state(nr) == CPU_STATE_INACTIVE) { + hcpuid = get_hard_smp_processor_id(nr); + rc = plpar_hcall_norets(H_PROD, hcpuid); + if (rc != H_SUCCESS) + panic("Error: Prod to wake up processor %d Ret= %ld\n", + nr, rc); + } } static int smp_pSeries_cpu_bootable(unsigned int nr) diff --git a/arch/powerpc/platforms/pseries/xics.c b/arch/powerpc/platforms/pseries/xics.c index b9bf0eedccf2..7d01b58f3989 100644 --- a/arch/powerpc/platforms/pseries/xics.c +++ b/arch/powerpc/platforms/pseries/xics.c @@ -20,6 +20,7 @@ #include <linux/cpu.h> #include <linux/msi.h> #include <linux/of.h> +#include <linux/percpu.h> #include <asm/firmware.h> #include <asm/io.h> @@ -46,6 +47,12 @@ static struct irq_host *xics_host; */ #define IPI_PRIORITY 4 +/* The least favored priority */ +#define LOWEST_PRIORITY 0xFF + +/* The number of priorities defined above */ +#define MAX_NUM_PRIORITIES 3 + static unsigned int default_server = 0xFF; static unsigned int default_distrib_server = 0; static unsigned int interrupt_server_size = 8; @@ -56,6 +63,12 @@ static int ibm_set_xive; static int ibm_int_on; static int ibm_int_off; +struct xics_cppr { + unsigned char stack[MAX_NUM_PRIORITIES]; + int index; +}; + +static DEFINE_PER_CPU(struct xics_cppr, xics_cppr); /* Direct hardware low level accessors */ @@ -157,7 +170,7 @@ static int get_irq_server(unsigned int virq, unsigned int strict_check) cpumask_t cpumask; cpumask_t tmp = CPU_MASK_NONE; - cpumask_copy(&cpumask, irq_desc[virq].affinity); + cpumask_copy(&cpumask, irq_to_desc(virq)->affinity); if (!distribute_irqs) return default_server; @@ -284,6 +297,19 @@ static inline unsigned int xics_xirr_vector(unsigned int xirr) return xirr & 0x00ffffff; } +static void push_cppr(unsigned int vec) +{ + struct xics_cppr *os_cppr = &__get_cpu_var(xics_cppr); + + if (WARN_ON(os_cppr->index >= MAX_NUM_PRIORITIES - 1)) + return; + + if (vec == XICS_IPI) + os_cppr->stack[++os_cppr->index] = IPI_PRIORITY; + else + os_cppr->stack[++os_cppr->index] = DEFAULT_PRIORITY; +} + static unsigned int xics_get_irq_direct(void) { unsigned int xirr = direct_xirr_info_get(); @@ -294,8 +320,10 @@ static unsigned int xics_get_irq_direct(void) return NO_IRQ; irq = irq_radix_revmap_lookup(xics_host, vec); - if (likely(irq != NO_IRQ)) + if (likely(irq != NO_IRQ)) { + push_cppr(vec); return irq; + } /* We don't have a linux mapping, so have rtas mask it. */ xics_mask_unknown_vec(vec); @@ -315,8 +343,10 @@ static unsigned int xics_get_irq_lpar(void) return NO_IRQ; irq = irq_radix_revmap_lookup(xics_host, vec); - if (likely(irq != NO_IRQ)) + if (likely(irq != NO_IRQ)) { + push_cppr(vec); return irq; + } /* We don't have a linux mapping, so have RTAS mask it. */ xics_mask_unknown_vec(vec); @@ -326,12 +356,22 @@ static unsigned int xics_get_irq_lpar(void) return NO_IRQ; } +static unsigned char pop_cppr(void) +{ + struct xics_cppr *os_cppr = &__get_cpu_var(xics_cppr); + + if (WARN_ON(os_cppr->index < 1)) + return LOWEST_PRIORITY; + + return os_cppr->stack[--os_cppr->index]; +} + static void xics_eoi_direct(unsigned int virq) { unsigned int irq = (unsigned int)irq_map[virq].hwirq; iosync(); - direct_xirr_info_set((0xff << 24) | irq); + direct_xirr_info_set((pop_cppr() << 24) | irq); } static void xics_eoi_lpar(unsigned int virq) @@ -339,7 +379,7 @@ static void xics_eoi_lpar(unsigned int virq) unsigned int irq = (unsigned int)irq_map[virq].hwirq; iosync(); - lpar_xirr_info_set((0xff << 24) | irq); + lpar_xirr_info_set((pop_cppr() << 24) | irq); } static int xics_set_affinity(unsigned int virq, const struct cpumask *cpumask) @@ -388,7 +428,7 @@ static int xics_set_affinity(unsigned int virq, const struct cpumask *cpumask) } static struct irq_chip xics_pic_direct = { - .typename = " XICS ", + .name = " XICS ", .startup = xics_startup, .mask = xics_mask_irq, .unmask = xics_unmask_irq, @@ -397,7 +437,7 @@ static struct irq_chip xics_pic_direct = { }; static struct irq_chip xics_pic_lpar = { - .typename = " XICS ", + .name = " XICS ", .startup = xics_startup, .mask = xics_mask_irq, .unmask = xics_unmask_irq, @@ -428,13 +468,13 @@ static int xics_host_map(struct irq_host *h, unsigned int virq, /* Insert the interrupt mapping into the radix tree for fast lookup */ irq_radix_revmap_insert(xics_host, virq, hw); - get_irq_desc(virq)->status |= IRQ_LEVEL; + irq_to_desc(virq)->status |= IRQ_LEVEL; set_irq_chip_and_handler(virq, xics_irq_chip, handle_fasteoi_irq); return 0; } static int xics_host_xlate(struct irq_host *h, struct device_node *ct, - u32 *intspec, unsigned int intsize, + const u32 *intspec, unsigned int intsize, irq_hw_number_t *out_hwirq, unsigned int *out_flags) { @@ -746,6 +786,12 @@ void __init xics_init_IRQ(void) static void xics_set_cpu_priority(unsigned char cppr) { + struct xics_cppr *os_cppr = &__get_cpu_var(xics_cppr); + + BUG_ON(os_cppr->index != 0); + + os_cppr->stack[os_cppr->index] = cppr; + if (firmware_has_feature(FW_FEATURE_LPAR)) lpar_cppr_info(cppr); else @@ -772,7 +818,7 @@ static void xics_set_cpu_giq(unsigned int gserver, unsigned int join) void xics_setup_cpu(void) { - xics_set_cpu_priority(0xff); + xics_set_cpu_priority(LOWEST_PRIORITY); xics_set_cpu_giq(default_distrib_server, 1); } @@ -852,7 +898,7 @@ void xics_migrate_irqs_away(void) /* We need to get IPIs still. */ if (irq == XICS_IPI || irq == XICS_IRQ_SPURIOUS) continue; - desc = get_irq_desc(virq); + desc = irq_to_desc(virq); /* We only need to migrate enabled IRQS */ if (desc == NULL || desc->chip == NULL @@ -881,7 +927,7 @@ void xics_migrate_irqs_away(void) virq, cpu); /* Reset affinity to all cpus */ - cpumask_setall(irq_desc[virq].affinity); + cpumask_setall(irq_to_desc(virq)->affinity); desc->chip->set_affinity(virq, cpu_all_mask); unlock: spin_unlock_irqrestore(&desc->lock, flags); |