diff options
Diffstat (limited to 'arch/powerpc')
-rw-r--r-- | arch/powerpc/kernel/Makefile | 5 | ||||
-rw-r--r-- | arch/powerpc/kernel/cpu_setup_power4.S | 233 | ||||
-rw-r--r-- | arch/powerpc/kernel/firmware.c | 45 | ||||
-rw-r--r-- | arch/powerpc/kernel/ioctl32.c | 49 | ||||
-rw-r--r-- | arch/powerpc/kernel/paca.c | 143 | ||||
-rw-r--r-- | arch/powerpc/kernel/ppc_ksyms.c | 2 | ||||
-rw-r--r-- | arch/powerpc/kernel/prom.c | 9 | ||||
-rw-r--r-- | arch/powerpc/kernel/prom_init.c | 170 | ||||
-rw-r--r-- | arch/powerpc/kernel/signal_32.c | 1 | ||||
-rw-r--r-- | arch/powerpc/kernel/traps.c | 4 | ||||
-rw-r--r-- | arch/powerpc/kernel/vio.c | 27 | ||||
-rw-r--r-- | arch/powerpc/lib/bitops.c | 2 | ||||
-rw-r--r-- | arch/powerpc/mm/pgtable_64.c | 7 | ||||
-rw-r--r-- | arch/powerpc/platforms/chrp/setup.c | 4 | ||||
-rw-r--r-- | arch/powerpc/platforms/iseries/setup.c | 14 | ||||
-rw-r--r-- | arch/powerpc/platforms/pseries/Makefile | 1 | ||||
-rw-r--r-- | arch/powerpc/platforms/pseries/eeh.c | 1197 | ||||
-rw-r--r-- | arch/powerpc/platforms/pseries/eeh_event.c | 155 | ||||
-rw-r--r-- | arch/powerpc/platforms/pseries/setup.c | 16 |
19 files changed, 2014 insertions, 70 deletions
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index cbdc14261655..92cfabf929bc 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -4,6 +4,7 @@ ifeq ($(CONFIG_PPC64),y) EXTRA_CFLAGS += -mno-minimal-toc +CFLAGS_ioctl32.o += -Ifs/ endif ifeq ($(CONFIG_PPC32),y) CFLAGS_prom_init.o += -fPIC @@ -13,7 +14,9 @@ endif obj-y := semaphore.o cputable.o ptrace.o syscalls.o \ irq.o signal_32.o pmc.o obj-$(CONFIG_PPC64) += setup_64.o binfmt_elf32.o sys_ppc32.o \ - signal_64.o ptrace32.o systbl.o + signal_64.o ptrace32.o systbl.o \ + paca.o ioctl32.o cpu_setup_power4.o \ + firmware.o obj-$(CONFIG_ALTIVEC) += vecemu.o vector.o obj-$(CONFIG_POWER4) += idle_power4.o obj-$(CONFIG_PPC_OF) += of_device.o diff --git a/arch/powerpc/kernel/cpu_setup_power4.S b/arch/powerpc/kernel/cpu_setup_power4.S new file mode 100644 index 000000000000..cca942fe6115 --- /dev/null +++ b/arch/powerpc/kernel/cpu_setup_power4.S @@ -0,0 +1,233 @@ +/* + * This file contains low level CPU setup functions. + * Copyright (C) 2003 Benjamin Herrenschmidt (benh@kernel.crashing.org) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <linux/config.h> +#include <asm/processor.h> +#include <asm/page.h> +#include <asm/cputable.h> +#include <asm/ppc_asm.h> +#include <asm/asm-offsets.h> +#include <asm/cache.h> + +_GLOBAL(__970_cpu_preinit) + /* + * Do nothing if not running in HV mode + */ + mfmsr r0 + rldicl. r0,r0,4,63 + beqlr + + /* + * Deal only with PPC970 and PPC970FX. + */ + mfspr r0,SPRN_PVR + srwi r0,r0,16 + cmpwi r0,0x39 + beq 1f + cmpwi r0,0x3c + beq 1f + cmpwi r0,0x44 + bnelr +1: + + /* Make sure HID4:rm_ci is off before MMU is turned off, that large + * pages are enabled with HID4:61 and clear HID5:DCBZ_size and + * HID5:DCBZ32_ill + */ + li r0,0 + mfspr r3,SPRN_HID4 + rldimi r3,r0,40,23 /* clear bit 23 (rm_ci) */ + rldimi r3,r0,2,61 /* clear bit 61 (lg_pg_en) */ + sync + mtspr SPRN_HID4,r3 + isync + sync + mfspr r3,SPRN_HID5 + rldimi r3,r0,6,56 /* clear bits 56 & 57 (DCBZ*) */ + sync + mtspr SPRN_HID5,r3 + isync + sync + + /* Setup some basic HID1 features */ + mfspr r0,SPRN_HID1 + li r3,0x1200 /* enable i-fetch cacheability */ + sldi r3,r3,44 /* and prefetch */ + or r0,r0,r3 + mtspr SPRN_HID1,r0 + mtspr SPRN_HID1,r0 + isync + + /* Clear HIOR */ + li r0,0 + sync + mtspr SPRN_HIOR,0 /* Clear interrupt prefix */ + isync + blr + +_GLOBAL(__setup_cpu_power4) + blr + +_GLOBAL(__setup_cpu_be) + /* Set large page sizes LP=0: 16MB, LP=1: 64KB */ + addi r3, 0, 0 + ori r3, r3, HID6_LB + sldi r3, r3, 32 + nor r3, r3, r3 + mfspr r4, SPRN_HID6 + and r4, r4, r3 + addi r3, 0, 0x02000 + sldi r3, r3, 32 + or r4, r4, r3 + mtspr SPRN_HID6, r4 + blr + +_GLOBAL(__setup_cpu_ppc970) + mfspr r0,SPRN_HID0 + li r11,5 /* clear DOZE and SLEEP */ + rldimi r0,r11,52,8 /* set NAP and DPM */ + mtspr SPRN_HID0,r0 + mfspr r0,SPRN_HID0 + mfspr r0,SPRN_HID0 + mfspr r0,SPRN_HID0 + mfspr r0,SPRN_HID0 + mfspr r0,SPRN_HID0 + mfspr r0,SPRN_HID0 + sync + isync + blr + +/* Definitions for the table use to save CPU states */ +#define CS_HID0 0 +#define CS_HID1 8 +#define CS_HID4 16 +#define CS_HID5 24 +#define CS_SIZE 32 + + .data + .balign L1_CACHE_BYTES,0 +cpu_state_storage: + .space CS_SIZE + .balign L1_CACHE_BYTES,0 + .text + +/* Called in normal context to backup CPU 0 state. This + * does not include cache settings. This function is also + * called for machine sleep. This does not include the MMU + * setup, BATs, etc... but rather the "special" registers + * like HID0, HID1, HID4, etc... + */ +_GLOBAL(__save_cpu_setup) + /* Some CR fields are volatile, we back it up all */ + mfcr r7 + + /* Get storage ptr */ + LOADADDR(r5,cpu_state_storage) + + /* We only deal with 970 for now */ + mfspr r0,SPRN_PVR + srwi r0,r0,16 + cmpwi r0,0x39 + beq 1f + cmpwi r0,0x3c + beq 1f + cmpwi r0,0x44 + bne 2f + +1: /* Save HID0,1,4 and 5 */ + mfspr r3,SPRN_HID0 + std r3,CS_HID0(r5) + mfspr r3,SPRN_HID1 + std r3,CS_HID1(r5) + mfspr r3,SPRN_HID4 + std r3,CS_HID4(r5) + mfspr r3,SPRN_HID5 + std r3,CS_HID5(r5) + +2: + mtcr r7 + blr + +/* Called with no MMU context (typically MSR:IR/DR off) to + * restore CPU state as backed up by the previous + * function. This does not include cache setting + */ +_GLOBAL(__restore_cpu_setup) + /* Get storage ptr (FIXME when using anton reloc as we + * are running with translation disabled here + */ + LOADADDR(r5,cpu_state_storage) + + /* We only deal with 970 for now */ + mfspr r0,SPRN_PVR + srwi r0,r0,16 + cmpwi r0,0x39 + beq 1f + cmpwi r0,0x3c + beq 1f + cmpwi r0,0x44 + bnelr + +1: /* Before accessing memory, we make sure rm_ci is clear */ + li r0,0 + mfspr r3,SPRN_HID4 + rldimi r3,r0,40,23 /* clear bit 23 (rm_ci) */ + sync + mtspr SPRN_HID4,r3 + isync + sync + + /* Clear interrupt prefix */ + li r0,0 + sync + mtspr SPRN_HIOR,0 + isync + + /* Restore HID0 */ + ld r3,CS_HID0(r5) + sync + isync + mtspr SPRN_HID0,r3 + mfspr r3,SPRN_HID0 + mfspr r3,SPRN_HID0 + mfspr r3,SPRN_HID0 + mfspr r3,SPRN_HID0 + mfspr r3,SPRN_HID0 + mfspr r3,SPRN_HID0 + sync + isync + + /* Restore HID1 */ + ld r3,CS_HID1(r5) + sync + isync + mtspr SPRN_HID1,r3 + mtspr SPRN_HID1,r3 + sync + isync + + /* Restore HID4 */ + ld r3,CS_HID4(r5) + sync + isync + mtspr SPRN_HID4,r3 + sync + isync + + /* Restore HID5 */ + ld r3,CS_HID5(r5) + sync + isync + mtspr SPRN_HID5,r3 + sync + isync + blr + diff --git a/arch/powerpc/kernel/firmware.c b/arch/powerpc/kernel/firmware.c new file mode 100644 index 000000000000..65eae752a527 --- /dev/null +++ b/arch/powerpc/kernel/firmware.c @@ -0,0 +1,45 @@ +/* + * Extracted from cputable.c + * + * Copyright (C) 2001 Ben. Herrenschmidt (benh@kernel.crashing.org) + * + * Modifications for ppc64: + * Copyright (C) 2003 Dave Engebretsen <engebret@us.ibm.com> + * Copyright (C) 2005 Stephen Rothwell, IBM Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/config.h> + +#include <asm/firmware.h> + +unsigned long ppc64_firmware_features; + +#ifdef CONFIG_PPC_PSERIES +firmware_feature_t firmware_features_table[FIRMWARE_MAX_FEATURES] = { + {FW_FEATURE_PFT, "hcall-pft"}, + {FW_FEATURE_TCE, "hcall-tce"}, + {FW_FEATURE_SPRG0, "hcall-sprg0"}, + {FW_FEATURE_DABR, "hcall-dabr"}, + {FW_FEATURE_COPY, "hcall-copy"}, + {FW_FEATURE_ASR, "hcall-asr"}, + {FW_FEATURE_DEBUG, "hcall-debug"}, + {FW_FEATURE_PERF, "hcall-perf"}, + {FW_FEATURE_DUMP, "hcall-dump"}, + {FW_FEATURE_INTERRUPT, "hcall-interrupt"}, + {FW_FEATURE_MIGRATE, "hcall-migrate"}, + {FW_FEATURE_PERFMON, "hcall-perfmon"}, + {FW_FEATURE_CRQ, "hcall-crq"}, + {FW_FEATURE_VIO, "hcall-vio"}, + {FW_FEATURE_RDMA, "hcall-rdma"}, + {FW_FEATURE_LLAN, "hcall-lLAN"}, + {FW_FEATURE_BULK, "hcall-bulk"}, + {FW_FEATURE_XDABR, "hcall-xdabr"}, + {FW_FEATURE_MULTITCE, "hcall-multi-tce"}, + {FW_FEATURE_SPLPAR, "hcall-splpar"}, +}; +#endif diff --git a/arch/powerpc/kernel/ioctl32.c b/arch/powerpc/kernel/ioctl32.c new file mode 100644 index 000000000000..3fa6a93adbd0 --- /dev/null +++ b/arch/powerpc/kernel/ioctl32.c @@ -0,0 +1,49 @@ +/* + * ioctl32.c: Conversion between 32bit and 64bit native ioctls. + * + * Based on sparc64 ioctl32.c by: + * + * Copyright (C) 1997-2000 Jakub Jelinek (jakub@redhat.com) + * Copyright (C) 1998 Eddie C. Dost (ecd@skynet.be) + * + * ppc64 changes: + * + * Copyright (C) 2000 Ken Aaker (kdaaker@rchland.vnet.ibm.com) + * Copyright (C) 2001 Anton Blanchard (antonb@au.ibm.com) + * + * These routines maintain argument size conversion between 32bit and 64bit + * ioctls. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define INCLUDES +#include "compat_ioctl.c" +#include <linux/syscalls.h> + +#define CODE +#include "compat_ioctl.c" + +#define HANDLE_IOCTL(cmd,handler) { cmd, (ioctl_trans_handler_t)handler, NULL }, +#define COMPATIBLE_IOCTL(cmd) HANDLE_IOCTL(cmd,sys_ioctl) + +#define IOCTL_TABLE_START \ + struct ioctl_trans ioctl_start[] = { +#define IOCTL_TABLE_END \ + }; + +IOCTL_TABLE_START +#include <linux/compat_ioctl.h> +#define DECLARES +#include "compat_ioctl.c" + +/* Little p (/dev/rtc, /dev/envctrl, etc.) */ +COMPATIBLE_IOCTL(_IOR('p', 20, int[7])) /* RTCGET */ +COMPATIBLE_IOCTL(_IOW('p', 21, int[7])) /* RTCSET */ + +IOCTL_TABLE_END + +int ioctl_table_size = ARRAY_SIZE(ioctl_start); diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c new file mode 100644 index 000000000000..179948eb0580 --- /dev/null +++ b/arch/powerpc/kernel/paca.c @@ -0,0 +1,143 @@ +/* + * c 2001 PPC 64 Team, IBM Corp + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/config.h> +#include <linux/types.h> +#include <linux/threads.h> +#include <linux/module.h> + +#include <asm/processor.h> +#include <asm/ptrace.h> +#include <asm/page.h> + +#include <asm/lppaca.h> +#include <asm/iseries/it_lp_queue.h> +#include <asm/paca.h> + +static union { + struct systemcfg data; + u8 page[PAGE_SIZE]; +} systemcfg_store __attribute__((__section__(".data.page.aligned"))); +struct systemcfg *systemcfg = &systemcfg_store.data; +EXPORT_SYMBOL(systemcfg); + + +/* This symbol is provided by the linker - let it fill in the paca + * field correctly */ +extern unsigned long __toc_start; + +/* The Paca is an array with one entry per processor. Each contains an + * lppaca, which contains the information shared between the + * hypervisor and Linux. Each also contains an ItLpRegSave area which + * is used by the hypervisor to save registers. + * On systems with hardware multi-threading, there are two threads + * per processor. The Paca array must contain an entry for each thread. + * The VPD Areas will give a max logical processors = 2 * max physical + * processors. The processor VPD array needs one entry per physical + * processor (not thread). + */ +#define PACA_INIT_COMMON(number, start, asrr, asrv) \ + .lock_token = 0x8000, \ + .paca_index = (number), /* Paca Index */ \ + .default_decr = 0x00ff0000, /* Initial Decr */ \ + .kernel_toc = (unsigned long)(&__toc_start) + 0x8000UL, \ + .stab_real = (asrr), /* Real pointer to segment table */ \ + .stab_addr = (asrv), /* Virt pointer to segment table */ \ + .cpu_start = (start), /* Processor start */ \ + .hw_cpu_id = 0xffff, \ + .lppaca = { \ + .desc = 0xd397d781, /* "LpPa" */ \ + .size = sizeof(struct lppaca), \ + .dyn_proc_status = 2, \ + .decr_val = 0x00ff0000, \ + .fpregs_in_use = 1, \ + .end_of_quantum = 0xfffffffffffffffful, \ + .slb_count = 64, \ + .vmxregs_in_use = 0, \ + }, \ + +#ifdef CONFIG_PPC_ISERIES +#define PACA_INIT_ISERIES(number) \ + .lppaca_ptr = &paca[number].lppaca, \ + .reg_save_ptr = &paca[number].reg_save, \ + .reg_save = { \ + .xDesc = 0xd397d9e2, /* "LpRS" */ \ + .xSize = sizeof(struct ItLpRegSave) \ + } + +#define PACA_INIT(number) \ +{ \ + PACA_INIT_COMMON(number, 0, 0, 0) \ + PACA_INIT_ISERIES(number) \ +} + +#define BOOTCPU_PACA_INIT(number) \ +{ \ + PACA_INIT_COMMON(number, 1, 0, (u64)&initial_stab) \ + PACA_INIT_ISERIES(number) \ +} + +#else +#define PACA_INIT(number) \ +{ \ + PACA_INIT_COMMON(number, 0, 0, 0) \ +} + +#define BOOTCPU_PACA_INIT(number) \ +{ \ + PACA_INIT_COMMON(number, 1, STAB0_PHYS_ADDR, (u64)&initial_stab) \ +} +#endif + +struct paca_struct paca[] = { + BOOTCPU_PACA_INIT(0), +#if NR_CPUS > 1 + PACA_INIT( 1), PACA_INIT( 2), PACA_INIT( 3), +#if NR_CPUS > 4 + PACA_INIT( 4), PACA_INIT( 5), PACA_INIT( 6), PACA_INIT( 7), +#if NR_CPUS > 8 + PACA_INIT( 8), PACA_INIT( 9), PACA_INIT( 10), PACA_INIT( 11), + PACA_INIT( 12), PACA_INIT( 13), PACA_INIT( 14), PACA_INIT( 15), + PACA_INIT( 16), PACA_INIT( 17), PACA_INIT( 18), PACA_INIT( 19), + PACA_INIT( 20), PACA_INIT( 21), PACA_INIT( 22), PACA_INIT( 23), + PACA_INIT( 24), PACA_INIT( 25), PACA_INIT( 26), PACA_INIT( 27), + PACA_INIT( 28), PACA_INIT( 29), PACA_INIT( 30), PACA_INIT( 31), +#if NR_CPUS > 32 + PACA_INIT( 32), PACA_INIT( 33), PACA_INIT( 34), PACA_INIT( 35), + PACA_INIT( 36), PACA_INIT( 37), PACA_INIT( 38), PACA_INIT( 39), + PACA_INIT( 40), PACA_INIT( 41), PACA_INIT( 42), PACA_INIT( 43), + PACA_INIT( 44), PACA_INIT( 45), PACA_INIT( 46), PACA_INIT( 47), + PACA_INIT( 48), PACA_INIT( 49), PACA_INIT( 50), PACA_INIT( 51), + PACA_INIT( 52), PACA_INIT( 53), PACA_INIT( 54), PACA_INIT( 55), + PACA_INIT( 56), PACA_INIT( 57), PACA_INIT( 58), PACA_INIT( 59), + PACA_INIT( 60), PACA_INIT( 61), PACA_INIT( 62), PACA_INIT( 63), +#if NR_CPUS > 64 + PACA_INIT( 64), PACA_INIT( 65), PACA_INIT( 66), PACA_INIT( 67), + PACA_INIT( 68), PACA_INIT( 69), PACA_INIT( 70), PACA_INIT( 71), + PACA_INIT( 72), PACA_INIT( 73), PACA_INIT( 74), PACA_INIT( 75), + PACA_INIT( 76), PACA_INIT( 77), PACA_INIT( 78), PACA_INIT( 79), + PACA_INIT( 80), PACA_INIT( 81), PACA_INIT( 82), PACA_INIT( 83), + PACA_INIT( 84), PACA_INIT( 85), PACA_INIT( 86), PACA_INIT( 87), + PACA_INIT( 88), PACA_INIT( 89), PACA_INIT( 90), PACA_INIT( 91), + PACA_INIT( 92), PACA_INIT( 93), PACA_INIT( 94), PACA_INIT( 95), + PACA_INIT( 96), PACA_INIT( 97), PACA_INIT( 98), PACA_INIT( 99), + PACA_INIT(100), PACA_INIT(101), PACA_INIT(102), PACA_INIT(103), + PACA_INIT(104), PACA_INIT(105), PACA_INIT(106), PACA_INIT(107), + PACA_INIT(108), PACA_INIT(109), PACA_INIT(110), PACA_INIT(111), + PACA_INIT(112), PACA_INIT(113), PACA_INIT(114), PACA_INIT(115), + PACA_INIT(116), PACA_INIT(117), PACA_INIT(118), PACA_INIT(119), + PACA_INIT(120), PACA_INIT(121), PACA_INIT(122), PACA_INIT(123), + PACA_INIT(124), PACA_INIT(125), PACA_INIT(126), PACA_INIT(127), +#endif +#endif +#endif +#endif +#endif +}; +EXPORT_SYMBOL(paca); diff --git a/arch/powerpc/kernel/ppc_ksyms.c b/arch/powerpc/kernel/ppc_ksyms.c index 47d6f7e2ea9f..5d9fd0369aad 100644 --- a/arch/powerpc/kernel/ppc_ksyms.c +++ b/arch/powerpc/kernel/ppc_ksyms.c @@ -44,6 +44,7 @@ #include <asm/cputable.h> #include <asm/btext.h> #include <asm/div64.h> +#include <asm/signal.h> #ifdef CONFIG_8xx #include <asm/commproc.h> @@ -56,7 +57,6 @@ extern void machine_check_exception(struct pt_regs *regs); extern void alignment_exception(struct pt_regs *regs); extern void program_check_exception(struct pt_regs *regs); extern void single_step_exception(struct pt_regs *regs); -extern int do_signal(sigset_t *, struct pt_regs *); extern int pmac_newworld; extern int sys_sigreturn(struct pt_regs *regs); diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index f645adb57534..5af39f866735 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -1264,7 +1264,14 @@ static int __init early_init_dt_scan_memory(unsigned long node, unsigned long l; /* We are scanning "memory" nodes only */ - if (type == NULL || strcmp(type, "memory") != 0) + if (type == NULL) { + /* + * The longtrail doesn't have a device_type on the + * /memory node, so look for the node called /memory@0. + */ + if (depth != 1 || strcmp(uname, "memory@0") != 0) + return 0; + } else if (strcmp(type, "memory") != 0) return 0; reg = (cell_t *)of_get_flat_dt_prop(node, "reg", &l); diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index 58f0917bd6b6..09db1bb9ec91 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -94,11 +94,17 @@ extern const struct linux_logo logo_linux_clut224; #ifdef CONFIG_PPC64 #define RELOC(x) (*PTRRELOC(&(x))) #define ADDR(x) (u32) add_reloc_offset((unsigned long)(x)) +#define OF_WORKAROUNDS 0 #else #define RELOC(x) (x) #define ADDR(x) (u32) (x) +#define OF_WORKAROUNDS of_workarounds +int of_workarounds; #endif +#define OF_WA_CLAIM 1 /* do phys/virt claim separately, then map */ +#define OF_WA_LONGTRAIL 2 /* work around longtrail bugs */ + #define PROM_BUG() do { \ prom_printf("kernel BUG at %s line 0x%x!\n", \ RELOC(__FILE__), __LINE__); \ @@ -128,10 +134,11 @@ struct prom_args { struct prom_t { ihandle root; - ihandle chosen; + phandle chosen; int cpu; ihandle stdout; ihandle mmumap; + ihandle memory; }; struct mem_map_entry { @@ -360,16 +367,36 @@ static void __init prom_printf(const char *format, ...) static unsigned int __init prom_claim(unsigned long virt, unsigned long size, unsigned long align) { - int ret; struct prom_t *_prom = &RELOC(prom); - ret = call_prom("claim", 3, 1, (prom_arg_t)virt, (prom_arg_t)size, - (prom_arg_t)align); - if (ret != -1 && _prom->mmumap != 0) - /* old pmacs need us to map as well */ + if (align == 0 && (OF_WORKAROUNDS & OF_WA_CLAIM)) { + /* + * Old OF requires we claim physical and virtual separately + * and then map explicitly (assuming virtual mode) + */ + int ret; + prom_arg_t result; + + ret = call_prom_ret("call-method", 5, 2, &result, + ADDR("claim"), _prom->memory, + align, size, virt); + if (ret != 0 || result == -1) + return -1; + ret = call_prom_ret("call-method", 5, 2, &result, + ADDR("claim"), _prom->mmumap, + align, size, virt); + if (ret != 0) { + call_prom("call-method", 4, 1, ADDR("release"), + _prom->memory, size, virt); + return -1; + } + /* the 0x12 is M (coherence) + PP == read/write */ call_prom("call-method", 6, 1, - ADDR("map"), _prom->mmumap, 0, size, virt, virt); - return ret; + ADDR("map"), _prom->mmumap, 0x12, size, virt, virt); + return virt; + } + return call_prom("claim", 3, 1, (prom_arg_t)virt, (prom_arg_t)size, + (prom_arg_t)align); } static void __init __attribute__((noreturn)) prom_panic(const char *reason) @@ -415,11 +442,52 @@ static int inline prom_getproplen(phandle node, const char *pname) return call_prom("getproplen", 2, 1, node, ADDR(pname)); } -static int inline prom_setprop(phandle node, const char *pname, - void *value, size_t valuelen) +static void add_string(char **str, const char *q) { - return call_prom("setprop", 4, 1, node, ADDR(pname), - (u32)(unsigned long) value, (u32) valuelen); + char *p = *str; + + while (*q) + *p++ = *q++; + *p++ = ' '; + *str = p; +} + +static char *tohex(unsigned int x) +{ + static char digits[] = "0123456789abcdef"; + static char result[9]; + int i; + + result[8] = 0; + i = 8; + do { + --i; + result[i] = digits[x & 0xf]; + x >>= 4; + } while (x != 0 && i > 0); + return &result[i]; +} + +static int __init prom_setprop(phandle node, const char *nodename, + const char *pname, void *value, size_t valuelen) +{ + char cmd[256], *p; + + if (!(OF_WORKAROUNDS & OF_WA_LONGTRAIL)) + return call_prom("setprop", 4, 1, node, ADDR(pname), + (u32)(unsigned long) value, (u32) valuelen); + + /* gah... setprop doesn't work on longtrail, have to use interpret */ + p = cmd; + add_string(&p, "dev"); + add_string(&p, nodename); + add_string(&p, tohex((u32)(unsigned long) value)); + add_string(&p, tohex(valuelen)); + add_string(&p, tohex(ADDR(pname))); + add_string(&p, tohex(strlen(RELOC(pname)))); + add_string(&p, "property"); + *p = 0; + return call_prom("interpret", 1, 1, (u32)(unsigned long) cmd); } /* We can't use the standard versions because of RELOC headaches. */ @@ -980,7 +1048,7 @@ static void __init prom_instantiate_rtas(void) rtas_inst = call_prom("open", 1, 1, ADDR("/rtas")); if (!IHANDLE_VALID(rtas_inst)) { - prom_printf("opening rtas package failed"); + prom_printf("opening rtas package failed (%x)\n", rtas_inst); return; } @@ -988,7 +1056,7 @@ static void __init prom_instantiate_rtas(void) if (call_prom_ret("call-method", 3, 2, &entry, ADDR("instantiate-rtas"), - rtas_inst, base) == PROM_ERROR + rtas_inst, base) != 0 || entry == 0) { prom_printf(" failed\n"); return; @@ -997,8 +1065,10 @@ static void __init prom_instantiate_rtas(void) reserve_mem(base, size); - prom_setprop(rtas_node, "linux,rtas-base", &base, sizeof(base)); - prom_setprop(rtas_node, "linux,rtas-entry", &entry, sizeof(entry)); + prom_setprop(rtas_node, "/rtas", "linux,rtas-base", + &base, sizeof(base)); + prom_setprop(rtas_node, "/rtas", "linux,rtas-entry", + &entry, sizeof(entry)); prom_debug("rtas base = 0x%x\n", base); prom_debug("rtas entry = 0x%x\n", entry); @@ -1089,10 +1159,6 @@ static void __init prom_initialize_tce_table(void) if (base < local_alloc_bottom) local_alloc_bottom = base; - /* Save away the TCE table attributes for later use. */ - prom_setprop(node, "linux,tce-base", &base, sizeof(base)); - prom_setprop(node, "linux,tce-size", &minsize, sizeof(minsize)); - /* It seems OF doesn't null-terminate the path :-( */ memset(path, 0, sizeof(path)); /* Call OF to setup the TCE hardware */ @@ -1101,6 +1167,10 @@ static void __init prom_initialize_tce_table(void) prom_printf("package-to-path failed\n"); } + /* Save away the TCE table attributes for later use. */ + prom_setprop(node, path, "linux,tce-base", &base, sizeof(base)); + prom_setprop(node, path, "linux,tce-size", &minsize, sizeof(minsize)); + prom_debug("TCE table: %s\n", path); prom_debug("\tnode = 0x%x\n", node); prom_debug("\tbase = 0x%x\n", base); @@ -1342,6 +1412,7 @@ static void __init prom_init_client_services(unsigned long pp) /* * For really old powermacs, we need to map things we claim. * For that, we need the ihandle of the mmu. + * Also, on the longtrail, we need to work around other bugs. */ static void __init prom_find_mmu(void) { @@ -1355,12 +1426,19 @@ static void __init prom_find_mmu(void) if (prom_getprop(oprom, "model", version, sizeof(version)) <= 0) return; version[sizeof(version) - 1] = 0; - prom_printf("OF version is '%s'\n", version); /* XXX might need to add other versions here */ - if (strcmp(version, "Open Firmware, 1.0.5") != 0) + if (strcmp(version, "Open Firmware, 1.0.5") == 0) + of_workarounds = OF_WA_CLAIM; + else if (strncmp(version, "FirmWorks,3.", 12) == 0) { + of_workarounds = OF_WA_CLAIM | OF_WA_LONGTRAIL; + call_prom("interpret", 1, 1, "dev /memory 0 to allow-reclaim"); + } else return; + _prom->memory = call_prom("open", 1, 1, ADDR("/memory")); prom_getprop(_prom->chosen, "mmu", &_prom->mmumap, sizeof(_prom->mmumap)); + if (!IHANDLE_VALID(_prom->memory) || !IHANDLE_VALID(_prom->mmumap)) + of_workarounds &= ~OF_WA_CLAIM; /* hmmm */ } #else #define prom_find_mmu() @@ -1382,16 +1460,17 @@ static void __init prom_init_stdout(void) memset(path, 0, 256); call_prom("instance-to-path", 3, 1, _prom->stdout, path, 255); val = call_prom("instance-to-package", 1, 1, _prom->stdout); - prom_setprop(_prom->chosen, "linux,stdout-package", &val, sizeof(val)); + prom_setprop(_prom->chosen, "/chosen", "linux,stdout-package", + &val, sizeof(val)); prom_printf("OF stdout device is: %s\n", RELOC(of_stdout_device)); - prom_setprop(_prom->chosen, "linux,stdout-path", - RELOC(of_stdout_device), strlen(RELOC(of_stdout_device))+1); + prom_setprop(_prom->chosen, "/chosen", "linux,stdout-path", + path, strlen(path) + 1); /* If it's a display, note it */ memset(type, 0, sizeof(type)); prom_getprop(val, "device_type", type, sizeof(type)); if (strcmp(type, RELOC("display")) == 0) - prom_setprop(val, "linux,boot-display", NULL, 0); + prom_setprop(val, path, "linux,boot-display", NULL, 0); } static void __init prom_close_stdin(void) @@ -1514,7 +1593,7 @@ static void __init prom_check_displays(void) /* Success */ prom_printf("done\n"); - prom_setprop(node, "linux,opened", NULL, 0); + prom_setprop(node, path, "linux,opened", NULL, 0); /* Setup a usable color table when the appropriate * method is available. Should update this to set-colors */ @@ -1884,9 +1963,11 @@ static void __init fixup_device_tree(void) /* interrupt on this revision of u3 is number 0 and level */ interrupts[0] = 0; interrupts[1] = 1; - prom_setprop(i2c, "interrupts", &interrupts, sizeof(interrupts)); + prom_setprop(i2c, "/u3@0,f8000000/i2c@f8001000", "interrupts", + &interrupts, sizeof(interrupts)); parent = (u32)mpic; - prom_setprop(i2c, "interrupt-parent", &parent, sizeof(parent)); + prom_setprop(i2c, "/u3@0,f8000000/i2c@f8001000", "interrupt-parent", + &parent, sizeof(parent)); #endif } @@ -1922,11 +2003,11 @@ static void __init prom_check_initrd(unsigned long r3, unsigned long r4) RELOC(prom_initrd_end) = RELOC(prom_initrd_start) + r4; val = RELOC(prom_initrd_start); - prom_setprop(_prom->chosen, "linux,initrd-start", &val, - sizeof(val)); + prom_setprop(_prom->chosen, "/chosen", "linux,initrd-start", + &val, sizeof(val)); val = RELOC(prom_initrd_end); - prom_setprop(_prom->chosen, "linux,initrd-end", &val, - sizeof(val)); + prom_setprop(_prom->chosen, "/chosen", "linux,initrd-end", + &val, sizeof(val)); reserve_mem(RELOC(prom_initrd_start), RELOC(prom_initrd_end) - RELOC(prom_initrd_start)); @@ -1969,14 +2050,15 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4, prom_init_client_services(pp); /* - * Init prom stdout device + * See if this OF is old enough that we need to do explicit maps + * and other workarounds */ - prom_init_stdout(); + prom_find_mmu(); /* - * See if this OF is old enough that we need to do explicit maps + * Init prom stdout device */ - prom_find_mmu(); + prom_init_stdout(); /* * Check for an initrd @@ -1989,7 +2071,7 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4, */ RELOC(of_platform) = prom_find_machine_type(); getprop_rval = RELOC(of_platform); - prom_setprop(_prom->chosen, "linux,platform", + prom_setprop(_prom->chosen, "/chosen", "linux,platform", &getprop_rval, sizeof(getprop_rval)); #ifdef CONFIG_PPC_PSERIES @@ -2050,21 +2132,23 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4, * Fill in some infos for use by the kernel later on */ if (RELOC(prom_memory_limit)) - prom_setprop(_prom->chosen, "linux,memory-limit", + prom_setprop(_prom->chosen, "/chosen", "linux,memory-limit", &RELOC(prom_memory_limit), sizeof(prom_memory_limit)); #ifdef CONFIG_PPC64 if (RELOC(ppc64_iommu_off)) - prom_setprop(_prom->chosen, "linux,iommu-off", NULL, 0); + prom_setprop(_prom->chosen, "/chosen", "linux,iommu-off", + NULL, 0); if (RELOC(iommu_force_on)) - prom_setprop(_prom->chosen, "linux,iommu-force-on", NULL, 0); + prom_setprop(_prom->chosen, "/chosen", "linux,iommu-force-on", + NULL, 0); if (RELOC(prom_tce_alloc_start)) { - prom_setprop(_prom->chosen, "linux,tce-alloc-start", + prom_setprop(_prom->chosen, "/chosen", "linux,tce-alloc-start", &RELOC(prom_tce_alloc_start), sizeof(prom_tce_alloc_start)); - prom_setprop(_prom->chosen, "linux,tce-alloc-end", + prom_setprop(_prom->chosen, "/chosen", "linux,tce-alloc-end", &RELOC(prom_tce_alloc_end), sizeof(prom_tce_alloc_end)); } diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c index 081d931eae48..a7c4515f320f 100644 --- a/arch/powerpc/kernel/signal_32.c +++ b/arch/powerpc/kernel/signal_32.c @@ -42,6 +42,7 @@ #include <asm/uaccess.h> #include <asm/cacheflush.h> +#include <asm/sigcontext.h> #ifdef CONFIG_PPC64 #include "ppc32.h" #include <asm/unistd.h> diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 32f215825e8d..0578f8387603 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -887,10 +887,6 @@ void altivec_unavailable_exception(struct pt_regs *regs) die("Unrecoverable VMX/Altivec Unavailable Exception", regs, SIGABRT); } -#ifdef CONFIG_PPC64 -extern perf_irq_t perf_irq; -#endif - #if defined(CONFIG_PPC64) || defined(CONFIG_E500) void performance_monitor_exception(struct pt_regs *regs) { diff --git a/arch/powerpc/kernel/vio.c b/arch/powerpc/kernel/vio.c index 97082a4203ad..71a6addf9f7f 100644 --- a/arch/powerpc/kernel/vio.c +++ b/arch/powerpc/kernel/vio.c @@ -21,6 +21,7 @@ #include <asm/iommu.h> #include <asm/dma.h> #include <asm/vio.h> +#include <asm/prom.h> static const struct vio_device_id *vio_match_device( const struct vio_device_id *, const struct vio_dev *); @@ -265,7 +266,33 @@ static int vio_bus_match(struct device *dev, struct device_driver *drv) return (ids != NULL) && (vio_match_device(ids, vio_dev) != NULL); } +static int vio_hotplug(struct device *dev, char **envp, int num_envp, + char *buffer, int buffer_size) +{ + const struct vio_dev *vio_dev = to_vio_dev(dev); + char *cp; + int length; + + if (!num_envp) + return -ENOMEM; + + if (!vio_dev->dev.platform_data) + return -ENODEV; + cp = (char *)get_property(vio_dev->dev.platform_data, "compatible", &length); + if (!cp) + return -ENODEV; + + envp[0] = buffer; + length = scnprintf(buffer, buffer_size, "MODALIAS=vio:T%sS%s", + vio_dev->type, cp); + if (buffer_size - length <= 0) + return -ENOMEM; + envp[1] = NULL; + return 0; +} + struct bus_type vio_bus_type = { .name = "vio", + .hotplug = vio_hotplug, .match = vio_bus_match, }; diff --git a/arch/powerpc/lib/bitops.c b/arch/powerpc/lib/bitops.c index b67ce3004ebf..f68ad71a0187 100644 --- a/arch/powerpc/lib/bitops.c +++ b/arch/powerpc/lib/bitops.c @@ -41,7 +41,7 @@ unsigned long find_next_bit(const unsigned long *addr, unsigned long size, tmp = *p; found_first: - tmp &= (~0UL >> (64 - size)); + tmp &= (~0UL >> (BITS_PER_LONG - size)); if (tmp == 0UL) /* Are any bits set? */ return result + size; /* Nope. */ found_middle: diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 900842451bd3..c7f7bb6f30b3 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -122,8 +122,11 @@ static int map_io_page(unsigned long ea, unsigned long pa, int flags) * */ if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, flags, - mmu_virtual_psize)) - panic("Can't map bolted IO mapping"); + mmu_virtual_psize)) { + printk(KERN_ERR "Failed to do bolted mapping IO " + "memory at %016lx !\n", pa); + return -ENOMEM; + } } return 0; } diff --git a/arch/powerpc/platforms/chrp/setup.c b/arch/powerpc/platforms/chrp/setup.c index ecd32d5d85f4..4099ddab9205 100644 --- a/arch/powerpc/platforms/chrp/setup.c +++ b/arch/powerpc/platforms/chrp/setup.c @@ -361,7 +361,9 @@ static void __init chrp_find_openpic(void) printk(KERN_INFO "OpenPIC at %lx\n", opaddr); irq_count = NR_IRQS - NUM_ISA_INTERRUPTS - 4; /* leave room for IPIs */ - prom_get_irq_senses(init_senses, NUM_8259_INTERRUPTS, NR_IRQS - 4); + prom_get_irq_senses(init_senses, NUM_ISA_INTERRUPTS, NR_IRQS - 4); + /* i8259 cascade is always positive level */ + init_senses[0] = IRQ_SENSE_LEVEL | IRQ_POLARITY_POSITIVE; iranges = (unsigned int *) get_property(np, "interrupt-ranges", &len); if (iranges == NULL) diff --git a/arch/powerpc/platforms/iseries/setup.c b/arch/powerpc/platforms/iseries/setup.c index d3e4bf756c83..7f8f0cda6a74 100644 --- a/arch/powerpc/platforms/iseries/setup.c +++ b/arch/powerpc/platforms/iseries/setup.c @@ -694,20 +694,19 @@ static void iseries_shared_idle(void) if (hvlpevent_is_pending()) process_iSeries_events(); + preempt_enable_no_resched(); schedule(); + preempt_disable(); } } static void iseries_dedicated_idle(void) { long oldval; + set_thread_flag(TIF_POLLING_NRFLAG); while (1) { - oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED); - - if (!oldval) { - set_thread_flag(TIF_POLLING_NRFLAG); - + if (!need_resched()) { while (!need_resched()) { ppc64_runlatch_off(); HMT_low(); @@ -720,13 +719,12 @@ static void iseries_dedicated_idle(void) } HMT_medium(); - clear_thread_flag(TIF_POLLING_NRFLAG); - } else { - set_need_resched(); } ppc64_runlatch_on(); + preempt_enable_no_resched(); schedule(); + preempt_disable(); } } diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile index b9938fece781..27515476ad6c 100644 --- a/arch/powerpc/platforms/pseries/Makefile +++ b/arch/powerpc/platforms/pseries/Makefile @@ -3,3 +3,4 @@ obj-y := pci.o lpar.o hvCall.o nvram.o reconfig.o \ obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_IBMVIO) += vio.o obj-$(CONFIG_XICS) += xics.o +obj-$(CONFIG_EEH) += eeh.o eeh_event.o diff --git a/arch/powerpc/platforms/pseries/eeh.c b/arch/powerpc/platforms/pseries/eeh.c new file mode 100644 index 000000000000..b760836bb9d1 --- /dev/null +++ b/arch/powerpc/platforms/pseries/eeh.c @@ -0,0 +1,1197 @@ +/* + * eeh.c + * Copyright (C) 2001 Dave Engebretsen & Todd Inglett IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/delay.h> +#include <linux/init.h> +#include <linux/list.h> +#include <linux/pci.h> +#include <linux/proc_fs.h> +#include <linux/rbtree.h> +#include <linux/seq_file.h> +#include <linux/spinlock.h> +#include <asm/atomic.h> +#include <asm/eeh.h> +#include <asm/eeh_event.h> +#include <asm/io.h> +#include <asm/machdep.h> +#include <asm/ppc-pci.h> +#include <asm/rtas.h> +#include <asm/systemcfg.h> + +#undef DEBUG + +/** Overview: + * EEH, or "Extended Error Handling" is a PCI bridge technology for + * dealing with PCI bus errors that can't be dealt with within the + * usual PCI framework, except by check-stopping the CPU. Systems + * that are designed for high-availability/reliability cannot afford + * to crash due to a "mere" PCI error, thus the need for EEH. + * An EEH-capable bridge operates by converting a detected error + * into a "slot freeze", taking the PCI adapter off-line, making + * the slot behave, from the OS'es point of view, as if the slot + * were "empty": all reads return 0xff's and all writes are silently + * ignored. EEH slot isolation events can be triggered by parity + * errors on the address or data busses (e.g. during posted writes), + * which in turn might be caused by low voltage on the bus, dust, + * vibration, humidity, radioactivity or plain-old failed hardware. + * + * Note, however, that one of the leading causes of EEH slot + * freeze events are buggy device drivers, buggy device microcode, + * or buggy device hardware. This is because any attempt by the + * device to bus-master data to a memory address that is not + * assigned to the device will trigger a slot freeze. (The idea + * is to prevent devices-gone-wild from corrupting system memory). + * Buggy hardware/drivers will have a miserable time co-existing + * with EEH. + * + * Ideally, a PCI device driver, when suspecting that an isolation + * event has occured (e.g. by reading 0xff's), will then ask EEH + * whether this is the case, and then take appropriate steps to + * reset the PCI slot, the PCI device, and then resume operations. + * However, until that day, the checking is done here, with the + * eeh_check_failure() routine embedded in the MMIO macros. If + * the slot is found to be isolated, an "EEH Event" is synthesized + * and sent out for processing. + */ + +/* If a device driver keeps reading an MMIO register in an interrupt + * handler after a slot isolation event has occurred, we assume it + * is broken and panic. This sets the threshold for how many read + * attempts we allow before panicking. + */ +#define EEH_MAX_FAILS 100000 + +/* Misc forward declaraions */ +static void eeh_save_bars(struct pci_dev * pdev, struct pci_dn *pdn); + +/* RTAS tokens */ +static int ibm_set_eeh_option; +static int ibm_set_slot_reset; +static int ibm_read_slot_reset_state; +static int ibm_read_slot_reset_state2; +static int ibm_slot_error_detail; + +static int eeh_subsystem_enabled; + +/* Lock to avoid races due to multiple reports of an error */ +static DEFINE_SPINLOCK(confirm_error_lock); + +/* Buffer for reporting slot-error-detail rtas calls */ +static unsigned char slot_errbuf[RTAS_ERROR_LOG_MAX]; +static DEFINE_SPINLOCK(slot_errbuf_lock); +static int eeh_error_buf_size; + +/* System monitoring statistics */ +static DEFINE_PER_CPU(unsigned long, no_device); +static DEFINE_PER_CPU(unsigned long, no_dn); +static DEFINE_PER_CPU(unsigned long, no_cfg_addr); +static DEFINE_PER_CPU(unsigned long, ignored_check); +static DEFINE_PER_CPU(unsigned long, total_mmio_ffs); +static DEFINE_PER_CPU(unsigned long, false_positives); +static DEFINE_PER_CPU(unsigned long, ignored_failures); +static DEFINE_PER_CPU(unsigned long, slot_resets); + +/** + * The pci address cache subsystem. This subsystem places + * PCI device address resources into a red-black tree, sorted + * according to the address range, so that given only an i/o + * address, the corresponding PCI device can be **quickly** + * found. It is safe to perform an address lookup in an interrupt + * context; this ability is an important feature. + * + * Currently, the only customer of this code is the EEH subsystem; + * thus, this code has been somewhat tailored to suit EEH better. + * In particular, the cache does *not* hold the addresses of devices + * for which EEH is not enabled. + * + * (Implementation Note: The RB tree seems to be better/faster + * than any hash algo I could think of for this problem, even + * with the penalty of slow pointer chases for d-cache misses). + */ +struct pci_io_addr_range +{ + struct rb_node rb_node; + unsigned long addr_lo; + unsigned long addr_hi; + struct pci_dev *pcidev; + unsigned int flags; +}; + +static struct pci_io_addr_cache +{ + struct rb_root rb_root; + spinlock_t piar_lock; +} pci_io_addr_cache_root; + +static inline struct pci_dev *__pci_get_device_by_addr(unsigned long addr) +{ + struct rb_node *n = pci_io_addr_cache_root.rb_root.rb_node; + + while (n) { + struct pci_io_addr_range *piar; + piar = rb_entry(n, struct pci_io_addr_range, rb_node); + + if (addr < piar->addr_lo) { + n = n->rb_left; + } else { + if (addr > piar->addr_hi) { + n = n->rb_right; + } else { + pci_dev_get(piar->pcidev); + return piar->pcidev; + } + } + } + + return NULL; +} + +/** + * pci_get_device_by_addr - Get device, given only address + * @addr: mmio (PIO) phys address or i/o port number + * + * Given an mmio phys address, or a port number, find a pci device + * that implements this address. Be sure to pci_dev_put the device + * when finished. I/O port numbers are assumed to be offset + * from zero (that is, they do *not* have pci_io_addr added in). + * It is safe to call this function within an interrupt. + */ +static struct pci_dev *pci_get_device_by_addr(unsigned long addr) +{ + struct pci_dev *dev; + unsigned long flags; + + spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags); + dev = __pci_get_device_by_addr(addr); + spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags); + return dev; +} + +#ifdef DEBUG +/* + * Handy-dandy debug print routine, does nothing more + * than print out the contents of our addr cache. + */ +static void pci_addr_cache_print(struct pci_io_addr_cache *cache) +{ + struct rb_node *n; + int cnt = 0; + + n = rb_first(&cache->rb_root); + while (n) { + struct pci_io_addr_range *piar; + piar = rb_entry(n, struct pci_io_addr_range, rb_node); + printk(KERN_DEBUG "PCI: %s addr range %d [%lx-%lx]: %s\n", + (piar->flags & IORESOURCE_IO) ? "i/o" : "mem", cnt, + piar->addr_lo, piar->addr_hi, pci_name(piar->pcidev)); + cnt++; + n = rb_next(n); + } +} +#endif + +/* Insert address range into the rb tree. */ +static struct pci_io_addr_range * +pci_addr_cache_insert(struct pci_dev *dev, unsigned long alo, + unsigned long ahi, unsigned int flags) +{ + struct rb_node **p = &pci_io_addr_cache_root.rb_root.rb_node; + struct rb_node *parent = NULL; + struct pci_io_addr_range *piar; + + /* Walk tree, find a place to insert into tree */ + while (*p) { + parent = *p; + piar = rb_entry(parent, struct pci_io_addr_range, rb_node); + if (ahi < piar->addr_lo) { + p = &parent->rb_left; + } else if (alo > piar->addr_hi) { + p = &parent->rb_right; + } else { + if (dev != piar->pcidev || + alo != piar->addr_lo || ahi != piar->addr_hi) { + printk(KERN_WARNING "PIAR: overlapping address range\n"); + } + return piar; + } + } + piar = (struct pci_io_addr_range *)kmalloc(sizeof(struct pci_io_addr_range), GFP_ATOMIC); + if (!piar) + return NULL; + + piar->addr_lo = alo; + piar->addr_hi = ahi; + piar->pcidev = dev; + piar->flags = flags; + +#ifdef DEBUG + printk(KERN_DEBUG "PIAR: insert range=[%lx:%lx] dev=%s\n", + alo, ahi, pci_name (dev)); +#endif + + rb_link_node(&piar->rb_node, parent, p); + rb_insert_color(&piar->rb_node, &pci_io_addr_cache_root.rb_root); + + return piar; +} + +static void __pci_addr_cache_insert_device(struct pci_dev *dev) +{ + struct device_node *dn; + struct pci_dn *pdn; + int i; + int inserted = 0; + + dn = pci_device_to_OF_node(dev); + if (!dn) { + printk(KERN_WARNING "PCI: no pci dn found for dev=%s\n", pci_name(dev)); + return; + } + + /* Skip any devices for which EEH is not enabled. */ + pdn = PCI_DN(dn); + if (!(pdn->eeh_mode & EEH_MODE_SUPPORTED) || + pdn->eeh_mode & EEH_MODE_NOCHECK) { +#ifdef DEBUG + printk(KERN_INFO "PCI: skip building address cache for=%s - %s\n", + pci_name(dev), pdn->node->full_name); +#endif + return; + } + + /* The cache holds a reference to the device... */ + pci_dev_get(dev); + + /* Walk resources on this device, poke them into the tree */ + for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) { + unsigned long start = pci_resource_start(dev,i); + unsigned long end = pci_resource_end(dev,i); + unsigned int flags = pci_resource_flags(dev,i); + + /* We are interested only bus addresses, not dma or other stuff */ + if (0 == (flags & (IORESOURCE_IO | IORESOURCE_MEM))) + continue; + if (start == 0 || ~start == 0 || end == 0 || ~end == 0) + continue; + pci_addr_cache_insert(dev, start, end, flags); + inserted = 1; + } + + /* If there was nothing to add, the cache has no reference... */ + if (!inserted) + pci_dev_put(dev); +} + +/** + * pci_addr_cache_insert_device - Add a device to the address cache + * @dev: PCI device whose I/O addresses we are interested in. + * + * In order to support the fast lookup of devices based on addresses, + * we maintain a cache of devices that can be quickly searched. + * This routine adds a device to that cache. + */ +static void pci_addr_cache_insert_device(struct pci_dev *dev) +{ + unsigned long flags; + + spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags); + __pci_addr_cache_insert_device(dev); + spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags); +} + +static inline void __pci_addr_cache_remove_device(struct pci_dev *dev) +{ + struct rb_node *n; + int removed = 0; + +restart: + n = rb_first(&pci_io_addr_cache_root.rb_root); + while (n) { + struct pci_io_addr_range *piar; + piar = rb_entry(n, struct pci_io_addr_range, rb_node); + + if (piar->pcidev == dev) { + rb_erase(n, &pci_io_addr_cache_root.rb_root); + removed = 1; + kfree(piar); + goto restart; + } + n = rb_next(n); + } + + /* The cache no longer holds its reference to this device... */ + if (removed) + pci_dev_put(dev); +} + +/** + * pci_addr_cache_remove_device - remove pci device from addr cache + * @dev: device to remove + * + * Remove a device from the addr-cache tree. + * This is potentially expensive, since it will walk + * the tree multiple times (once per resource). + * But so what; device removal doesn't need to be that fast. + */ +static void pci_addr_cache_remove_device(struct pci_dev *dev) +{ + unsigned long flags; + + spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags); + __pci_addr_cache_remove_device(dev); + spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags); +} + +/** + * pci_addr_cache_build - Build a cache of I/O addresses + * + * Build a cache of pci i/o addresses. This cache will be used to + * find the pci device that corresponds to a given address. + * This routine scans all pci busses to build the cache. + * Must be run late in boot process, after the pci controllers + * have been scaned for devices (after all device resources are known). + */ +void __init pci_addr_cache_build(void) +{ + struct device_node *dn; + struct pci_dev *dev = NULL; + + if (!eeh_subsystem_enabled) + return; + + spin_lock_init(&pci_io_addr_cache_root.piar_lock); + + while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { + /* Ignore PCI bridges ( XXX why ??) */ + if ((dev->class >> 16) == PCI_BASE_CLASS_BRIDGE) { + continue; + } + pci_addr_cache_insert_device(dev); + + /* Save the BAR's; firmware doesn't restore these after EEH reset */ + dn = pci_device_to_OF_node(dev); + eeh_save_bars(dev, PCI_DN(dn)); + } + +#ifdef DEBUG + /* Verify tree built up above, echo back the list of addrs. */ + pci_addr_cache_print(&pci_io_addr_cache_root); +#endif +} + +/* --------------------------------------------------------------- */ +/* Above lies the PCI Address Cache. Below lies the EEH event infrastructure */ + +void eeh_slot_error_detail (struct pci_dn *pdn, int severity) +{ + unsigned long flags; + int rc; + + /* Log the error with the rtas logger */ + spin_lock_irqsave(&slot_errbuf_lock, flags); + memset(slot_errbuf, 0, eeh_error_buf_size); + + rc = rtas_call(ibm_slot_error_detail, + 8, 1, NULL, pdn->eeh_config_addr, + BUID_HI(pdn->phb->buid), + BUID_LO(pdn->phb->buid), NULL, 0, + virt_to_phys(slot_errbuf), + eeh_error_buf_size, + severity); + + if (rc == 0) + log_error(slot_errbuf, ERR_TYPE_RTAS_LOG, 0); + spin_unlock_irqrestore(&slot_errbuf_lock, flags); +} + +/** + * read_slot_reset_state - Read the reset state of a device node's slot + * @dn: device node to read + * @rets: array to return results in + */ +static int read_slot_reset_state(struct pci_dn *pdn, int rets[]) +{ + int token, outputs; + + if (ibm_read_slot_reset_state2 != RTAS_UNKNOWN_SERVICE) { + token = ibm_read_slot_reset_state2; + outputs = 4; + } else { + token = ibm_read_slot_reset_state; + rets[2] = 0; /* fake PE Unavailable info */ + outputs = 3; + } + + return rtas_call(token, 3, outputs, rets, pdn->eeh_config_addr, + BUID_HI(pdn->phb->buid), BUID_LO(pdn->phb->buid)); +} + +/** + * eeh_token_to_phys - convert EEH address token to phys address + * @token i/o token, should be address in the form 0xA.... + */ +static inline unsigned long eeh_token_to_phys(unsigned long token) +{ + pte_t *ptep; + unsigned long pa; + + ptep = find_linux_pte(init_mm.pgd, token); + if (!ptep) + return token; + pa = pte_pfn(*ptep) << PAGE_SHIFT; + + return pa | (token & (PAGE_SIZE-1)); +} + +/** + * Return the "partitionable endpoint" (pe) under which this device lies + */ +static struct device_node * find_device_pe(struct device_node *dn) +{ + while ((dn->parent) && PCI_DN(dn->parent) && + (PCI_DN(dn->parent)->eeh_mode & EEH_MODE_SUPPORTED)) { + dn = dn->parent; + } + return dn; +} + +/** Mark all devices that are peers of this device as failed. + * Mark the device driver too, so that it can see the failure + * immediately; this is critical, since some drivers poll + * status registers in interrupts ... If a driver is polling, + * and the slot is frozen, then the driver can deadlock in + * an interrupt context, which is bad. + */ + +static inline void __eeh_mark_slot (struct device_node *dn) +{ + while (dn) { + PCI_DN(dn)->eeh_mode |= EEH_MODE_ISOLATED; + + if (dn->child) + __eeh_mark_slot (dn->child); + dn = dn->sibling; + } +} + +static inline void __eeh_clear_slot (struct device_node *dn) +{ + while (dn) { + PCI_DN(dn)->eeh_mode &= ~EEH_MODE_ISOLATED; + if (dn->child) + __eeh_clear_slot (dn->child); + dn = dn->sibling; + } +} + +static inline void eeh_clear_slot (struct device_node *dn) +{ + unsigned long flags; + spin_lock_irqsave(&confirm_error_lock, flags); + __eeh_clear_slot (dn); + spin_unlock_irqrestore(&confirm_error_lock, flags); +} + +/** + * eeh_dn_check_failure - check if all 1's data is due to EEH slot freeze + * @dn device node + * @dev pci device, if known + * + * Check for an EEH failure for the given device node. Call this + * routine if the result of a read was all 0xff's and you want to + * find out if this is due to an EEH slot freeze. This routine + * will query firmware for the EEH status. + * + * Returns 0 if there has not been an EEH error; otherwise returns + * a non-zero value and queues up a slot isolation event notification. + * + * It is safe to call this routine in an interrupt context. + */ +int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev) +{ + int ret; + int rets[3]; + unsigned long flags; + struct pci_dn *pdn; + struct device_node *pe_dn; + int rc = 0; + + __get_cpu_var(total_mmio_ffs)++; + + if (!eeh_subsystem_enabled) + return 0; + + if (!dn) { + __get_cpu_var(no_dn)++; + return 0; + } + pdn = PCI_DN(dn); + + /* Access to IO BARs might get this far and still not want checking. */ + if (!(pdn->eeh_mode & EEH_MODE_SUPPORTED) || + pdn->eeh_mode & EEH_MODE_NOCHECK) { + __get_cpu_var(ignored_check)++; +#ifdef DEBUG + printk ("EEH:ignored check (%x) for %s %s\n", + pdn->eeh_mode, pci_name (dev), dn->full_name); +#endif + return 0; + } + + if (!pdn->eeh_config_addr) { + __get_cpu_var(no_cfg_addr)++; + return 0; + } + + /* If we already have a pending isolation event for this + * slot, we know it's bad already, we don't need to check. + * Do this checking under a lock; as multiple PCI devices + * in one slot might report errors simultaneously, and we + * only want one error recovery routine running. + */ + spin_lock_irqsave(&confirm_error_lock, flags); + rc = 1; + if (pdn->eeh_mode & EEH_MODE_ISOLATED) { + pdn->eeh_check_count ++; + if (pdn->eeh_check_count >= EEH_MAX_FAILS) { + printk (KERN_ERR "EEH: Device driver ignored %d bad reads, panicing\n", + pdn->eeh_check_count); + dump_stack(); + + /* re-read the slot reset state */ + if (read_slot_reset_state(pdn, rets) != 0) + rets[0] = -1; /* reset state unknown */ + + /* If we are here, then we hit an infinite loop. Stop. */ + panic("EEH: MMIO halt (%d) on device:%s\n", rets[0], pci_name(dev)); + } + goto dn_unlock; + } + + /* + * Now test for an EEH failure. This is VERY expensive. + * Note that the eeh_config_addr may be a parent device + * in the case of a device behind a bridge, or it may be + * function zero of a multi-function device. + * In any case they must share a common PHB. + */ + ret = read_slot_reset_state(pdn, rets); + + /* If the call to firmware failed, punt */ + if (ret != 0) { + printk(KERN_WARNING "EEH: read_slot_reset_state() failed; rc=%d dn=%s\n", + ret, dn->full_name); + __get_cpu_var(false_positives)++; + rc = 0; + goto dn_unlock; + } + + /* If EEH is not supported on this device, punt. */ + if (rets[1] != 1) { + printk(KERN_WARNING "EEH: event on unsupported device, rc=%d dn=%s\n", + ret, dn->full_name); + __get_cpu_var(false_positives)++; + rc = 0; + goto dn_unlock; + } + + /* If not the kind of error we know about, punt. */ + if (rets[0] != 2 && rets[0] != 4 && rets[0] != 5) { + __get_cpu_var(false_positives)++; + rc = 0; + goto dn_unlock; + } + + /* Note that config-io to empty slots may fail; + * we recognize empty because they don't have children. */ + if ((rets[0] == 5) && (dn->child == NULL)) { + __get_cpu_var(false_positives)++; + rc = 0; + goto dn_unlock; + } + + __get_cpu_var(slot_resets)++; + + /* Avoid repeated reports of this failure, including problems + * with other functions on this device, and functions under + * bridges. */ + pe_dn = find_device_pe (dn); + __eeh_mark_slot (pe_dn); + spin_unlock_irqrestore(&confirm_error_lock, flags); + + eeh_send_failure_event (dn, dev, rets[0], rets[2]); + + /* Most EEH events are due to device driver bugs. Having + * a stack trace will help the device-driver authors figure + * out what happened. So print that out. */ + if (rets[0] != 5) dump_stack(); + return 1; + +dn_unlock: + spin_unlock_irqrestore(&confirm_error_lock, flags); + return rc; +} + +EXPORT_SYMBOL_GPL(eeh_dn_check_failure); + +/** + * eeh_check_failure - check if all 1's data is due to EEH slot freeze + * @token i/o token, should be address in the form 0xA.... + * @val value, should be all 1's (XXX why do we need this arg??) + * + * Check for an EEH failure at the given token address. Call this + * routine if the result of a read was all 0xff's and you want to + * find out if this is due to an EEH slot freeze event. This routine + * will query firmware for the EEH status. + * + * Note this routine is safe to call in an interrupt context. + */ +unsigned long eeh_check_failure(const volatile void __iomem *token, unsigned long val) +{ + unsigned long addr; + struct pci_dev *dev; + struct device_node *dn; + + /* Finding the phys addr + pci device; this is pretty quick. */ + addr = eeh_token_to_phys((unsigned long __force) token); + dev = pci_get_device_by_addr(addr); + if (!dev) { + __get_cpu_var(no_device)++; + return val; + } + + dn = pci_device_to_OF_node(dev); + eeh_dn_check_failure (dn, dev); + + pci_dev_put(dev); + return val; +} + +EXPORT_SYMBOL(eeh_check_failure); + +/* ------------------------------------------------------------- */ +/* The code below deals with error recovery */ + +/** Return negative value if a permanent error, else return + * a number of milliseconds to wait until the PCI slot is + * ready to be used. + */ +static int +eeh_slot_availability(struct pci_dn *pdn) +{ + int rc; + int rets[3]; + + rc = read_slot_reset_state(pdn, rets); + + if (rc) return rc; + + if (rets[1] == 0) return -1; /* EEH is not supported */ + if (rets[0] == 0) return 0; /* Oll Korrect */ + if (rets[0] == 5) { + if (rets[2] == 0) return -1; /* permanently unavailable */ + return rets[2]; /* number of millisecs to wait */ + } + return -1; +} + +/** rtas_pci_slot_reset raises/lowers the pci #RST line + * state: 1/0 to raise/lower the #RST + * + * Clear the EEH-frozen condition on a slot. This routine + * asserts the PCI #RST line if the 'state' argument is '1', + * and drops the #RST line if 'state is '0'. This routine is + * safe to call in an interrupt context. + * + */ + +static void +rtas_pci_slot_reset(struct pci_dn *pdn, int state) +{ + int rc; + + BUG_ON (pdn==NULL); + + if (!pdn->phb) { + printk (KERN_WARNING "EEH: in slot reset, device node %s has no phb\n", + pdn->node->full_name); + return; + } + + rc = rtas_call(ibm_set_slot_reset,4,1, NULL, + pdn->eeh_config_addr, + BUID_HI(pdn->phb->buid), + BUID_LO(pdn->phb->buid), + state); + if (rc) { + printk (KERN_WARNING "EEH: Unable to reset the failed slot, (%d) #RST=%d dn=%s\n", + rc, state, pdn->node->full_name); + return; + } + + if (state == 0) + eeh_clear_slot (pdn->node->parent->child); +} + +/** rtas_set_slot_reset -- assert the pci #RST line for 1/4 second + * dn -- device node to be reset. + */ + +void +rtas_set_slot_reset(struct pci_dn *pdn) +{ + int i, rc; + + rtas_pci_slot_reset (pdn, 1); + + /* The PCI bus requires that the reset be held high for at least + * a 100 milliseconds. We wait a bit longer 'just in case'. */ + +#define PCI_BUS_RST_HOLD_TIME_MSEC 250 + msleep (PCI_BUS_RST_HOLD_TIME_MSEC); + rtas_pci_slot_reset (pdn, 0); + + /* After a PCI slot has been reset, the PCI Express spec requires + * a 1.5 second idle time for the bus to stabilize, before starting + * up traffic. */ +#define PCI_BUS_SETTLE_TIME_MSEC 1800 + msleep (PCI_BUS_SETTLE_TIME_MSEC); + + /* Now double check with the firmware to make sure the device is + * ready to be used; if not, wait for recovery. */ + for (i=0; i<10; i++) { + rc = eeh_slot_availability (pdn); + if (rc <= 0) break; + + msleep (rc+100); + } +} + +/* ------------------------------------------------------- */ +/** Save and restore of PCI BARs + * + * Although firmware will set up BARs during boot, it doesn't + * set up device BAR's after a device reset, although it will, + * if requested, set up bridge configuration. Thus, we need to + * configure the PCI devices ourselves. + */ + +/** + * __restore_bars - Restore the Base Address Registers + * Loads the PCI configuration space base address registers, + * the expansion ROM base address, the latency timer, and etc. + * from the saved values in the device node. + */ +static inline void __restore_bars (struct pci_dn *pdn) +{ + int i; + + if (NULL==pdn->phb) return; + for (i=4; i<10; i++) { + rtas_write_config(pdn, i*4, 4, pdn->config_space[i]); + } + + /* 12 == Expansion ROM Address */ + rtas_write_config(pdn, 12*4, 4, pdn->config_space[12]); + +#define BYTE_SWAP(OFF) (8*((OFF)/4)+3-(OFF)) +#define SAVED_BYTE(OFF) (((u8 *)(pdn->config_space))[BYTE_SWAP(OFF)]) + + rtas_write_config (pdn, PCI_CACHE_LINE_SIZE, 1, + SAVED_BYTE(PCI_CACHE_LINE_SIZE)); + + rtas_write_config (pdn, PCI_LATENCY_TIMER, 1, + SAVED_BYTE(PCI_LATENCY_TIMER)); + + /* max latency, min grant, interrupt pin and line */ + rtas_write_config(pdn, 15*4, 4, pdn->config_space[15]); +} + +/** + * eeh_restore_bars - restore the PCI config space info + * + * This routine performs a recursive walk to the children + * of this device as well. + */ +void eeh_restore_bars(struct pci_dn *pdn) +{ + struct device_node *dn; + if (!pdn) + return; + + if (! pdn->eeh_is_bridge) + __restore_bars (pdn); + + dn = pdn->node->child; + while (dn) { + eeh_restore_bars (PCI_DN(dn)); + dn = dn->sibling; + } +} + +/** + * eeh_save_bars - save device bars + * + * Save the values of the device bars. Unlike the restore + * routine, this routine is *not* recursive. This is because + * PCI devices are added individuallly; but, for the restore, + * an entire slot is reset at a time. + */ +static void eeh_save_bars(struct pci_dev * pdev, struct pci_dn *pdn) +{ + int i; + + if (!pdev || !pdn ) + return; + + for (i = 0; i < 16; i++) + pci_read_config_dword(pdev, i * 4, &pdn->config_space[i]); + + if (pdev->hdr_type == PCI_HEADER_TYPE_BRIDGE) + pdn->eeh_is_bridge = 1; +} + +void +rtas_configure_bridge(struct pci_dn *pdn) +{ + int token = rtas_token ("ibm,configure-bridge"); + int rc; + + if (token == RTAS_UNKNOWN_SERVICE) + return; + rc = rtas_call(token,3,1, NULL, + pdn->eeh_config_addr, + BUID_HI(pdn->phb->buid), + BUID_LO(pdn->phb->buid)); + if (rc) { + printk (KERN_WARNING "EEH: Unable to configure device bridge (%d) for %s\n", + rc, pdn->node->full_name); + } +} + +/* ------------------------------------------------------------- */ +/* The code below deals with enabling EEH for devices during the + * early boot sequence. EEH must be enabled before any PCI probing + * can be done. + */ + +#define EEH_ENABLE 1 + +struct eeh_early_enable_info { + unsigned int buid_hi; + unsigned int buid_lo; +}; + +/* Enable eeh for the given device node. */ +static void *early_enable_eeh(struct device_node *dn, void *data) +{ + struct eeh_early_enable_info *info = data; + int ret; + char *status = get_property(dn, "status", NULL); + u32 *class_code = (u32 *)get_property(dn, "class-code", NULL); + u32 *vendor_id = (u32 *)get_property(dn, "vendor-id", NULL); + u32 *device_id = (u32 *)get_property(dn, "device-id", NULL); + u32 *regs; + int enable; + struct pci_dn *pdn = PCI_DN(dn); + + pdn->eeh_mode = 0; + pdn->eeh_check_count = 0; + pdn->eeh_freeze_count = 0; + + if (status && strcmp(status, "ok") != 0) + return NULL; /* ignore devices with bad status */ + + /* Ignore bad nodes. */ + if (!class_code || !vendor_id || !device_id) + return NULL; + + /* There is nothing to check on PCI to ISA bridges */ + if (dn->type && !strcmp(dn->type, "isa")) { + pdn->eeh_mode |= EEH_MODE_NOCHECK; + return NULL; + } + + /* + * Now decide if we are going to "Disable" EEH checking + * for this device. We still run with the EEH hardware active, + * but we won't be checking for ff's. This means a driver + * could return bad data (very bad!), an interrupt handler could + * hang waiting on status bits that won't change, etc. + * But there are a few cases like display devices that make sense. + */ + enable = 1; /* i.e. we will do checking */ + if ((*class_code >> 16) == PCI_BASE_CLASS_DISPLAY) + enable = 0; + + if (!enable) + pdn->eeh_mode |= EEH_MODE_NOCHECK; + + /* Ok... see if this device supports EEH. Some do, some don't, + * and the only way to find out is to check each and every one. */ + regs = (u32 *)get_property(dn, "reg", NULL); + if (regs) { + /* First register entry is addr (00BBSS00) */ + /* Try to enable eeh */ + ret = rtas_call(ibm_set_eeh_option, 4, 1, NULL, + regs[0], info->buid_hi, info->buid_lo, + EEH_ENABLE); + + if (ret == 0) { + eeh_subsystem_enabled = 1; + pdn->eeh_mode |= EEH_MODE_SUPPORTED; + pdn->eeh_config_addr = regs[0]; +#ifdef DEBUG + printk(KERN_DEBUG "EEH: %s: eeh enabled\n", dn->full_name); +#endif + } else { + + /* This device doesn't support EEH, but it may have an + * EEH parent, in which case we mark it as supported. */ + if (dn->parent && PCI_DN(dn->parent) + && (PCI_DN(dn->parent)->eeh_mode & EEH_MODE_SUPPORTED)) { + /* Parent supports EEH. */ + pdn->eeh_mode |= EEH_MODE_SUPPORTED; + pdn->eeh_config_addr = PCI_DN(dn->parent)->eeh_config_addr; + return NULL; + } + } + } else { + printk(KERN_WARNING "EEH: %s: unable to get reg property.\n", + dn->full_name); + } + + return NULL; +} + +/* + * Initialize EEH by trying to enable it for all of the adapters in the system. + * As a side effect we can determine here if eeh is supported at all. + * Note that we leave EEH on so failed config cycles won't cause a machine + * check. If a user turns off EEH for a particular adapter they are really + * telling Linux to ignore errors. Some hardware (e.g. POWER5) won't + * grant access to a slot if EEH isn't enabled, and so we always enable + * EEH for all slots/all devices. + * + * The eeh-force-off option disables EEH checking globally, for all slots. + * Even if force-off is set, the EEH hardware is still enabled, so that + * newer systems can boot. + */ +void __init eeh_init(void) +{ + struct device_node *phb, *np; + struct eeh_early_enable_info info; + + spin_lock_init(&confirm_error_lock); + spin_lock_init(&slot_errbuf_lock); + + np = of_find_node_by_path("/rtas"); + if (np == NULL) + return; + + ibm_set_eeh_option = rtas_token("ibm,set-eeh-option"); + ibm_set_slot_reset = rtas_token("ibm,set-slot-reset"); + ibm_read_slot_reset_state2 = rtas_token("ibm,read-slot-reset-state2"); + ibm_read_slot_reset_state = rtas_token("ibm,read-slot-reset-state"); + ibm_slot_error_detail = rtas_token("ibm,slot-error-detail"); + + if (ibm_set_eeh_option == RTAS_UNKNOWN_SERVICE) + return; + + eeh_error_buf_size = rtas_token("rtas-error-log-max"); + if (eeh_error_buf_size == RTAS_UNKNOWN_SERVICE) { + eeh_error_buf_size = 1024; + } + if (eeh_error_buf_size > RTAS_ERROR_LOG_MAX) { + printk(KERN_WARNING "EEH: rtas-error-log-max is bigger than allocated " + "buffer ! (%d vs %d)", eeh_error_buf_size, RTAS_ERROR_LOG_MAX); + eeh_error_buf_size = RTAS_ERROR_LOG_MAX; + } + + /* Enable EEH for all adapters. Note that eeh requires buid's */ + for (phb = of_find_node_by_name(NULL, "pci"); phb; + phb = of_find_node_by_name(phb, "pci")) { + unsigned long buid; + + buid = get_phb_buid(phb); + if (buid == 0 || PCI_DN(phb) == NULL) + continue; + + info.buid_lo = BUID_LO(buid); + info.buid_hi = BUID_HI(buid); + traverse_pci_devices(phb, early_enable_eeh, &info); + } + + if (eeh_subsystem_enabled) + printk(KERN_INFO "EEH: PCI Enhanced I/O Error Handling Enabled\n"); + else + printk(KERN_WARNING "EEH: No capable adapters found\n"); +} + +/** + * eeh_add_device_early - enable EEH for the indicated device_node + * @dn: device node for which to set up EEH + * + * This routine must be used to perform EEH initialization for PCI + * devices that were added after system boot (e.g. hotplug, dlpar). + * This routine must be called before any i/o is performed to the + * adapter (inluding any config-space i/o). + * Whether this actually enables EEH or not for this device depends + * on the CEC architecture, type of the device, on earlier boot + * command-line arguments & etc. + */ +void eeh_add_device_early(struct device_node *dn) +{ + struct pci_controller *phb; + struct eeh_early_enable_info info; + + if (!dn || !PCI_DN(dn)) + return; + phb = PCI_DN(dn)->phb; + if (NULL == phb || 0 == phb->buid) { + printk(KERN_WARNING "EEH: Expected buid but found none for %s\n", + dn->full_name); + dump_stack(); + return; + } + + info.buid_hi = BUID_HI(phb->buid); + info.buid_lo = BUID_LO(phb->buid); + early_enable_eeh(dn, &info); +} +EXPORT_SYMBOL_GPL(eeh_add_device_early); + +/** + * eeh_add_device_late - perform EEH initialization for the indicated pci device + * @dev: pci device for which to set up EEH + * + * This routine must be used to complete EEH initialization for PCI + * devices that were added after system boot (e.g. hotplug, dlpar). + */ +void eeh_add_device_late(struct pci_dev *dev) +{ + struct device_node *dn; + struct pci_dn *pdn; + + if (!dev || !eeh_subsystem_enabled) + return; + +#ifdef DEBUG + printk(KERN_DEBUG "EEH: adding device %s\n", pci_name(dev)); +#endif + + pci_dev_get (dev); + dn = pci_device_to_OF_node(dev); + pdn = PCI_DN(dn); + pdn->pcidev = dev; + + pci_addr_cache_insert_device (dev); + eeh_save_bars(dev, pdn); +} +EXPORT_SYMBOL_GPL(eeh_add_device_late); + +/** + * eeh_remove_device - undo EEH setup for the indicated pci device + * @dev: pci device to be removed + * + * This routine should be when a device is removed from a running + * system (e.g. by hotplug or dlpar). + */ +void eeh_remove_device(struct pci_dev *dev) +{ + struct device_node *dn; + if (!dev || !eeh_subsystem_enabled) + return; + + /* Unregister the device with the EEH/PCI address search system */ +#ifdef DEBUG + printk(KERN_DEBUG "EEH: remove device %s\n", pci_name(dev)); +#endif + pci_addr_cache_remove_device(dev); + + dn = pci_device_to_OF_node(dev); + PCI_DN(dn)->pcidev = NULL; + pci_dev_put (dev); +} +EXPORT_SYMBOL_GPL(eeh_remove_device); + +static int proc_eeh_show(struct seq_file *m, void *v) +{ + unsigned int cpu; + unsigned long ffs = 0, positives = 0, failures = 0; + unsigned long resets = 0; + unsigned long no_dev = 0, no_dn = 0, no_cfg = 0, no_check = 0; + + for_each_cpu(cpu) { + ffs += per_cpu(total_mmio_ffs, cpu); + positives += per_cpu(false_positives, cpu); + failures += per_cpu(ignored_failures, cpu); + resets += per_cpu(slot_resets, cpu); + no_dev += per_cpu(no_device, cpu); + no_dn += per_cpu(no_dn, cpu); + no_cfg += per_cpu(no_cfg_addr, cpu); + no_check += per_cpu(ignored_check, cpu); + } + + if (0 == eeh_subsystem_enabled) { + seq_printf(m, "EEH Subsystem is globally disabled\n"); + seq_printf(m, "eeh_total_mmio_ffs=%ld\n", ffs); + } else { + seq_printf(m, "EEH Subsystem is enabled\n"); + seq_printf(m, + "no device=%ld\n" + "no device node=%ld\n" + "no config address=%ld\n" + "check not wanted=%ld\n" + "eeh_total_mmio_ffs=%ld\n" + "eeh_false_positives=%ld\n" + "eeh_ignored_failures=%ld\n" + "eeh_slot_resets=%ld\n", + no_dev, no_dn, no_cfg, no_check, + ffs, positives, failures, resets); + } + + return 0; +} + +static int proc_eeh_open(struct inode *inode, struct file *file) +{ + return single_open(file, proc_eeh_show, NULL); +} + +static struct file_operations proc_eeh_operations = { + .open = proc_eeh_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init eeh_init_proc(void) +{ + struct proc_dir_entry *e; + + if (systemcfg->platform & PLATFORM_PSERIES) { + e = create_proc_entry("ppc64/eeh", 0, NULL); + if (e) + e->proc_fops = &proc_eeh_operations; + } + + return 0; +} +__initcall(eeh_init_proc); diff --git a/arch/powerpc/platforms/pseries/eeh_event.c b/arch/powerpc/platforms/pseries/eeh_event.c new file mode 100644 index 000000000000..92497333c2b6 --- /dev/null +++ b/arch/powerpc/platforms/pseries/eeh_event.c @@ -0,0 +1,155 @@ +/* + * eeh_event.c + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Copyright (c) 2005 Linas Vepstas <linas@linas.org> + */ + +#include <linux/list.h> +#include <linux/pci.h> +#include <asm/eeh_event.h> + +/** Overview: + * EEH error states may be detected within exception handlers; + * however, the recovery processing needs to occur asynchronously + * in a normal kernel context and not an interrupt context. + * This pair of routines creates an event and queues it onto a + * work-queue, where a worker thread can drive recovery. + */ + +/* EEH event workqueue setup. */ +static spinlock_t eeh_eventlist_lock = SPIN_LOCK_UNLOCKED; +LIST_HEAD(eeh_eventlist); +static void eeh_thread_launcher(void *); +DECLARE_WORK(eeh_event_wq, eeh_thread_launcher, NULL); + +/** + * eeh_panic - call panic() for an eeh event that cannot be handled. + * The philosophy of this routine is that it is better to panic and + * halt the OS than it is to risk possible data corruption by + * oblivious device drivers that don't know better. + * + * @dev pci device that had an eeh event + * @reset_state current reset state of the device slot + */ +static void eeh_panic(struct pci_dev *dev, int reset_state) +{ + /* + * Since the panic_on_oops sysctl is used to halt the system + * in light of potential corruption, we can use it here. + */ + if (panic_on_oops) { + panic("EEH: MMIO failure (%d) on device:%s\n", reset_state, + pci_name(dev)); + } + else { + printk(KERN_INFO "EEH: Ignored MMIO failure (%d) on device:%s\n", + reset_state, pci_name(dev)); + } +} + +/** + * eeh_event_handler - dispatch EEH events. The detection of a frozen + * slot can occur inside an interrupt, where it can be hard to do + * anything about it. The goal of this routine is to pull these + * detection events out of the context of the interrupt handler, and + * re-dispatch them for processing at a later time in a normal context. + * + * @dummy - unused + */ +static int eeh_event_handler(void * dummy) +{ + unsigned long flags; + struct eeh_event *event; + + daemonize ("eehd"); + + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + + spin_lock_irqsave(&eeh_eventlist_lock, flags); + event = NULL; + if (!list_empty(&eeh_eventlist)) { + event = list_entry(eeh_eventlist.next, struct eeh_event, list); + list_del(&event->list); + } + spin_unlock_irqrestore(&eeh_eventlist_lock, flags); + if (event == NULL) + break; + + printk(KERN_INFO "EEH: Detected PCI bus error on device %s\n", + pci_name(event->dev)); + + eeh_panic (event->dev, event->state); + + kfree(event); + } + + return 0; +} + +/** + * eeh_thread_launcher + * + * @dummy - unused + */ +static void eeh_thread_launcher(void *dummy) +{ + if (kernel_thread(eeh_event_handler, NULL, CLONE_KERNEL) < 0) + printk(KERN_ERR "Failed to start EEH daemon\n"); +} + +/** + * eeh_send_failure_event - generate a PCI error event + * @dev pci device + * + * This routine can be called within an interrupt context; + * the actual event will be delivered in a normal context + * (from a workqueue). + */ +int eeh_send_failure_event (struct device_node *dn, + struct pci_dev *dev, + int state, + int time_unavail) +{ + unsigned long flags; + struct eeh_event *event; + + event = kmalloc(sizeof(*event), GFP_ATOMIC); + if (event == NULL) { + printk (KERN_ERR "EEH: out of memory, event not handled\n"); + return 1; + } + + if (dev) + pci_dev_get(dev); + + event->dn = dn; + event->dev = dev; + event->state = state; + event->time_unavail = time_unavail; + + /* We may or may not be called in an interrupt context */ + spin_lock_irqsave(&eeh_eventlist_lock, flags); + list_add(&event->list, &eeh_eventlist); + spin_unlock_irqrestore(&eeh_eventlist_lock, flags); + + schedule_work(&eeh_event_wq); + + return 0; +} + +/********************** END OF FILE ******************************/ diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index e78c39368841..a093a0d4dd69 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -469,6 +469,7 @@ static inline void dedicated_idle_sleep(unsigned int cpu) * more. */ clear_thread_flag(TIF_POLLING_NRFLAG); + smp_mb__after_clear_bit(); /* * SMT dynamic mode. Cede will result in this thread going @@ -481,6 +482,7 @@ static inline void dedicated_idle_sleep(unsigned int cpu) cede_processor(); else local_irq_enable(); + set_thread_flag(TIF_POLLING_NRFLAG); } else { /* * Give the HV an opportunity at the processor, since we are @@ -492,11 +494,11 @@ static inline void dedicated_idle_sleep(unsigned int cpu) static void pseries_dedicated_idle(void) { - long oldval; struct paca_struct *lpaca = get_paca(); unsigned int cpu = smp_processor_id(); unsigned long start_snooze; unsigned long *smt_snooze_delay = &__get_cpu_var(smt_snooze_delay); + set_thread_flag(TIF_POLLING_NRFLAG); while (1) { /* @@ -505,10 +507,7 @@ static void pseries_dedicated_idle(void) */ lpaca->lppaca.idle = 1; - oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED); - if (!oldval) { - set_thread_flag(TIF_POLLING_NRFLAG); - + if (!need_resched()) { start_snooze = __get_tb() + *smt_snooze_delay * tb_ticks_per_usec; @@ -531,15 +530,14 @@ static void pseries_dedicated_idle(void) } HMT_medium(); - clear_thread_flag(TIF_POLLING_NRFLAG); - } else { - set_need_resched(); } lpaca->lppaca.idle = 0; ppc64_runlatch_on(); + preempt_enable_no_resched(); schedule(); + preempt_disable(); if (cpu_is_offline(cpu) && system_state == SYSTEM_RUNNING) cpu_die(); @@ -583,7 +581,9 @@ static void pseries_shared_idle(void) lpaca->lppaca.idle = 0; ppc64_runlatch_on(); + preempt_enable_no_resched(); schedule(); + preempt_disable(); if (cpu_is_offline(cpu) && system_state == SYSTEM_RUNNING) cpu_die(); |