diff options
Diffstat (limited to 'arch/powerpc/kernel')
93 files changed, 2161 insertions, 2044 deletions
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 4aa7c147e447..1b6bc7fba996 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 # # Makefile for the linux kernel. # @@ -38,7 +39,7 @@ obj-$(CONFIG_PPC64) += setup_64.o sys_ppc32.o \ signal_64.o ptrace32.o \ paca.o nvram_64.o firmware.o obj-$(CONFIG_VDSO32) += vdso32/ -obj-$(CONFIG_HARDLOCKUP_DETECTOR) += watchdog.o +obj-$(CONFIG_PPC_WATCHDOG) += watchdog.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o obj-$(CONFIG_PPC_BOOK3S_64) += cpu_setup_ppc970.o cpu_setup_pa6t.o obj-$(CONFIG_PPC_BOOK3S_64) += cpu_setup_power.o @@ -83,7 +84,7 @@ extra-y := head_$(BITS).o extra-$(CONFIG_40x) := head_40x.o extra-$(CONFIG_44x) := head_44x.o extra-$(CONFIG_FSL_BOOKE) := head_fsl_booke.o -extra-$(CONFIG_8xx) := head_8xx.o +extra-$(CONFIG_PPC_8xx) := head_8xx.o extra-y += vmlinux.lds obj-$(CONFIG_RELOCATABLE) += reloc_$(BITS).o @@ -128,7 +129,7 @@ obj64-$(CONFIG_PPC_TRANSACTIONAL_MEM) += tm.o obj-$(CONFIG_PPC64) += $(obj64-y) obj-$(CONFIG_PPC32) += $(obj32-y) -ifneq ($(CONFIG_XMON)$(CONFIG_KEXEC_CORE),) +ifneq ($(CONFIG_XMON)$(CONFIG_KEXEC_CORE)(CONFIG_PPC_BOOK3S),) obj-y += ppc_save_regs.o endif diff --git a/arch/powerpc/kernel/align.c b/arch/powerpc/kernel/align.c index ec7a8b099dd9..3e6c0744c174 100644 --- a/arch/powerpc/kernel/align.c +++ b/arch/powerpc/kernel/align.c @@ -27,6 +27,7 @@ #include <asm/switch_to.h> #include <asm/disassemble.h> #include <asm/cpu_has_feature.h> +#include <asm/sstep.h> struct aligninfo { unsigned char len; @@ -40,364 +41,9 @@ struct aligninfo { #define LD 0 /* load */ #define ST 1 /* store */ #define SE 2 /* sign-extend value, or FP ld/st as word */ -#define F 4 /* to/from fp regs */ -#define U 8 /* update index register */ -#define M 0x10 /* multiple load/store */ #define SW 0x20 /* byte swap */ -#define S 0x40 /* single-precision fp or... */ -#define SX 0x40 /* ... byte count in XER */ -#define HARD 0x80 /* string, stwcx. */ #define E4 0x40 /* SPE endianness is word */ #define E8 0x80 /* SPE endianness is double word */ -#define SPLT 0x80 /* VSX SPLAT load */ - -/* DSISR bits reported for a DCBZ instruction: */ -#define DCBZ 0x5f /* 8xx/82xx dcbz faults when cache not enabled */ - -/* - * The PowerPC stores certain bits of the instruction that caused the - * alignment exception in the DSISR register. This array maps those - * bits to information about the operand length and what the - * instruction would do. - */ -static struct aligninfo aligninfo[128] = { - { 4, LD }, /* 00 0 0000: lwz / lwarx */ - INVALID, /* 00 0 0001 */ - { 4, ST }, /* 00 0 0010: stw */ - INVALID, /* 00 0 0011 */ - { 2, LD }, /* 00 0 0100: lhz */ - { 2, LD+SE }, /* 00 0 0101: lha */ - { 2, ST }, /* 00 0 0110: sth */ - { 4, LD+M }, /* 00 0 0111: lmw */ - { 4, LD+F+S }, /* 00 0 1000: lfs */ - { 8, LD+F }, /* 00 0 1001: lfd */ - { 4, ST+F+S }, /* 00 0 1010: stfs */ - { 8, ST+F }, /* 00 0 1011: stfd */ - { 16, LD }, /* 00 0 1100: lq */ - { 8, LD }, /* 00 0 1101: ld/ldu/lwa */ - INVALID, /* 00 0 1110 */ - { 8, ST }, /* 00 0 1111: std/stdu */ - { 4, LD+U }, /* 00 1 0000: lwzu */ - INVALID, /* 00 1 0001 */ - { 4, ST+U }, /* 00 1 0010: stwu */ - INVALID, /* 00 1 0011 */ - { 2, LD+U }, /* 00 1 0100: lhzu */ - { 2, LD+SE+U }, /* 00 1 0101: lhau */ - { 2, ST+U }, /* 00 1 0110: sthu */ - { 4, ST+M }, /* 00 1 0111: stmw */ - { 4, LD+F+S+U }, /* 00 1 1000: lfsu */ - { 8, LD+F+U }, /* 00 1 1001: lfdu */ - { 4, ST+F+S+U }, /* 00 1 1010: stfsu */ - { 8, ST+F+U }, /* 00 1 1011: stfdu */ - { 16, LD+F }, /* 00 1 1100: lfdp */ - INVALID, /* 00 1 1101 */ - { 16, ST+F }, /* 00 1 1110: stfdp */ - INVALID, /* 00 1 1111 */ - { 8, LD }, /* 01 0 0000: ldx */ - INVALID, /* 01 0 0001 */ - { 8, ST }, /* 01 0 0010: stdx */ - INVALID, /* 01 0 0011 */ - INVALID, /* 01 0 0100 */ - { 4, LD+SE }, /* 01 0 0101: lwax */ - INVALID, /* 01 0 0110 */ - INVALID, /* 01 0 0111 */ - { 4, LD+M+HARD+SX }, /* 01 0 1000: lswx */ - { 4, LD+M+HARD }, /* 01 0 1001: lswi */ - { 4, ST+M+HARD+SX }, /* 01 0 1010: stswx */ - { 4, ST+M+HARD }, /* 01 0 1011: stswi */ - INVALID, /* 01 0 1100 */ - { 8, LD+U }, /* 01 0 1101: ldu */ - INVALID, /* 01 0 1110 */ - { 8, ST+U }, /* 01 0 1111: stdu */ - { 8, LD+U }, /* 01 1 0000: ldux */ - INVALID, /* 01 1 0001 */ - { 8, ST+U }, /* 01 1 0010: stdux */ - INVALID, /* 01 1 0011 */ - INVALID, /* 01 1 0100 */ - { 4, LD+SE+U }, /* 01 1 0101: lwaux */ - INVALID, /* 01 1 0110 */ - INVALID, /* 01 1 0111 */ - INVALID, /* 01 1 1000 */ - INVALID, /* 01 1 1001 */ - INVALID, /* 01 1 1010 */ - INVALID, /* 01 1 1011 */ - INVALID, /* 01 1 1100 */ - INVALID, /* 01 1 1101 */ - INVALID, /* 01 1 1110 */ - INVALID, /* 01 1 1111 */ - INVALID, /* 10 0 0000 */ - INVALID, /* 10 0 0001 */ - INVALID, /* 10 0 0010: stwcx. */ - INVALID, /* 10 0 0011 */ - INVALID, /* 10 0 0100 */ - INVALID, /* 10 0 0101 */ - INVALID, /* 10 0 0110 */ - INVALID, /* 10 0 0111 */ - { 4, LD+SW }, /* 10 0 1000: lwbrx */ - INVALID, /* 10 0 1001 */ - { 4, ST+SW }, /* 10 0 1010: stwbrx */ - INVALID, /* 10 0 1011 */ - { 2, LD+SW }, /* 10 0 1100: lhbrx */ - { 4, LD+SE }, /* 10 0 1101 lwa */ - { 2, ST+SW }, /* 10 0 1110: sthbrx */ - { 16, ST }, /* 10 0 1111: stq */ - INVALID, /* 10 1 0000 */ - INVALID, /* 10 1 0001 */ - INVALID, /* 10 1 0010 */ - INVALID, /* 10 1 0011 */ - INVALID, /* 10 1 0100 */ - INVALID, /* 10 1 0101 */ - INVALID, /* 10 1 0110 */ - INVALID, /* 10 1 0111 */ - INVALID, /* 10 1 1000 */ - INVALID, /* 10 1 1001 */ - INVALID, /* 10 1 1010 */ - INVALID, /* 10 1 1011 */ - INVALID, /* 10 1 1100 */ - INVALID, /* 10 1 1101 */ - INVALID, /* 10 1 1110 */ - { 0, ST+HARD }, /* 10 1 1111: dcbz */ - { 4, LD }, /* 11 0 0000: lwzx */ - INVALID, /* 11 0 0001 */ - { 4, ST }, /* 11 0 0010: stwx */ - INVALID, /* 11 0 0011 */ - { 2, LD }, /* 11 0 0100: lhzx */ - { 2, LD+SE }, /* 11 0 0101: lhax */ - { 2, ST }, /* 11 0 0110: sthx */ - INVALID, /* 11 0 0111 */ - { 4, LD+F+S }, /* 11 0 1000: lfsx */ - { 8, LD+F }, /* 11 0 1001: lfdx */ - { 4, ST+F+S }, /* 11 0 1010: stfsx */ - { 8, ST+F }, /* 11 0 1011: stfdx */ - { 16, LD+F }, /* 11 0 1100: lfdpx */ - { 4, LD+F+SE }, /* 11 0 1101: lfiwax */ - { 16, ST+F }, /* 11 0 1110: stfdpx */ - { 4, ST+F }, /* 11 0 1111: stfiwx */ - { 4, LD+U }, /* 11 1 0000: lwzux */ - INVALID, /* 11 1 0001 */ - { 4, ST+U }, /* 11 1 0010: stwux */ - INVALID, /* 11 1 0011 */ - { 2, LD+U }, /* 11 1 0100: lhzux */ - { 2, LD+SE+U }, /* 11 1 0101: lhaux */ - { 2, ST+U }, /* 11 1 0110: sthux */ - INVALID, /* 11 1 0111 */ - { 4, LD+F+S+U }, /* 11 1 1000: lfsux */ - { 8, LD+F+U }, /* 11 1 1001: lfdux */ - { 4, ST+F+S+U }, /* 11 1 1010: stfsux */ - { 8, ST+F+U }, /* 11 1 1011: stfdux */ - INVALID, /* 11 1 1100 */ - { 4, LD+F }, /* 11 1 1101: lfiwzx */ - INVALID, /* 11 1 1110 */ - INVALID, /* 11 1 1111 */ -}; - -/* - * The dcbz (data cache block zero) instruction - * gives an alignment fault if used on non-cacheable - * memory. We handle the fault mainly for the - * case when we are running with the cache disabled - * for debugging. - */ -static int emulate_dcbz(struct pt_regs *regs, unsigned char __user *addr) -{ - long __user *p; - int i, size; - -#ifdef __powerpc64__ - size = ppc64_caches.l1d.block_size; -#else - size = L1_CACHE_BYTES; -#endif - p = (long __user *) (regs->dar & -size); - if (user_mode(regs) && !access_ok(VERIFY_WRITE, p, size)) - return -EFAULT; - for (i = 0; i < size / sizeof(long); ++i) - if (__put_user_inatomic(0, p+i)) - return -EFAULT; - return 1; -} - -/* - * Emulate load & store multiple instructions - * On 64-bit machines, these instructions only affect/use the - * bottom 4 bytes of each register, and the loads clear the - * top 4 bytes of the affected register. - */ -#ifdef __BIG_ENDIAN__ -#ifdef CONFIG_PPC64 -#define REG_BYTE(rp, i) *((u8 *)((rp) + ((i) >> 2)) + ((i) & 3) + 4) -#else -#define REG_BYTE(rp, i) *((u8 *)(rp) + (i)) -#endif -#else -#define REG_BYTE(rp, i) (*(((u8 *)((rp) + ((i)>>2)) + ((i)&3)))) -#endif - -#define SWIZ_PTR(p) ((unsigned char __user *)((p) ^ swiz)) - -static int emulate_multiple(struct pt_regs *regs, unsigned char __user *addr, - unsigned int reg, unsigned int nb, - unsigned int flags, unsigned int instr, - unsigned long swiz) -{ - unsigned long *rptr; - unsigned int nb0, i, bswiz; - unsigned long p; - - /* - * We do not try to emulate 8 bytes multiple as they aren't really - * available in our operating environments and we don't try to - * emulate multiples operations in kernel land as they should never - * be used/generated there at least not on unaligned boundaries - */ - if (unlikely((nb > 4) || !user_mode(regs))) - return 0; - - /* lmw, stmw, lswi/x, stswi/x */ - nb0 = 0; - if (flags & HARD) { - if (flags & SX) { - nb = regs->xer & 127; - if (nb == 0) - return 1; - } else { - unsigned long pc = regs->nip ^ (swiz & 4); - - if (__get_user_inatomic(instr, - (unsigned int __user *)pc)) - return -EFAULT; - if (swiz == 0 && (flags & SW)) - instr = cpu_to_le32(instr); - nb = (instr >> 11) & 0x1f; - if (nb == 0) - nb = 32; - } - if (nb + reg * 4 > 128) { - nb0 = nb + reg * 4 - 128; - nb = 128 - reg * 4; - } -#ifdef __LITTLE_ENDIAN__ - /* - * String instructions are endian neutral but the code - * below is not. Force byte swapping on so that the - * effects of swizzling are undone in the load/store - * loops below. - */ - flags ^= SW; -#endif - } else { - /* lwm, stmw */ - nb = (32 - reg) * 4; - } - - if (!access_ok((flags & ST ? VERIFY_WRITE: VERIFY_READ), addr, nb+nb0)) - return -EFAULT; /* bad address */ - - rptr = ®s->gpr[reg]; - p = (unsigned long) addr; - bswiz = (flags & SW)? 3: 0; - - if (!(flags & ST)) { - /* - * This zeroes the top 4 bytes of the affected registers - * in 64-bit mode, and also zeroes out any remaining - * bytes of the last register for lsw*. - */ - memset(rptr, 0, ((nb + 3) / 4) * sizeof(unsigned long)); - if (nb0 > 0) - memset(®s->gpr[0], 0, - ((nb0 + 3) / 4) * sizeof(unsigned long)); - - for (i = 0; i < nb; ++i, ++p) - if (__get_user_inatomic(REG_BYTE(rptr, i ^ bswiz), - SWIZ_PTR(p))) - return -EFAULT; - if (nb0 > 0) { - rptr = ®s->gpr[0]; - addr += nb; - for (i = 0; i < nb0; ++i, ++p) - if (__get_user_inatomic(REG_BYTE(rptr, - i ^ bswiz), - SWIZ_PTR(p))) - return -EFAULT; - } - - } else { - for (i = 0; i < nb; ++i, ++p) - if (__put_user_inatomic(REG_BYTE(rptr, i ^ bswiz), - SWIZ_PTR(p))) - return -EFAULT; - if (nb0 > 0) { - rptr = ®s->gpr[0]; - addr += nb; - for (i = 0; i < nb0; ++i, ++p) - if (__put_user_inatomic(REG_BYTE(rptr, - i ^ bswiz), - SWIZ_PTR(p))) - return -EFAULT; - } - } - return 1; -} - -/* - * Emulate floating-point pair loads and stores. - * Only POWER6 has these instructions, and it does true little-endian, - * so we don't need the address swizzling. - */ -static int emulate_fp_pair(unsigned char __user *addr, unsigned int reg, - unsigned int flags) -{ - char *ptr0 = (char *) ¤t->thread.TS_FPR(reg); - char *ptr1 = (char *) ¤t->thread.TS_FPR(reg+1); - int i, ret, sw = 0; - - if (reg & 1) - return 0; /* invalid form: FRS/FRT must be even */ - if (flags & SW) - sw = 7; - ret = 0; - for (i = 0; i < 8; ++i) { - if (!(flags & ST)) { - ret |= __get_user(ptr0[i^sw], addr + i); - ret |= __get_user(ptr1[i^sw], addr + i + 8); - } else { - ret |= __put_user(ptr0[i^sw], addr + i); - ret |= __put_user(ptr1[i^sw], addr + i + 8); - } - } - if (ret) - return -EFAULT; - return 1; /* exception handled and fixed up */ -} - -#ifdef CONFIG_PPC64 -static int emulate_lq_stq(struct pt_regs *regs, unsigned char __user *addr, - unsigned int reg, unsigned int flags) -{ - char *ptr0 = (char *)®s->gpr[reg]; - char *ptr1 = (char *)®s->gpr[reg+1]; - int i, ret, sw = 0; - - if (reg & 1) - return 0; /* invalid form: GPR must be even */ - if (flags & SW) - sw = 7; - ret = 0; - for (i = 0; i < 8; ++i) { - if (!(flags & ST)) { - ret |= __get_user(ptr0[i^sw], addr + i); - ret |= __get_user(ptr1[i^sw], addr + i + 8); - } else { - ret |= __put_user(ptr0[i^sw], addr + i); - ret |= __put_user(ptr1[i^sw], addr + i + 8); - } - } - if (ret) - return -EFAULT; - return 1; /* exception handled and fixed up */ -} -#endif /* CONFIG_PPC64 */ #ifdef CONFIG_SPE @@ -636,133 +282,21 @@ static int emulate_spe(struct pt_regs *regs, unsigned int reg, } #endif /* CONFIG_SPE */ -#ifdef CONFIG_VSX -/* - * Emulate VSX instructions... - */ -static int emulate_vsx(unsigned char __user *addr, unsigned int reg, - unsigned int areg, struct pt_regs *regs, - unsigned int flags, unsigned int length, - unsigned int elsize) -{ - char *ptr; - unsigned long *lptr; - int ret = 0; - int sw = 0; - int i, j; - - /* userland only */ - if (unlikely(!user_mode(regs))) - return 0; - - flush_vsx_to_thread(current); - - if (reg < 32) - ptr = (char *) ¤t->thread.fp_state.fpr[reg][0]; - else - ptr = (char *) ¤t->thread.vr_state.vr[reg - 32]; - - lptr = (unsigned long *) ptr; - -#ifdef __LITTLE_ENDIAN__ - if (flags & SW) { - elsize = length; - sw = length-1; - } else { - /* - * The elements are BE ordered, even in LE mode, so process - * them in reverse order. - */ - addr += length - elsize; - - /* 8 byte memory accesses go in the top 8 bytes of the VR */ - if (length == 8) - ptr += 8; - } -#else - if (flags & SW) - sw = elsize-1; -#endif - - for (j = 0; j < length; j += elsize) { - for (i = 0; i < elsize; ++i) { - if (flags & ST) - ret |= __put_user(ptr[i^sw], addr + i); - else - ret |= __get_user(ptr[i^sw], addr + i); - } - ptr += elsize; -#ifdef __LITTLE_ENDIAN__ - addr -= elsize; -#else - addr += elsize; -#endif - } - -#ifdef __BIG_ENDIAN__ -#define VSX_HI 0 -#define VSX_LO 1 -#else -#define VSX_HI 1 -#define VSX_LO 0 -#endif - - if (!ret) { - if (flags & U) - regs->gpr[areg] = regs->dar; - - /* Splat load copies the same data to top and bottom 8 bytes */ - if (flags & SPLT) - lptr[VSX_LO] = lptr[VSX_HI]; - /* For 8 byte loads, zero the low 8 bytes */ - else if (!(flags & ST) && (8 == length)) - lptr[VSX_LO] = 0; - } else - return -EFAULT; - - return 1; -} -#endif - /* * Called on alignment exception. Attempts to fixup * * Return 1 on success * Return 0 if unable to handle the interrupt * Return -EFAULT if data address is bad + * Other negative return values indicate that the instruction can't + * be emulated, and the process should be given a SIGBUS. */ int fix_alignment(struct pt_regs *regs) { - unsigned int instr, nb, flags, instruction = 0; - unsigned int reg, areg; - unsigned int dsisr; - unsigned char __user *addr; - unsigned long p, swiz; - int ret, i; - union data { - u64 ll; - double dd; - unsigned char v[8]; - struct { -#ifdef __LITTLE_ENDIAN__ - int low32; - unsigned hi32; -#else - unsigned hi32; - int low32; -#endif - } x32; - struct { -#ifdef __LITTLE_ENDIAN__ - short low16; - unsigned char hi48[6]; -#else - unsigned char hi48[6]; - short low16; -#endif - } x16; - } data; + unsigned int instr; + struct instruction_op op; + int r, type; /* * We require a complete register set, if not, then our assembly @@ -770,121 +304,23 @@ int fix_alignment(struct pt_regs *regs) */ CHECK_FULL_REGS(regs); - dsisr = regs->dsisr; - - /* Some processors don't provide us with a DSISR we can use here, - * let's make one up from the instruction - */ - if (cpu_has_feature(CPU_FTR_NODSISRALIGN)) { - unsigned long pc = regs->nip; - - if (cpu_has_feature(CPU_FTR_PPC_LE) && (regs->msr & MSR_LE)) - pc ^= 4; - if (unlikely(__get_user_inatomic(instr, - (unsigned int __user *)pc))) - return -EFAULT; - if (cpu_has_feature(CPU_FTR_REAL_LE) && (regs->msr & MSR_LE)) - instr = cpu_to_le32(instr); - dsisr = make_dsisr(instr); - instruction = instr; + if (unlikely(__get_user(instr, (unsigned int __user *)regs->nip))) + return -EFAULT; + if ((regs->msr & MSR_LE) != (MSR_KERNEL & MSR_LE)) { + /* We don't handle PPC little-endian any more... */ + if (cpu_has_feature(CPU_FTR_PPC_LE)) + return -EIO; + instr = swab32(instr); } - /* extract the operation and registers from the dsisr */ - reg = (dsisr >> 5) & 0x1f; /* source/dest register */ - areg = dsisr & 0x1f; /* register to update */ - #ifdef CONFIG_SPE if ((instr >> 26) == 0x4) { + int reg = (instr >> 21) & 0x1f; PPC_WARN_ALIGNMENT(spe, regs); return emulate_spe(regs, reg, instr); } #endif - instr = (dsisr >> 10) & 0x7f; - instr |= (dsisr >> 13) & 0x60; - - /* Lookup the operation in our table */ - nb = aligninfo[instr].len; - flags = aligninfo[instr].flags; - - /* - * Handle some cases which give overlaps in the DSISR values. - */ - if (IS_XFORM(instruction)) { - switch (get_xop(instruction)) { - case 532: /* ldbrx */ - nb = 8; - flags = LD+SW; - break; - case 660: /* stdbrx */ - nb = 8; - flags = ST+SW; - break; - case 20: /* lwarx */ - case 84: /* ldarx */ - case 116: /* lharx */ - case 276: /* lqarx */ - return 0; /* not emulated ever */ - } - } - - /* Byteswap little endian loads and stores */ - swiz = 0; - if ((regs->msr & MSR_LE) != (MSR_KERNEL & MSR_LE)) { - flags ^= SW; -#ifdef __BIG_ENDIAN__ - /* - * So-called "PowerPC little endian" mode works by - * swizzling addresses rather than by actually doing - * any byte-swapping. To emulate this, we XOR each - * byte address with 7. We also byte-swap, because - * the processor's address swizzling depends on the - * operand size (it xors the address with 7 for bytes, - * 6 for halfwords, 4 for words, 0 for doublewords) but - * we will xor with 7 and load/store each byte separately. - */ - if (cpu_has_feature(CPU_FTR_PPC_LE)) - swiz = 7; -#endif - } - - /* DAR has the operand effective address */ - addr = (unsigned char __user *)regs->dar; - -#ifdef CONFIG_VSX - if ((instruction & 0xfc00003e) == 0x7c000018) { - unsigned int elsize; - - /* Additional register addressing bit (64 VSX vs 32 FPR/GPR) */ - reg |= (instruction & 0x1) << 5; - /* Simple inline decoder instead of a table */ - /* VSX has only 8 and 16 byte memory accesses */ - nb = 8; - if (instruction & 0x200) - nb = 16; - - /* Vector stores in little-endian mode swap individual - elements, so process them separately */ - elsize = 4; - if (instruction & 0x80) - elsize = 8; - - flags = 0; - if ((regs->msr & MSR_LE) != (MSR_KERNEL & MSR_LE)) - flags |= SW; - if (instruction & 0x100) - flags |= ST; - if (instruction & 0x040) - flags |= U; - /* splat load needs a special decoder */ - if ((instruction & 0x400) == 0){ - flags |= SPLT; - nb = 8; - } - PPC_WARN_ALIGNMENT(vsx, regs); - return emulate_vsx(addr, reg, areg, regs, flags, nb, elsize); - } -#endif /* * ISA 3.0 (such as P9) copy, copy_first, paste and paste_last alignment @@ -896,173 +332,27 @@ int fix_alignment(struct pt_regs *regs) * when pasting to a co-processor. Furthermore, paste_last is the * synchronisation point for preceding copy/paste sequences. */ - if ((instruction & 0xfc0006fe) == PPC_INST_COPY) + if ((instr & 0xfc0006fe) == (PPC_INST_COPY & 0xfc0006fe)) return -EIO; - /* A size of 0 indicates an instruction we don't support, with - * the exception of DCBZ which is handled as a special case here - */ - if (instr == DCBZ) { - PPC_WARN_ALIGNMENT(dcbz, regs); - return emulate_dcbz(regs, addr); - } - if (unlikely(nb == 0)) - return 0; - - /* Load/Store Multiple instructions are handled in their own - * function - */ - if (flags & M) { - PPC_WARN_ALIGNMENT(multiple, regs); - return emulate_multiple(regs, addr, reg, nb, - flags, instr, swiz); - } - - /* Verify the address of the operand */ - if (unlikely(user_mode(regs) && - !access_ok((flags & ST ? VERIFY_WRITE : VERIFY_READ), - addr, nb))) - return -EFAULT; - - /* Force the fprs into the save area so we can reference them */ - if (flags & F) { - /* userland only */ - if (unlikely(!user_mode(regs))) - return 0; - flush_fp_to_thread(current); - } + r = analyse_instr(&op, regs, instr); + if (r < 0) + return -EINVAL; - if (nb == 16) { - if (flags & F) { - /* Special case for 16-byte FP loads and stores */ - PPC_WARN_ALIGNMENT(fp_pair, regs); - return emulate_fp_pair(addr, reg, flags); - } else { -#ifdef CONFIG_PPC64 - /* Special case for 16-byte loads and stores */ - PPC_WARN_ALIGNMENT(lq_stq, regs); - return emulate_lq_stq(regs, addr, reg, flags); -#else - return 0; -#endif - } - } - - PPC_WARN_ALIGNMENT(unaligned, regs); - - /* If we are loading, get the data from user space, else - * get it from register values - */ - if (!(flags & ST)) { - unsigned int start = 0; - - switch (nb) { - case 4: - start = offsetof(union data, x32.low32); - break; - case 2: - start = offsetof(union data, x16.low16); - break; - } - - data.ll = 0; - ret = 0; - p = (unsigned long)addr; - - for (i = 0; i < nb; i++) - ret |= __get_user_inatomic(data.v[start + i], - SWIZ_PTR(p++)); - - if (unlikely(ret)) - return -EFAULT; - - } else if (flags & F) { - data.ll = current->thread.TS_FPR(reg); - if (flags & S) { - /* Single-precision FP store requires conversion... */ -#ifdef CONFIG_PPC_FPU - preempt_disable(); - enable_kernel_fp(); - cvt_df(&data.dd, (float *)&data.x32.low32); - disable_kernel_fp(); - preempt_enable(); -#else - return 0; -#endif - } - } else - data.ll = regs->gpr[reg]; - - if (flags & SW) { - switch (nb) { - case 8: - data.ll = swab64(data.ll); - break; - case 4: - data.x32.low32 = swab32(data.x32.low32); - break; - case 2: - data.x16.low16 = swab16(data.x16.low16); - break; - } - } - - /* Perform other misc operations like sign extension - * or floating point single precision conversion - */ - switch (flags & ~(U|SW)) { - case LD+SE: /* sign extending integer loads */ - case LD+F+SE: /* sign extend for lfiwax */ - if ( nb == 2 ) - data.ll = data.x16.low16; - else /* nb must be 4 */ - data.ll = data.x32.low32; - break; - - /* Single-precision FP load requires conversion... */ - case LD+F+S: -#ifdef CONFIG_PPC_FPU - preempt_disable(); - enable_kernel_fp(); - cvt_fd((float *)&data.x32.low32, &data.dd); - disable_kernel_fp(); - preempt_enable(); -#else - return 0; -#endif - break; + type = op.type & INSTR_TYPE_MASK; + if (!OP_IS_LOAD_STORE(type)) { + if (op.type != CACHEOP + DCBZ) + return -EINVAL; + PPC_WARN_ALIGNMENT(dcbz, regs); + r = emulate_dcbz(op.ea, regs); + } else { + if (type == LARX || type == STCX) + return -EIO; + PPC_WARN_ALIGNMENT(unaligned, regs); + r = emulate_loadstore(regs, &op); } - /* Store result to memory or update registers */ - if (flags & ST) { - unsigned int start = 0; - - switch (nb) { - case 4: - start = offsetof(union data, x32.low32); - break; - case 2: - start = offsetof(union data, x16.low16); - break; - } - - ret = 0; - p = (unsigned long)addr; - - for (i = 0; i < nb; i++) - ret |= __put_user_inatomic(data.v[start + i], - SWIZ_PTR(p++)); - - if (unlikely(ret)) - return -EFAULT; - } else if (flags & F) - current->thread.TS_FPR(reg) = data.ll; - else - regs->gpr[reg] = data.ll; - - /* Update RA as needed */ - if (flags & U) - regs->gpr[areg] = regs->dar; - - return 1; + if (!r) + return 1; + return r; } diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 6e95c2c19a7e..6b958414b4e0 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -185,7 +185,7 @@ int main(void) #ifdef CONFIG_PPC_MM_SLICES OFFSET(PACALOWSLICESPSIZE, paca_struct, mm_ctx_low_slices_psize); OFFSET(PACAHIGHSLICEPSIZE, paca_struct, mm_ctx_high_slices_psize); - DEFINE(PACA_ADDR_LIMIT, offsetof(struct paca_struct, addr_limit)); + OFFSET(PACA_SLB_ADDR_LIMIT, paca_struct, mm_ctx_slb_addr_limit); DEFINE(MMUPSIZEDEFSIZE, sizeof(struct mmu_psize_def)); #endif /* CONFIG_PPC_MM_SLICES */ #endif @@ -208,7 +208,7 @@ int main(void) OFFSET(TCD_ESEL_FIRST, tlb_core_data, esel_first); #endif /* CONFIG_PPC_BOOK3E */ -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 OFFSET(PACASLBCACHE, paca_struct, slb_cache); OFFSET(PACASLBCACHEPTR, paca_struct, slb_cache_ptr); OFFSET(PACAVMALLOCSLLP, paca_struct, vmalloc_sllp); @@ -230,7 +230,7 @@ int main(void) OFFSET(LPPACA_DTLIDX, lppaca, dtl_idx); OFFSET(LPPACA_YIELDCOUNT, lppaca, yield_count); OFFSET(PACA_DTL_RIDX, paca_struct, dtl_ridx); -#endif /* CONFIG_PPC_STD_MMU_64 */ +#endif /* CONFIG_PPC_BOOK3S_64 */ OFFSET(PACAEMERGSP, paca_struct, emergency_sp); #ifdef CONFIG_PPC_BOOK3S_64 OFFSET(PACAMCEMERGSP, paca_struct, mc_emergency_sp); @@ -642,6 +642,7 @@ int main(void) HSTATE_FIELD(HSTATE_SAVED_XIRR, saved_xirr); HSTATE_FIELD(HSTATE_HOST_IPI, host_ipi); HSTATE_FIELD(HSTATE_PTID, ptid); + HSTATE_FIELD(HSTATE_TID, tid); HSTATE_FIELD(HSTATE_MMCR0, host_mmcr[0]); HSTATE_FIELD(HSTATE_MMCR1, host_mmcr[1]); HSTATE_FIELD(HSTATE_MMCRA, host_mmcr[2]); @@ -667,6 +668,8 @@ int main(void) OFFSET(KVM_SPLIT_LDBAR, kvm_split_mode, ldbar); OFFSET(KVM_SPLIT_DO_NAP, kvm_split_mode, do_nap); OFFSET(KVM_SPLIT_NAPPED, kvm_split_mode, napped); + OFFSET(KVM_SPLIT_DO_SET, kvm_split_mode, do_set); + OFFSET(KVM_SPLIT_DO_RESTORE, kvm_split_mode, do_restore); #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ #ifdef CONFIG_PPC_BOOK3S_64 @@ -746,6 +749,14 @@ int main(void) OFFSET(PACA_SUBCORE_SIBLING_MASK, paca_struct, subcore_sibling_mask); OFFSET(PACA_SIBLING_PACA_PTRS, paca_struct, thread_sibling_pacas); OFFSET(PACA_REQ_PSSCR, paca_struct, requested_psscr); +#define STOP_SPR(x, f) OFFSET(x, paca_struct, stop_sprs.f) + STOP_SPR(STOP_PID, pid); + STOP_SPR(STOP_LDBAR, ldbar); + STOP_SPR(STOP_FSCR, fscr); + STOP_SPR(STOP_HFSCR, hfscr); + STOP_SPR(STOP_MMCR1, mmcr1); + STOP_SPR(STOP_MMCR2, mmcr2); + STOP_SPR(STOP_MMCRA, mmcra); #endif DEFINE(PPC_DBELL_SERVER, PPC_DBELL_SERVER); diff --git a/arch/powerpc/kernel/audit.c b/arch/powerpc/kernel/audit.c index a4dab7cab348..a2dddd7f3d09 100644 --- a/arch/powerpc/kernel/audit.c +++ b/arch/powerpc/kernel/audit.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/init.h> #include <linux/types.h> #include <linux/audit.h> diff --git a/arch/powerpc/kernel/btext.c b/arch/powerpc/kernel/btext.c index 8275858a434d..6537cba1a758 100644 --- a/arch/powerpc/kernel/btext.c +++ b/arch/powerpc/kernel/btext.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Procedures for drawing on the screen early on in the boot process. * @@ -253,7 +254,7 @@ int __init btext_find_display(int allow_nonstdout) for_each_node_by_type(np, "display") { if (of_get_property(np, "linux,opened", NULL)) { - printk("trying %s ...\n", np->full_name); + printk("trying %pOF ...\n", np); rc = btext_initialize(np); printk("result: %d\n", rc); } diff --git a/arch/powerpc/kernel/cacheinfo.c b/arch/powerpc/kernel/cacheinfo.c index c641983bbdd6..a8f20e5928e1 100644 --- a/arch/powerpc/kernel/cacheinfo.c +++ b/arch/powerpc/kernel/cacheinfo.c @@ -167,10 +167,10 @@ static void release_cache_debugcheck(struct cache *cache) list_for_each_entry(iter, &cache_list, list) WARN_ONCE(iter->next_local == cache, - "cache for %s(%s) refers to cache for %s(%s)\n", - iter->ofnode->full_name, + "cache for %pOF(%s) refers to cache for %pOF(%s)\n", + iter->ofnode, cache_type_string(iter), - cache->ofnode->full_name, + cache->ofnode, cache_type_string(cache)); } @@ -179,8 +179,8 @@ static void release_cache(struct cache *cache) if (!cache) return; - pr_debug("freeing L%d %s cache for %s\n", cache->level, - cache_type_string(cache), cache->ofnode->full_name); + pr_debug("freeing L%d %s cache for %pOF\n", cache->level, + cache_type_string(cache), cache->ofnode); release_cache_debugcheck(cache); list_del(&cache->list); @@ -194,8 +194,8 @@ static void cache_cpu_set(struct cache *cache, int cpu) while (next) { WARN_ONCE(cpumask_test_cpu(cpu, &next->shared_cpu_map), - "CPU %i already accounted in %s(%s)\n", - cpu, next->ofnode->full_name, + "CPU %i already accounted in %pOF(%s)\n", + cpu, next->ofnode, cache_type_string(next)); cpumask_set_cpu(cpu, &next->shared_cpu_map); next = next->next_local; @@ -355,7 +355,7 @@ static int cache_is_unified_d(const struct device_node *np) */ static struct cache *cache_do_one_devnode_unified(struct device_node *node, int level) { - pr_debug("creating L%d ucache for %s\n", level, node->full_name); + pr_debug("creating L%d ucache for %pOF\n", level, node); return new_cache(cache_is_unified_d(node), level, node); } @@ -365,8 +365,8 @@ static struct cache *cache_do_one_devnode_split(struct device_node *node, { struct cache *dcache, *icache; - pr_debug("creating L%d dcache and icache for %s\n", level, - node->full_name); + pr_debug("creating L%d dcache and icache for %pOF\n", level, + node); dcache = new_cache(CACHE_TYPE_DATA, level, node); icache = new_cache(CACHE_TYPE_INSTRUCTION, level, node); @@ -679,7 +679,6 @@ static struct kobj_type cache_index_type = { static void cacheinfo_create_index_opt_attrs(struct cache_index_dir *dir) { - const char *cache_name; const char *cache_type; struct cache *cache; char *buf; @@ -690,7 +689,6 @@ static void cacheinfo_create_index_opt_attrs(struct cache_index_dir *dir) return; cache = dir->cache; - cache_name = cache->ofnode->full_name; cache_type = cache_type_string(cache); /* We don't want to create an attribute that can't provide a @@ -707,14 +705,14 @@ static void cacheinfo_create_index_opt_attrs(struct cache_index_dir *dir) rc = attr->show(&dir->kobj, attr, buf); if (rc <= 0) { pr_debug("not creating %s attribute for " - "%s(%s) (rc = %zd)\n", - attr->attr.name, cache_name, + "%pOF(%s) (rc = %zd)\n", + attr->attr.name, cache->ofnode, cache_type, rc); continue; } if (sysfs_create_file(&dir->kobj, &attr->attr)) - pr_debug("could not create %s attribute for %s(%s)\n", - attr->attr.name, cache_name, cache_type); + pr_debug("could not create %s attribute for %pOF(%s)\n", + attr->attr.name, cache->ofnode, cache_type); } kfree(buf); @@ -831,8 +829,8 @@ static void cache_cpu_clear(struct cache *cache, int cpu) struct cache *next = cache->next_local; WARN_ONCE(!cpumask_test_cpu(cpu, &cache->shared_cpu_map), - "CPU %i not accounted in %s(%s)\n", - cpu, cache->ofnode->full_name, + "CPU %i not accounted in %pOF(%s)\n", + cpu, cache->ofnode, cache_type_string(cache)); cpumask_clear_cpu(cpu, &cache->shared_cpu_map); diff --git a/arch/powerpc/kernel/cacheinfo.h b/arch/powerpc/kernel/cacheinfo.h index a7b74d36acd7..955f5e999f1b 100644 --- a/arch/powerpc/kernel/cacheinfo.h +++ b/arch/powerpc/kernel/cacheinfo.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _PPC_CACHEINFO_H #define _PPC_CACHEINFO_H diff --git a/arch/powerpc/kernel/compat_audit.c b/arch/powerpc/kernel/compat_audit.c index 108ff14e2122..55c6ccda0a85 100644 --- a/arch/powerpc/kernel/compat_audit.c +++ b/arch/powerpc/kernel/compat_audit.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #undef __powerpc64__ #include <asm/unistd.h> diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c index 6f849832a669..1350f49d81a8 100644 --- a/arch/powerpc/kernel/cputable.c +++ b/arch/powerpc/kernel/cputable.c @@ -547,11 +547,31 @@ static struct cpu_spec __initdata cpu_specs[] = { .machine_check_early = __machine_check_early_realmode_p9, .platform = "power9", }, - { /* Power9 */ + { /* Power9 DD2.0 */ + .pvr_mask = 0xffffefff, + .pvr_value = 0x004e0200, + .cpu_name = "POWER9 (raw)", + .cpu_features = CPU_FTRS_POWER9_DD2_0, + .cpu_user_features = COMMON_USER_POWER9, + .cpu_user_features2 = COMMON_USER2_POWER9, + .mmu_features = MMU_FTRS_POWER9, + .icache_bsize = 128, + .dcache_bsize = 128, + .num_pmcs = 6, + .pmc_type = PPC_PMC_IBM, + .oprofile_cpu_type = "ppc64/power9", + .oprofile_type = PPC_OPROFILE_INVALID, + .cpu_setup = __setup_cpu_power9, + .cpu_restore = __restore_cpu_power9, + .flush_tlb = __flush_tlb_power9, + .machine_check_early = __machine_check_early_realmode_p9, + .platform = "power9", + }, + { /* Power9 DD 2.1 or later (see DD2.0 above) */ .pvr_mask = 0xffff0000, .pvr_value = 0x004e0000, .cpu_name = "POWER9 (raw)", - .cpu_features = CPU_FTRS_POWER9, + .cpu_features = CPU_FTRS_POWER9_DD2_1, .cpu_user_features = COMMON_USER_POWER9, .cpu_user_features2 = COMMON_USER2_POWER9, .mmu_features = MMU_FTRS_POWER9, @@ -1259,10 +1279,10 @@ static struct cpu_spec __initdata cpu_specs[] = { .platform = "ppc603", }, #endif /* CONFIG_PPC_BOOK3S_32 */ -#ifdef CONFIG_8xx +#ifdef CONFIG_PPC_8xx { /* 8xx */ .pvr_mask = 0xffff0000, - .pvr_value = 0x00500000, + .pvr_value = PVR_8xx, .cpu_name = "8xx", /* CPU_FTR_MAYBE_CAN_DOZE is possible, * if the 8xx code is there.... */ @@ -1274,7 +1294,7 @@ static struct cpu_spec __initdata cpu_specs[] = { .machine_check = machine_check_8xx, .platform = "ppc823", }, -#endif /* CONFIG_8xx */ +#endif /* CONFIG_PPC_8xx */ #ifdef CONFIG_40x { /* 403GC */ .pvr_mask = 0xffffff00, @@ -1936,6 +1956,7 @@ static struct cpu_spec __initdata cpu_specs[] = { .machine_check = machine_check_440A, .platform = "ppc440", }, +#ifdef CONFIG_PPC_47x { /* 476 DD2 core */ .pvr_mask = 0xffffffff, .pvr_value = 0x11a52080, @@ -1992,6 +2013,7 @@ static struct cpu_spec __initdata cpu_specs[] = { .machine_check = machine_check_47x, .platform = "ppc470", }, +#endif /* CONFIG_PPC_47x */ { /* default match */ .pvr_mask = 0x00000000, .pvr_value = 0x00000000, diff --git a/arch/powerpc/kernel/dma-iommu.c b/arch/powerpc/kernel/dma-iommu.c index 8f7abf9baa63..66f33e7f8d40 100644 --- a/arch/powerpc/kernel/dma-iommu.c +++ b/arch/powerpc/kernel/dma-iommu.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2006 Benjamin Herrenschmidt, IBM Corporation * diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c index 1df770e8cbe0..8bdc2f96c5d6 100644 --- a/arch/powerpc/kernel/dt_cpu_ftrs.c +++ b/arch/powerpc/kernel/dt_cpu_ftrs.c @@ -102,10 +102,10 @@ static void cpufeatures_flush_tlb(void) case PVR_POWER8: case PVR_POWER8E: case PVR_POWER8NVL: - __flush_tlb_power8(POWER8_TLB_SETS); + __flush_tlb_power8(TLB_INVAL_SCOPE_GLOBAL); break; case PVR_POWER9: - __flush_tlb_power9(POWER9_TLB_SETS_HASH); + __flush_tlb_power9(TLB_INVAL_SCOPE_GLOBAL); break; default: pr_err("unknown CPU version for boot TLB flush\n"); @@ -634,7 +634,7 @@ static struct dt_cpu_feature_match __initdata {"no-execute", feat_enable, 0}, {"strong-access-ordering", feat_enable, CPU_FTR_SAO}, {"cache-inhibited-large-page", feat_enable_large_ci, 0}, - {"coprocessor-icswx", feat_enable, CPU_FTR_ICSWX}, + {"coprocessor-icswx", feat_enable, 0}, {"hypervisor-virtualization-interrupt", feat_enable_hvi, 0}, {"program-priority-register", feat_enable, CPU_FTR_HAS_PPR}, {"wait", feat_enable, 0}, @@ -735,6 +735,8 @@ static __init void cpufeatures_cpu_quirks(void) */ if ((version & 0xffffff00) == 0x004e0100) cur_cpu_spec->cpu_features |= CPU_FTR_POWER9_DD1; + else if ((version & 0xffffefff) == 0x004e0201) + cur_cpu_spec->cpu_features |= CPU_FTR_POWER9_DD2_1; } static void __init cpufeatures_setup_finished(void) diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 63992b2d8e15..cbca0a667682 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -44,6 +44,7 @@ #include <asm/machdep.h> #include <asm/ppc-pci.h> #include <asm/rtas.h> +#include <asm/pte-walk.h> /** Overview: @@ -169,10 +170,10 @@ static size_t eeh_dump_dev_log(struct eeh_dev *edev, char *buf, size_t len) char buffer[128]; n += scnprintf(buf+n, len-n, "%04x:%02x:%02x.%01x\n", - edev->phb->global_number, pdn->busno, + pdn->phb->global_number, pdn->busno, PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn)); pr_warn("EEH: of node=%04x:%02x:%02x.%01x\n", - edev->phb->global_number, pdn->busno, + pdn->phb->global_number, pdn->busno, PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn)); eeh_ops->read_config(pdn, PCI_VENDOR_ID, 4, &cfg); @@ -352,8 +353,7 @@ static inline unsigned long eeh_token_to_phys(unsigned long token) * worried about _PAGE_SPLITTING/collapse. Also we will not hit * page table free, because of init_mm. */ - ptep = __find_linux_pte_or_hugepte(init_mm.pgd, token, - NULL, &hugepage_shift); + ptep = find_init_mm_pte(token, &hugepage_shift); if (!ptep) return token; WARN_ON(hugepage_shift); @@ -435,7 +435,7 @@ int eeh_dev_check_failure(struct eeh_dev *edev) int ret; int active_flags = (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE); unsigned long flags; - struct pci_dn *pdn; + struct device_node *dn; struct pci_dev *dev; struct eeh_pe *pe, *parent_pe, *phb_pe; int rc = 0; @@ -493,9 +493,10 @@ int eeh_dev_check_failure(struct eeh_dev *edev) if (pe->state & EEH_PE_ISOLATED) { pe->check_count++; if (pe->check_count % EEH_MAX_FAILS == 0) { - pdn = eeh_dev_to_pdn(edev); - if (pdn->node) - location = of_get_property(pdn->node, "ibm,loc-code", NULL); + dn = pci_device_to_OF_node(dev); + if (dn) + location = of_get_property(dn, "ibm,loc-code", + NULL); printk(KERN_ERR "EEH: %d reads ignored for recovering device at " "location=%s driver=%s pci addr=%s\n", pe->check_count, @@ -971,6 +972,18 @@ static struct notifier_block eeh_reboot_nb = { .notifier_call = eeh_reboot_notifier, }; +void eeh_probe_devices(void) +{ + struct pci_controller *hose, *tmp; + struct pci_dn *pdn; + + /* Enable EEH for all adapters */ + list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { + pdn = hose->pci_data; + traverse_pci_dn(pdn, eeh_ops->probe, NULL); + } +} + /** * eeh_init - EEH initialization * @@ -986,22 +999,11 @@ static struct notifier_block eeh_reboot_nb = { * Even if force-off is set, the EEH hardware is still enabled, so that * newer systems can boot. */ -int eeh_init(void) +static int eeh_init(void) { struct pci_controller *hose, *tmp; - struct pci_dn *pdn; - static int cnt = 0; int ret = 0; - /* - * We have to delay the initialization on PowerNV after - * the PCI hierarchy tree has been built because the PEs - * are figured out based on PCI devices instead of device - * tree nodes - */ - if (machine_is(powernv) && cnt++ <= 0) - return ret; - /* Register reboot notifier */ ret = register_reboot_notifier(&eeh_reboot_nb); if (ret) { @@ -1018,27 +1020,16 @@ int eeh_init(void) } else if ((ret = eeh_ops->init())) return ret; + /* Initialize PHB PEs */ + list_for_each_entry_safe(hose, tmp, &hose_list, list_node) + eeh_dev_phb_init_dynamic(hose); + /* Initialize EEH event */ ret = eeh_event_init(); if (ret) return ret; - /* Enable EEH for all adapters */ - list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { - pdn = hose->pci_data; - traverse_pci_dn(pdn, eeh_ops->probe, NULL); - } - - /* - * Call platform post-initialization. Actually, It's good chance - * to inform platform that EEH is ready to supply service if the - * I/O cache stuff has been built up. - */ - if (eeh_ops->post_init) { - ret = eeh_ops->post_init(); - if (ret) - return ret; - } + eeh_probe_devices(); if (eeh_enabled()) pr_info("EEH: PCI Enhanced I/O Error Handling Enabled\n"); @@ -1064,7 +1055,7 @@ core_initcall_sync(eeh_init); */ void eeh_add_device_early(struct pci_dn *pdn) { - struct pci_controller *phb; + struct pci_controller *phb = pdn ? pdn->phb : NULL; struct eeh_dev *edev = pdn_to_eeh_dev(pdn); if (!edev) @@ -1074,7 +1065,6 @@ void eeh_add_device_early(struct pci_dn *pdn) return; /* USB Bus children of PCI devices will not have BUID's */ - phb = edev->phb; if (NULL == phb || (eeh_has_flag(EEH_PROBE_MODE_DEVTREE) && 0 == phb->buid)) return; @@ -1753,10 +1743,6 @@ static int eeh_enable_dbgfs_set(void *data, u64 val) else eeh_add_flag(EEH_FORCE_DISABLED); - /* Notify the backend */ - if (eeh_ops->post_init) - eeh_ops->post_init(); - return 0; } diff --git a/arch/powerpc/kernel/eeh_dev.c b/arch/powerpc/kernel/eeh_dev.c index d6b2ca70d14d..a34e6912c15e 100644 --- a/arch/powerpc/kernel/eeh_dev.c +++ b/arch/powerpc/kernel/eeh_dev.c @@ -50,21 +50,16 @@ */ struct eeh_dev *eeh_dev_init(struct pci_dn *pdn) { - struct pci_controller *phb = pdn->phb; struct eeh_dev *edev; /* Allocate EEH device */ edev = kzalloc(sizeof(*edev), GFP_KERNEL); - if (!edev) { - pr_warn("%s: out of memory\n", - __func__); + if (!edev) return NULL; - } /* Associate EEH device with OF node */ pdn->edev = edev; edev->pdn = pdn; - edev->phb = phb; INIT_LIST_HEAD(&edev->list); INIT_LIST_HEAD(&edev->rmv_list); @@ -83,21 +78,3 @@ void eeh_dev_phb_init_dynamic(struct pci_controller *phb) /* EEH PE for PHB */ eeh_phb_pe_create(phb); } - -/** - * eeh_dev_phb_init - Create EEH devices for devices included in existing PHBs - * - * Scan all the existing PHBs and create EEH devices for their OF - * nodes and their children OF nodes - */ -static int __init eeh_dev_phb_init(void) -{ - struct pci_controller *phb, *tmp; - - list_for_each_entry_safe(phb, tmp, &hose_list, list_node) - eeh_dev_phb_init_dynamic(phb); - - return 0; -} - -core_initcall(eeh_dev_phb_init); diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index c405c79e50cd..4f71e4c9beb7 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -428,7 +428,7 @@ static void *eeh_add_virt_device(void *data, void *userdata) if (!(edev->physfn)) { pr_warn("%s: EEH dev %04x:%02x:%02x.%01x not for VF\n", - __func__, edev->phb->global_number, pdn->busno, + __func__, pdn->phb->global_number, pdn->busno, PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn)); return NULL; } @@ -441,7 +441,7 @@ static void *eeh_add_virt_device(void *data, void *userdata) } #ifdef CONFIG_PPC_POWERNV - pci_iov_add_virtfn(edev->physfn, pdn->vf_index, 0); + pci_iov_add_virtfn(edev->physfn, pdn->vf_index); #endif return NULL; } @@ -499,7 +499,7 @@ static void *eeh_rmv_device(void *data, void *userdata) #ifdef CONFIG_PPC_POWERNV struct pci_dn *pdn = eeh_dev_to_pdn(edev); - pci_iov_remove_virtfn(edev->physfn, pdn->vf_index, 0); + pci_iov_remove_virtfn(edev->physfn, pdn->vf_index); edev->pdev = NULL; /* @@ -623,7 +623,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus, struct eeh_rmv_data *rmv_data) { struct pci_bus *frozen_bus = eeh_pe_bus_get(pe); - struct timeval tstamp; + time64_t tstamp; int cnt, rc; struct eeh_dev *edev; diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c index cc4b206f77e4..2d4956e97aa9 100644 --- a/arch/powerpc/kernel/eeh_pe.c +++ b/arch/powerpc/kernel/eeh_pe.c @@ -230,10 +230,15 @@ void *eeh_pe_dev_traverse(struct eeh_pe *root, * Bus/Device/Function number. The extra data referred by flag * indicates which type of address should be used. */ +struct eeh_pe_get_flag { + int pe_no; + int config_addr; +}; + static void *__eeh_pe_get(void *data, void *flag) { struct eeh_pe *pe = (struct eeh_pe *)data; - struct eeh_dev *edev = (struct eeh_dev *)flag; + struct eeh_pe_get_flag *tmp = (struct eeh_pe_get_flag *) flag; /* Unexpected PHB PE */ if (pe->type & EEH_PE_PHB) @@ -244,17 +249,17 @@ static void *__eeh_pe_get(void *data, void *flag) * have non-zero PE address */ if (eeh_has_flag(EEH_VALID_PE_ZERO)) { - if (edev->pe_config_addr == pe->addr) + if (tmp->pe_no == pe->addr) return pe; } else { - if (edev->pe_config_addr && - (edev->pe_config_addr == pe->addr)) + if (tmp->pe_no && + (tmp->pe_no == pe->addr)) return pe; } /* Try BDF address */ - if (edev->config_addr && - (edev->config_addr == pe->config_addr)) + if (tmp->config_addr && + (tmp->config_addr == pe->config_addr)) return pe; return NULL; @@ -262,7 +267,9 @@ static void *__eeh_pe_get(void *data, void *flag) /** * eeh_pe_get - Search PE based on the given address - * @edev: EEH device + * @phb: PCI controller + * @pe_no: PE number + * @config_addr: Config address * * Search the corresponding PE based on the specified address which * is included in the eeh device. The function is used to check if @@ -271,12 +278,14 @@ static void *__eeh_pe_get(void *data, void *flag) * which is composed of PCI bus/device/function number, or unified * PE address. */ -struct eeh_pe *eeh_pe_get(struct eeh_dev *edev) +struct eeh_pe *eeh_pe_get(struct pci_controller *phb, + int pe_no, int config_addr) { - struct eeh_pe *root = eeh_phb_pe_get(edev->phb); + struct eeh_pe *root = eeh_phb_pe_get(phb); + struct eeh_pe_get_flag tmp = { pe_no, config_addr }; struct eeh_pe *pe; - pe = eeh_pe_traverse(root, __eeh_pe_get, edev); + pe = eeh_pe_traverse(root, __eeh_pe_get, &tmp); return pe; } @@ -330,11 +339,13 @@ static struct eeh_pe *eeh_pe_get_parent(struct eeh_dev *edev) int eeh_add_to_parent_pe(struct eeh_dev *edev) { struct eeh_pe *pe, *parent; + struct pci_dn *pdn = eeh_dev_to_pdn(edev); + int config_addr = (pdn->busno << 8) | (pdn->devfn); /* Check if the PE number is valid */ if (!eeh_has_flag(EEH_VALID_PE_ZERO) && !edev->pe_config_addr) { pr_err("%s: Invalid PE#0 for edev 0x%x on PHB#%x\n", - __func__, edev->config_addr, edev->phb->global_number); + __func__, config_addr, pdn->phb->global_number); return -EINVAL; } @@ -344,7 +355,7 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev) * PE should be composed of PCI bus and its subordinate * components. */ - pe = eeh_pe_get(edev); + pe = eeh_pe_get(pdn->phb, edev->pe_config_addr, config_addr); if (pe && !(pe->type & EEH_PE_INVALID)) { /* Mark the PE as type of PCI bus */ pe->type = EEH_PE_BUS; @@ -353,11 +364,11 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev) /* Put the edev to PE */ list_add_tail(&edev->list, &pe->edevs); pr_debug("EEH: Add %04x:%02x:%02x.%01x to Bus PE#%x\n", - edev->phb->global_number, - edev->config_addr >> 8, - PCI_SLOT(edev->config_addr & 0xFF), - PCI_FUNC(edev->config_addr & 0xFF), - pe->addr); + pdn->phb->global_number, + pdn->busno, + PCI_SLOT(pdn->devfn), + PCI_FUNC(pdn->devfn), + pe->addr); return 0; } else if (pe && (pe->type & EEH_PE_INVALID)) { list_add_tail(&edev->list, &pe->edevs); @@ -376,25 +387,25 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev) pr_debug("EEH: Add %04x:%02x:%02x.%01x to Device " "PE#%x, Parent PE#%x\n", - edev->phb->global_number, - edev->config_addr >> 8, - PCI_SLOT(edev->config_addr & 0xFF), - PCI_FUNC(edev->config_addr & 0xFF), - pe->addr, pe->parent->addr); + pdn->phb->global_number, + pdn->busno, + PCI_SLOT(pdn->devfn), + PCI_FUNC(pdn->devfn), + pe->addr, pe->parent->addr); return 0; } /* Create a new EEH PE */ if (edev->physfn) - pe = eeh_pe_alloc(edev->phb, EEH_PE_VF); + pe = eeh_pe_alloc(pdn->phb, EEH_PE_VF); else - pe = eeh_pe_alloc(edev->phb, EEH_PE_DEVICE); + pe = eeh_pe_alloc(pdn->phb, EEH_PE_DEVICE); if (!pe) { pr_err("%s: out of memory!\n", __func__); return -ENOMEM; } pe->addr = edev->pe_config_addr; - pe->config_addr = edev->config_addr; + pe->config_addr = config_addr; /* * Put the new EEH PE into hierarchy tree. If the parent @@ -404,10 +415,10 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev) */ parent = eeh_pe_get_parent(edev); if (!parent) { - parent = eeh_phb_pe_get(edev->phb); + parent = eeh_phb_pe_get(pdn->phb); if (!parent) { pr_err("%s: No PHB PE is found (PHB Domain=%d)\n", - __func__, edev->phb->global_number); + __func__, pdn->phb->global_number); edev->pe = NULL; kfree(pe); return -EEXIST; @@ -424,10 +435,10 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev) edev->pe = pe; pr_debug("EEH: Add %04x:%02x:%02x.%01x to " "Device PE#%x, Parent PE#%x\n", - edev->phb->global_number, - edev->config_addr >> 8, - PCI_SLOT(edev->config_addr & 0xFF), - PCI_FUNC(edev->config_addr & 0xFF), + pdn->phb->global_number, + pdn->busno, + PCI_SLOT(pdn->devfn), + PCI_FUNC(pdn->devfn), pe->addr, pe->parent->addr); return 0; @@ -446,13 +457,14 @@ int eeh_rmv_from_parent_pe(struct eeh_dev *edev) { struct eeh_pe *pe, *parent, *child; int cnt; + struct pci_dn *pdn = eeh_dev_to_pdn(edev); if (!edev->pe) { pr_debug("%s: No PE found for device %04x:%02x:%02x.%01x\n", - __func__, edev->phb->global_number, - edev->config_addr >> 8, - PCI_SLOT(edev->config_addr & 0xFF), - PCI_FUNC(edev->config_addr & 0xFF)); + __func__, pdn->phb->global_number, + pdn->busno, + PCI_SLOT(pdn->devfn), + PCI_FUNC(pdn->devfn)); return -EEXIST; } @@ -514,16 +526,16 @@ int eeh_rmv_from_parent_pe(struct eeh_dev *edev) */ void eeh_pe_update_time_stamp(struct eeh_pe *pe) { - struct timeval tstamp; + time64_t tstamp; if (!pe) return; if (pe->freeze_count <= 0) { pe->freeze_count = 0; - do_gettimeofday(&pe->tstamp); + pe->tstamp = ktime_get_seconds(); } else { - do_gettimeofday(&tstamp); - if (tstamp.tv_sec - pe->tstamp.tv_sec > 3600) { + tstamp = ktime_get_seconds(); + if (tstamp - pe->tstamp > 3600) { pe->tstamp = tstamp; pe->freeze_count = 0; } @@ -712,10 +724,10 @@ static void eeh_bridge_check_link(struct eeh_dev *edev) return; pr_debug("%s: Check PCIe link for %04x:%02x:%02x.%01x ...\n", - __func__, edev->phb->global_number, - edev->config_addr >> 8, - PCI_SLOT(edev->config_addr & 0xFF), - PCI_FUNC(edev->config_addr & 0xFF)); + __func__, pdn->phb->global_number, + pdn->busno, + PCI_SLOT(pdn->devfn), + PCI_FUNC(pdn->devfn)); /* Check slot status */ cap = edev->pcie_cap; diff --git a/arch/powerpc/kernel/eeh_sysfs.c b/arch/powerpc/kernel/eeh_sysfs.c index 1ceecdda810b..797549289798 100644 --- a/arch/powerpc/kernel/eeh_sysfs.c +++ b/arch/powerpc/kernel/eeh_sysfs.c @@ -51,7 +51,6 @@ static ssize_t eeh_show_##_name(struct device *dev, \ static DEVICE_ATTR(_name, S_IRUGO, eeh_show_##_name, NULL); EEH_SHOW_ATTR(eeh_mode, mode, "0x%x"); -EEH_SHOW_ATTR(eeh_config_addr, config_addr, "0x%x"); EEH_SHOW_ATTR(eeh_pe_config_addr, pe_config_addr, "0x%x"); static ssize_t eeh_pe_state_show(struct device *dev, @@ -103,7 +102,6 @@ void eeh_sysfs_add_device(struct pci_dev *pdev) return; rc += device_create_file(&pdev->dev, &dev_attr_eeh_mode); - rc += device_create_file(&pdev->dev, &dev_attr_eeh_config_addr); rc += device_create_file(&pdev->dev, &dev_attr_eeh_pe_config_addr); rc += device_create_file(&pdev->dev, &dev_attr_eeh_pe_state); @@ -128,7 +126,6 @@ void eeh_sysfs_remove_device(struct pci_dev *pdev) } device_remove_file(&pdev->dev, &dev_attr_eeh_mode); - device_remove_file(&pdev->dev, &dev_attr_eeh_config_addr); device_remove_file(&pdev->dev, &dev_attr_eeh_pe_config_addr); device_remove_file(&pdev->dev, &dev_attr_eeh_pe_state); diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 8587059ad848..e780e1fbf6c2 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -43,6 +43,13 @@ #define LOAD_MSR_KERNEL(r, x) li r,(x) #endif +/* + * Align to 4k in order to ensure that all functions modyfing srr0/srr1 + * fit into one page in order to not encounter a TLB miss between the + * modification of srr0/srr1 and the associated rfi. + */ + .align 12 + #ifdef CONFIG_BOOKE .globl mcheck_transfer_to_handler mcheck_transfer_to_handler: @@ -586,6 +593,10 @@ ppc_swapcontext: handle_page_fault: stw r4,_DAR(r1) addi r3,r1,STACK_FRAME_OVERHEAD +#ifdef CONFIG_6xx + andis. r0,r5,DSISR_DABRMATCH@h + bne- handle_dabr_fault +#endif bl do_page_fault cmpwi r3,0 beq+ ret_from_except @@ -599,6 +610,17 @@ handle_page_fault: bl bad_page_fault b ret_from_except_full +#ifdef CONFIG_6xx + /* We have a data breakpoint exception - handle it */ +handle_dabr_fault: + SAVE_NVGPRS(r1) + lwz r0,_TRAP(r1) + clrrwi r0,r0,1 + stw r0,_TRAP(r1) + bl do_break + b ret_from_except_full +#endif + /* * This routine switches between two different tasks. The process * state of one is saved on its kernel stack. Then the state diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 49d8422767b4..3320bcac7192 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -223,17 +223,27 @@ system_call_exit: andi. r0,r9,(_TIF_SYSCALL_DOTRACE|_TIF_SINGLESTEP|_TIF_USER_WORK_MASK|_TIF_PERSYSCALL_MASK) bne- .Lsyscall_exit_work - /* If MSR_FP and MSR_VEC are set in user msr, then no need to restore */ - li r7,MSR_FP + andi. r0,r8,MSR_FP + beq 2f #ifdef CONFIG_ALTIVEC - oris r7,r7,MSR_VEC@h + andis. r0,r8,MSR_VEC@h + bne 3f #endif - and r0,r8,r7 - cmpd r0,r7 - bne .Lsyscall_restore_math -.Lsyscall_restore_math_cont: +2: addi r3,r1,STACK_FRAME_OVERHEAD +#ifdef CONFIG_PPC_BOOK3S + li r10,MSR_RI + mtmsrd r10,1 /* Restore RI */ +#endif + bl restore_math +#ifdef CONFIG_PPC_BOOK3S + li r11,0 + mtmsrd r11,1 +#endif + ld r8,_MSR(r1) + ld r3,RESULT(r1) + li r11,-MAX_ERRNO - cmpld r3,r11 +3: cmpld r3,r11 ld r5,_CCR(r1) bge- .Lsyscall_error .Lsyscall_error_cont: @@ -267,40 +277,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) std r5,_CCR(r1) b .Lsyscall_error_cont -.Lsyscall_restore_math: - /* - * Some initial tests from restore_math to avoid the heavyweight - * C code entry and MSR manipulations. - */ - LOAD_REG_IMMEDIATE(r0, MSR_TS_MASK) - and. r0,r0,r8 - bne 1f - - ld r7,PACACURRENT(r13) - lbz r0,THREAD+THREAD_LOAD_FP(r7) -#ifdef CONFIG_ALTIVEC - lbz r6,THREAD+THREAD_LOAD_VEC(r7) - add r0,r0,r6 -#endif - cmpdi r0,0 - beq .Lsyscall_restore_math_cont - -1: addi r3,r1,STACK_FRAME_OVERHEAD -#ifdef CONFIG_PPC_BOOK3S - li r10,MSR_RI - mtmsrd r10,1 /* Restore RI */ -#endif - bl restore_math -#ifdef CONFIG_PPC_BOOK3S - li r11,0 - mtmsrd r11,1 -#endif - /* Restore volatiles, reload MSR from updated one */ - ld r8,_MSR(r1) - ld r3,RESULT(r1) - li r11,-MAX_ERRNO - b .Lsyscall_restore_math_cont - /* Traced system call support */ .Lsyscall_dotrace: bl save_nvgprs @@ -563,7 +539,7 @@ _GLOBAL(_switch) std r6,PACACURRENT(r13) /* Set new 'current' */ ld r8,KSP(r4) /* new stack pointer */ -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 BEGIN_MMU_FTR_SECTION b 2f END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX) @@ -612,7 +588,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) slbmte r7,r0 isync 2: -#endif /* CONFIG_PPC_STD_MMU_64 */ +#endif /* CONFIG_PPC_BOOK3S_64 */ CURRENT_THREAD_INFO(r7, r8) /* base of new stack */ /* Note: this uses SWITCH_FRAME_SIZE rather than INT_FRAME_SIZE @@ -990,16 +966,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) #ifdef CONFIG_PPC_BOOK3E cmpwi cr0,r3,0x280 #else - BEGIN_FTR_SECTION - cmpwi cr0,r3,0xe80 - FTR_SECTION_ELSE - cmpwi cr0,r3,0xa00 - ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE) + cmpwi cr0,r3,0xa00 #endif /* CONFIG_PPC_BOOK3E */ bne 1f addi r3,r1,STACK_FRAME_OVERHEAD; bl doorbell_exception - b ret_from_except #endif /* CONFIG_PPC_DOORBELL */ 1: b ret_from_except /* What else to do here ? */ @@ -1133,7 +1104,7 @@ _ASM_NOKPROBE_SYMBOL(__enter_rtas) _ASM_NOKPROBE_SYMBOL(rtas_return_loc) .align 3 -1: .llong rtas_restore_regs +1: .8byte rtas_restore_regs rtas_restore_regs: /* relocation is on at this point */ diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 9029afd1fa2a..e441b469dc8f 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * This file contains the 64-bit "server" PowerPC variant * of the low level exception handling including exception @@ -113,6 +114,7 @@ EXC_VIRT_NONE(0x4000, 0x100) cmpwi cr3,r10,2 ; \ BRANCH_TO_C000(r10, system_reset_idle_common) ; \ 1: \ + KVMTEST_PR(n) ; \ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) #else #define IDLETEST NOTEST @@ -129,6 +131,7 @@ EXC_REAL_BEGIN(system_reset, 0x100, 0x100) EXC_REAL_END(system_reset, 0x100, 0x100) EXC_VIRT_NONE(0x4100, 0x100) +TRAMP_KVM(PACA_EXNMI, 0x100) #ifdef CONFIG_PPC_P7_NAP EXC_COMMON_BEGIN(system_reset_idle_common) @@ -232,7 +235,7 @@ BEGIN_FTR_SECTION addi r10,r10,1 /* increment paca->in_mce */ sth r10,PACA_IN_MCE(r13) /* Limit nested MCE to level 4 to avoid stack overflow */ - cmpwi r10,4 + cmpwi r10,MAX_MCE_DEPTH bgt 2f /* Check if we hit limit of 4 */ std r11,GPR1(r1) /* Save r1 on the stack. */ std r11,0(r1) /* make stack chain pointer */ @@ -541,7 +544,7 @@ EXC_COMMON_BEGIN(instruction_access_common) RECONCILE_IRQ_STATE(r10, r11) ld r12,_MSR(r1) ld r3,_NIP(r1) - andis. r4,r12,0x5820 + andis. r4,r12,DSISR_SRR1_MATCH_64S@h li r5,0x400 std r3,_DAR(r1) std r4,_DSISR(r1) @@ -605,7 +608,7 @@ EXC_COMMON_BEGIN(slb_miss_common) cmpdi cr5,r11,MSR_RI crset 4*cr0+eq -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 BEGIN_MMU_FTR_SECTION bl slb_allocate END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX) @@ -734,7 +737,29 @@ EXC_REAL(program_check, 0x700, 0x100) EXC_VIRT(program_check, 0x4700, 0x100, 0x700) TRAMP_KVM(PACA_EXGEN, 0x700) EXC_COMMON_BEGIN(program_check_common) - EXCEPTION_PROLOG_COMMON(0x700, PACA_EXGEN) + /* + * It's possible to receive a TM Bad Thing type program check with + * userspace register values (in particular r1), but with SRR1 reporting + * that we came from the kernel. Normally that would confuse the bad + * stack logic, and we would report a bad kernel stack pointer. Instead + * we switch to the emergency stack if we're taking a TM Bad Thing from + * the kernel. + */ + li r10,MSR_PR /* Build a mask of MSR_PR .. */ + oris r10,r10,0x200000@h /* .. and SRR1_PROGTM */ + and r10,r10,r12 /* Mask SRR1 with that. */ + srdi r10,r10,8 /* Shift it so we can compare */ + cmpldi r10,(0x200000 >> 8) /* .. with an immediate. */ + bne 1f /* If != go to normal path. */ + + /* SRR1 had PR=0 and SRR1_PROGTM=1, so use the emergency stack */ + andi. r10,r12,MSR_PR; /* Set CR0 correctly for label */ + /* 3 in EXCEPTION_PROLOG_COMMON */ + mr r10,r1 /* Save r1 */ + ld r1,PACAEMERGSP(r13) /* Use emergency stack */ + subi r1,r1,INT_FRAME_SIZE /* alloc stack frame */ + b 3f /* Jump into the macro !! */ +1: EXCEPTION_PROLOG_COMMON(0x700, PACA_EXGEN) bl save_nvgprs RECONCILE_IRQ_STATE(r10, r11) addi r3,r1,STACK_FRAME_OVERHEAD @@ -865,12 +890,6 @@ EXC_COMMON(trap_0b_common, 0xb00, unknown_exception) #define LOAD_SYSCALL_HANDLER(reg) \ __LOAD_HANDLER(reg, system_call_common) -#define SYSCALL_FASTENDIAN_TEST \ -BEGIN_FTR_SECTION \ - cmpdi r0,0x1ebe ; \ - beq- 1f ; \ -END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \ - /* * After SYSCALL_KVMTEST, we reach here with PACA in r13, r13 in r9, * and HMT_MEDIUM. @@ -885,6 +904,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \ rfid ; \ b . ; /* prevent speculative execution */ +#ifdef CONFIG_PPC_FAST_ENDIAN_SWITCH +#define SYSCALL_FASTENDIAN_TEST \ +BEGIN_FTR_SECTION \ + cmpdi r0,0x1ebe ; \ + beq- 1f ; \ +END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \ + #define SYSCALL_FASTENDIAN \ /* Fast LE/BE switch system call */ \ 1: mfspr r12,SPRN_SRR1 ; \ @@ -893,6 +919,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \ mr r13,r9 ; \ rfid ; /* return to userspace */ \ b . ; /* prevent speculative execution */ +#else +#define SYSCALL_FASTENDIAN_TEST +#define SYSCALL_FASTENDIAN +#endif /* CONFIG_PPC_FAST_ENDIAN_SWITCH */ #if defined(CONFIG_RELOCATABLE) /* @@ -1010,6 +1040,8 @@ TRAMP_REAL_BEGIN(hmi_exception_early) EXCEPTION_PROLOG_COMMON_3(0xe60) addi r3,r1,STACK_FRAME_OVERHEAD BRANCH_LINK_TO_FAR(hmi_exception_realmode) /* Function call ABI */ + cmpdi cr0,r3,0 + /* Windup the stack. */ /* Move original HSRR0 and HSRR1 into the respective regs */ ld r9,_MSR(r1) @@ -1026,10 +1058,15 @@ TRAMP_REAL_BEGIN(hmi_exception_early) REST_8GPRS(2, r1) REST_GPR(10, r1) ld r11,_CCR(r1) + REST_2GPRS(12, r1) + bne 1f mtcr r11 REST_GPR(11, r1) - REST_2GPRS(12, r1) - /* restore original r1. */ + ld r1,GPR1(r1) + hrfid + +1: mtcr r11 + REST_GPR(11, r1) ld r1,GPR1(r1) /* @@ -1042,8 +1079,9 @@ hmi_exception_after_realmode: EXCEPTION_PROLOG_0(PACA_EXGEN) b tramp_real_hmi_exception -EXC_COMMON_ASYNC(hmi_exception_common, 0xe60, handle_hmi_exception) - +EXC_COMMON_BEGIN(hmi_exception_common) +EXCEPTION_COMMON(PACA_EXGEN, 0xe60, hmi_exception_common, handle_hmi_exception, + ret_from_except, FINISH_NAP;ADD_NVGPRS;ADD_RECONCILE;RUNLATCH_ON) EXC_REAL_OOL_MASKABLE_HV(h_doorbell, 0xe80, 0x20) EXC_VIRT_OOL_MASKABLE_HV(h_doorbell, 0x4e80, 0x20, 0xe80) @@ -1314,7 +1352,7 @@ EXC_REAL_NONE(0x1800, 0x100) EXC_VIRT_NONE(0x5800, 0x100) #endif -#if defined(CONFIG_HARDLOCKUP_DETECTOR) && defined(CONFIG_HAVE_HARDLOCKUP_DETECTOR_ARCH) +#ifdef CONFIG_PPC_WATCHDOG #define MASKED_DEC_HANDLER_LABEL 3f @@ -1325,20 +1363,28 @@ EXC_VIRT_NONE(0x5800, 0x100) std r10,PACA_EXGEN+EX_R13(r13); \ EXCEPTION_PROLOG_PSERIES_1(soft_nmi_common, _H) +/* + * Branch to soft_nmi_interrupt using the emergency stack. The emergency + * stack is one that is usable by maskable interrupts so long as MSR_EE + * remains off. It is used for recovery when something has corrupted the + * normal kernel stack, for example. The "soft NMI" must not use the process + * stack because we want irq disabled sections to avoid touching the stack + * at all (other than PMU interrupts), so use the emergency stack for this, + * and run it entirely with interrupts hard disabled. + */ EXC_COMMON_BEGIN(soft_nmi_common) mr r10,r1 ld r1,PACAEMERGSP(r13) - ld r1,PACA_NMI_EMERG_SP(r13) subi r1,r1,INT_FRAME_SIZE EXCEPTION_COMMON_NORET_STACK(PACA_EXGEN, 0x900, system_reset, soft_nmi_interrupt, ADD_NVGPRS;ADD_RECONCILE) b ret_from_except -#else +#else /* CONFIG_PPC_WATCHDOG */ #define MASKED_DEC_HANDLER_LABEL 2f /* normal return */ #define MASKED_DEC_HANDLER(_H) -#endif +#endif /* CONFIG_PPC_WATCHDOG */ /* * An interrupt came in while soft-disabled. We set paca->irq_happened, then: @@ -1362,19 +1408,16 @@ masked_##_H##interrupt: \ ori r10,r10,0xffff; \ mtspr SPRN_DEC,r10; \ b MASKED_DEC_HANDLER_LABEL; \ -1: cmpwi r10,PACA_IRQ_DBELL; \ - beq 2f; \ - cmpwi r10,PACA_IRQ_HMI; \ - beq 2f; \ +1: andi. r10,r10,(PACA_IRQ_DBELL|PACA_IRQ_HMI); \ + bne 2f; \ mfspr r10,SPRN_##_H##SRR1; \ - rldicl r10,r10,48,1; /* clear MSR_EE */ \ - rotldi r10,r10,16; \ + xori r10,r10,MSR_EE; /* clear MSR_EE */ \ mtspr SPRN_##_H##SRR1,r10; \ 2: mtcrf 0x80,r9; \ ld r9,PACA_EXGEN+EX_R9(r13); \ ld r10,PACA_EXGEN+EX_R10(r13); \ ld r11,PACA_EXGEN+EX_R11(r13); \ - GET_SCRATCH0(r13); \ + /* returns to kernel where r13 must be set up, so don't restore it */ \ ##_H##rfid; \ b .; \ MASKED_DEC_HANDLER(_H) @@ -1477,8 +1520,10 @@ USE_TEXT_SECTION() */ .balign IFETCH_ALIGN_BYTES do_hash_page: -#ifdef CONFIG_PPC_STD_MMU_64 - andis. r0,r4,0xa450 /* weird error? */ +#ifdef CONFIG_PPC_BOOK3S_64 + lis r0,(DSISR_BAD_FAULT_64S|DSISR_DABRMATCH)@h + ori r0,r0,DSISR_BAD_FAULT_64S@l + and. r0,r4,r0 /* weird error? */ bne- handle_page_fault /* if not, try to insert a HPTE */ CURRENT_THREAD_INFO(r11, r1) lwz r0,TI_PREEMPT(r11) /* If we're in an "NMI" */ @@ -1506,7 +1551,7 @@ do_hash_page: /* Reload DSISR into r4 for the DABR check below */ ld r4,_DSISR(r1) -#endif /* CONFIG_PPC_STD_MMU_64 */ +#endif /* CONFIG_PPC_BOOK3S_64 */ /* Here we have a page fault that hash_page can't handle. */ handle_page_fault: @@ -1535,7 +1580,7 @@ handle_dabr_fault: 12: b ret_from_except_lite -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 /* We have a page fault that hash_page could handle but HV refused * the PTE insertion */ @@ -1661,25 +1706,27 @@ _GLOBAL(__replay_interrupt) * we don't give a damn about, so we don't bother storing them. */ mfmsr r12 - LOAD_REG_ADDR(r11, 1f) + LOAD_REG_ADDR(r11, replay_interrupt_return) mfcr r9 ori r12,r12,MSR_EE cmpwi r3,0x900 beq decrementer_common cmpwi r3,0x500 +BEGIN_FTR_SECTION + beq h_virt_irq_common +FTR_SECTION_ELSE beq hardware_interrupt_common +ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_300) BEGIN_FTR_SECTION - cmpwi r3,0xe80 + cmpwi r3,0xa00 beq h_doorbell_common_msgclr - cmpwi r3,0xea0 - beq h_virt_irq_common cmpwi r3,0xe60 beq hmi_exception_common FTR_SECTION_ELSE cmpwi r3,0xa00 beq doorbell_super_common_msgclr ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE) -1: +replay_interrupt_return: blr _ASM_NOKPROBE_SYMBOL(__replay_interrupt) diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index dc0c49cfd90a..04ea5c04fd24 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -125,6 +125,13 @@ int is_fadump_boot_memory_area(u64 addr, ulong size) return (addr + size) > RMA_START && addr <= fw_dump.boot_memory_size; } +int should_fadump_crash(void) +{ + if (!fw_dump.dump_registered || !fw_dump.fadumphdr_addr) + return 0; + return 1; +} + int is_fadump_active(void) { return fw_dump.dump_active; @@ -518,7 +525,7 @@ void crash_fadump(struct pt_regs *regs, const char *str) struct fadump_crash_info_header *fdh = NULL; int old_cpu, this_cpu; - if (!fw_dump.dump_registered || !fw_dump.fadumphdr_addr) + if (!should_fadump_crash()) return; /* @@ -1263,10 +1270,15 @@ static ssize_t fadump_release_memory_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { + int input = -1; + if (!fw_dump.dump_active) return -EPERM; - if (buf[0] == '1') { + if (kstrtoint(buf, 0, &input)) + return -EINVAL; + + if (input == 1) { /* * Take away the '/proc/vmcore'. We are releasing the dump * memory, hence it will not be valid anymore. @@ -1300,21 +1312,25 @@ static ssize_t fadump_register_store(struct kobject *kobj, const char *buf, size_t count) { int ret = 0; + int input = -1; if (!fw_dump.fadump_enabled || fdm_active) return -EPERM; + if (kstrtoint(buf, 0, &input)) + return -EINVAL; + mutex_lock(&fadump_mutex); - switch (buf[0]) { - case '0': + switch (input) { + case 0: if (fw_dump.dump_registered == 0) { goto unlock_out; } /* Un-register Firmware-assisted dump */ fadump_unregister_dump(&fdm); break; - case '1': + case 1: if (fw_dump.dump_registered == 1) { ret = -EEXIST; goto unlock_out; @@ -1446,6 +1462,25 @@ static void fadump_init_files(void) return; } +static int fadump_panic_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + /* + * If firmware-assisted dump has been registered then trigger + * firmware-assisted dump and let firmware handle everything + * else. If this returns, then fadump was not registered, so + * go through the rest of the panic path. + */ + crash_fadump(NULL, ptr); + + return NOTIFY_DONE; +} + +static struct notifier_block fadump_panic_block = { + .notifier_call = fadump_panic_event, + .priority = INT_MIN /* may not return; must be done last */ +}; + /* * Prepare for firmware-assisted dump. */ @@ -1478,6 +1513,9 @@ int __init setup_fadump(void) init_fadump_mem_struct(&fdm, fw_dump.reserve_dump_area_start); fadump_init_files(); + atomic_notifier_chain_register(&panic_notifier_list, + &fadump_panic_block); + return 1; } subsys_initcall(setup_fadump); diff --git a/arch/powerpc/kernel/fsl_booke_entry_mapping.S b/arch/powerpc/kernel/fsl_booke_entry_mapping.S index 83dd0f6776b3..ea065282b303 100644 --- a/arch/powerpc/kernel/fsl_booke_entry_mapping.S +++ b/arch/powerpc/kernel/fsl_booke_entry_mapping.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* 1. Find the index of the entry we're executing in */ bl invstr /* Find our address */ diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index e22734278458..29b2fed93289 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -388,7 +388,7 @@ DataAccess: EXCEPTION_PROLOG mfspr r10,SPRN_DSISR stw r10,_DSISR(r11) - andis. r0,r10,0xa470 /* weird error? */ + andis. r0,r10,(DSISR_BAD_FAULT_32S|DSISR_DABRMATCH)@h bne 1f /* if not, try to put a PTE */ mfspr r4,SPRN_DAR /* into the hash table */ rlwinm r3,r10,32-15,21,21 /* DSISR_STORE -> _PAGE_RW */ @@ -403,13 +403,13 @@ DataAccess: DO_KVM 0x400 InstructionAccess: EXCEPTION_PROLOG - andis. r0,r9,0x4000 /* no pte found? */ + andis. r0,r9,SRR1_ISI_NOPT@h /* no pte found? */ beq 1f /* if so, try to put a PTE */ li r3,0 /* into the hash table */ mr r4,r12 /* SRR0 is fault address */ bl hash_page 1: mr r4,r12 - mr r5,r9 + andis. r5,r9,DSISR_SRR1_MATCH_32S@h /* Filter relevant SRR1 bits */ EXC_XFER_LITE(0x400, handle_page_fault) /* External interrupt */ diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S index 0ddc602b33a4..aa71a90f5222 100644 --- a/arch/powerpc/kernel/head_64.S +++ b/arch/powerpc/kernel/head_64.S @@ -55,12 +55,18 @@ * * For pSeries or server processors: * 1. The MMU is off & open firmware is running in real mode. - * 2. The kernel is entered at __start + * 2. The primary CPU enters at __start. + * 3. If the RTAS supports "query-cpu-stopped-state", then secondary + * CPUs will enter as directed by "start-cpu" RTAS call, which is + * generic_secondary_smp_init, with PIR in r3. + * 4. Else the secondary CPUs will enter at secondary_hold (0x60) as + * directed by the "start-cpu" RTS call, with PIR in r3. * -or- For OPAL entry: - * 1. The MMU is off, processor in HV mode, primary CPU enters at 0 - * with device-tree in gpr3. We also get OPAL base in r8 and - * entry in r9 for debugging purposes - * 2. Secondary processors enter at 0x60 with PIR in gpr3 + * 1. The MMU is off, processor in HV mode. + * 2. The primary CPU enters at 0 with device-tree in r3, OPAL base + * in r8, and entry in r9 for debugging purposes. + * 3. Secondary CPUs enter as directed by OPAL_START_CPU call, which + * is at generic_secondary_smp_init, with PIR in r3. * * For Book3E processors: * 1. The MMU is on running in AS0 in a state defined in ePAPR @@ -92,13 +98,13 @@ END_FTR_SECTION(0, 1) .balign 8 .globl __secondary_hold_spinloop __secondary_hold_spinloop: - .llong 0x0 + .8byte 0x0 /* Secondary processors write this value with their cpu # */ /* after they enter the spin loop immediately below. */ .globl __secondary_hold_acknowledge __secondary_hold_acknowledge: - .llong 0x0 + .8byte 0x0 #ifdef CONFIG_RELOCATABLE /* This flag is set to 1 by a loader if the kernel should run @@ -650,7 +656,7 @@ __after_prom_start: bctr .balign 8 -p_end: .llong _end - copy_to_here +p_end: .8byte _end - copy_to_here 4: /* @@ -892,7 +898,7 @@ _GLOBAL(relative_toc) blr .balign 8 -p_toc: .llong __toc_start + 0x8000 - 0b +p_toc: .8byte __toc_start + 0x8000 - 0b /* * This is where the main kernel code starts. diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index c032fe8c2d26..4fee00d414e8 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -50,18 +50,20 @@ mtspr spr, reg #endif -/* Macro to test if an address is a kernel address */ #if CONFIG_TASK_SIZE <= 0x80000000 && CONFIG_PAGE_OFFSET >= 0x80000000 -#define IS_KERNEL(tmp, addr) \ - andis. tmp, addr, 0x8000 /* Address >= 0x80000000 */ -#define BRANCH_UNLESS_KERNEL(label) beq label -#else -#define IS_KERNEL(tmp, addr) \ - rlwinm tmp, addr, 16, 16, 31; \ - cmpli cr0, tmp, PAGE_OFFSET >> 16 -#define BRANCH_UNLESS_KERNEL(label) blt label +/* By simply checking Address >= 0x80000000, we know if its a kernel address */ +#define SIMPLE_KERNEL_ADDRESS 1 #endif +/* + * We need an ITLB miss handler for kernel addresses if: + * - Either we have modules + * - Or we have not pinned the first 8M + */ +#if defined(CONFIG_MODULES) || !defined(CONFIG_PIN_TLB_TEXT) || \ + defined(CONFIG_DEBUG_PAGEALLOC) +#define ITLB_MISS_KERNEL 1 +#endif /* * Value for the bits that have fixed value in RPN entries. @@ -123,7 +125,6 @@ turn_on_mmu: lis r0,start_here@h ori r0,r0,start_here@l mtspr SPRN_SRR0,r0 - SYNC rfi /* enables MMU */ /* @@ -170,7 +171,7 @@ turn_on_mmu: stw r1,0(r11); \ tovirt(r1,r11); /* set new kernel sp */ \ li r10,MSR_KERNEL & ~(MSR_IR|MSR_DR); /* can take exceptions */ \ - MTMSRD(r10); /* (except for mach check in rtas) */ \ + mtmsr r10; \ stw r0,GPR0(r11); \ SAVE_4GPRS(3, r11); \ SAVE_2GPRS(7, r11) @@ -300,7 +301,7 @@ SystemCall: /* On the MPC8xx, this is a software emulation interrupt. It occurs * for all unimplemented and illegal instructions. */ - EXCEPTION(0x1000, SoftEmu, SoftwareEmulation, EXC_XFER_STD) + EXCEPTION(0x1000, SoftEmu, program_check_exception, EXC_XFER_STD) . = 0x1100 /* @@ -325,7 +326,7 @@ SystemCall: #endif InstructionTLBMiss: -#if defined(CONFIG_8xx_CPU6) || defined(CONFIG_MODULES) || defined (CONFIG_DEBUG_PAGEALLOC) || defined (CONFIG_HUGETLB_PAGE) +#if defined(CONFIG_8xx_CPU6) || defined(ITLB_MISS_KERNEL) || defined(CONFIG_HUGETLB_PAGE) mtspr SPRN_SPRG_SCRATCH2, r3 #endif EXCEPTION_PROLOG_0 @@ -343,15 +344,32 @@ InstructionTLBMiss: INVALIDATE_ADJACENT_PAGES_CPU15(r11, r10) /* Only modules will cause ITLB Misses as we always * pin the first 8MB of kernel memory */ -#if defined(CONFIG_MODULES) || defined (CONFIG_DEBUG_PAGEALLOC) || defined (CONFIG_HUGETLB_PAGE) +#if defined(ITLB_MISS_KERNEL) || defined(CONFIG_HUGETLB_PAGE) mfcr r3 #endif -#if defined(CONFIG_MODULES) || defined (CONFIG_DEBUG_PAGEALLOC) - IS_KERNEL(r11, r10) +#ifdef ITLB_MISS_KERNEL +#if defined(SIMPLE_KERNEL_ADDRESS) && defined(CONFIG_PIN_TLB_TEXT) + andis. r11, r10, 0x8000 /* Address >= 0x80000000 */ +#else + rlwinm r11, r10, 16, 0xfff8 + cmpli cr0, r11, PAGE_OFFSET@h +#ifndef CONFIG_PIN_TLB_TEXT + /* It is assumed that kernel code fits into the first 8M page */ +_ENTRY(ITLBMiss_cmp) + cmpli cr7, r11, (PAGE_OFFSET + 0x0800000)@h +#endif +#endif #endif mfspr r11, SPRN_M_TW /* Get level 1 table */ -#if defined(CONFIG_MODULES) || defined (CONFIG_DEBUG_PAGEALLOC) - BRANCH_UNLESS_KERNEL(3f) +#ifdef ITLB_MISS_KERNEL +#if defined(SIMPLE_KERNEL_ADDRESS) && defined(CONFIG_PIN_TLB_TEXT) + beq+ 3f +#else + blt+ 3f +#endif +#ifndef CONFIG_PIN_TLB_TEXT + blt cr7, ITLBMissLinear +#endif lis r11, (swapper_pg_dir-PAGE_OFFSET)@ha 3: #endif @@ -369,7 +387,7 @@ InstructionTLBMiss: rlwimi r10, r11, 0, 0, 32 - PAGE_SHIFT - 1 /* Add level 2 base */ lwz r10, 0(r10) /* Get the pte */ 4: -#if defined(CONFIG_MODULES) || defined (CONFIG_DEBUG_PAGEALLOC) || defined (CONFIG_HUGETLB_PAGE) +#if defined(ITLB_MISS_KERNEL) || defined(CONFIG_HUGETLB_PAGE) mtcr r3 #endif /* Insert the APG into the TWC from the Linux PTE. */ @@ -400,7 +418,7 @@ InstructionTLBMiss: MTSPR_CPU6(SPRN_MI_RPN, r10, r3) /* Update TLB entry */ /* Restore registers */ -#if defined(CONFIG_8xx_CPU6) || defined(CONFIG_MODULES) || defined (CONFIG_DEBUG_PAGEALLOC) || defined (CONFIG_HUGETLB_PAGE) +#if defined(CONFIG_8xx_CPU6) || defined(ITLB_MISS_KERNEL) || defined(CONFIG_HUGETLB_PAGE) mfspr r3, SPRN_SPRG_SCRATCH2 #endif EXCEPTION_EPILOG_0 @@ -447,23 +465,23 @@ DataStoreTLBMiss: * kernel page tables. */ mfspr r10, SPRN_MD_EPN - rlwinm r10, r10, 16, 0xfff8 - cmpli cr0, r10, PAGE_OFFSET@h + rlwinm r11, r10, 16, 0xfff8 + cmpli cr0, r11, PAGE_OFFSET@h mfspr r11, SPRN_M_TW /* Get level 1 table */ blt+ 3f + rlwinm r11, r10, 16, 0xfff8 #ifndef CONFIG_PIN_TLB_IMMR - cmpli cr0, r10, VIRT_IMMR_BASE@h + cmpli cr0, r11, VIRT_IMMR_BASE@h #endif _ENTRY(DTLBMiss_cmp) - cmpli cr7, r10, (PAGE_OFFSET + 0x1800000)@h - lis r11, (swapper_pg_dir-PAGE_OFFSET)@ha + cmpli cr7, r11, (PAGE_OFFSET + 0x1800000)@h #ifndef CONFIG_PIN_TLB_IMMR _ENTRY(DTLBMiss_jmp) beq- DTLBMissIMMR #endif blt cr7, DTLBMissLinear + lis r11, (swapper_pg_dir-PAGE_OFFSET)@ha 3: - mfspr r10, SPRN_MD_EPN /* Insert level 1 index */ rlwimi r11, r10, 32 - ((PAGE_SHIFT - 2) << 1), (PAGE_SHIFT - 2) << 1, 29 @@ -569,8 +587,8 @@ _ENTRY(DTLBMiss_jmp) InstructionTLBError: EXCEPTION_PROLOG mr r4,r12 - mr r5,r9 - andis. r10,r5,0x4000 + andis. r5,r9,DSISR_SRR1_MATCH_32S@h /* Filter relevant SRR1 bits */ + andis. r10,r9,SRR1_ISI_NOPT@h beq+ 1f tlbie r4 itlbie: @@ -595,7 +613,7 @@ DARFixed:/* Return from dcbx instruction bug workaround */ mfspr r5,SPRN_DSISR stw r5,_DSISR(r11) mfspr r4,SPRN_DAR - andis. r10,r5,0x4000 + andis. r10,r5,DSISR_NOHPTE@h beq+ 1f tlbie r4 dtlbie: @@ -684,7 +702,7 @@ DTLBMissLinear: /* Set 8M byte page and mark it valid */ li r11, MD_PS8MEG | MD_SVALID MTSPR_CPU6(SPRN_MD_TWC, r11, r3) - rlwinm r10, r10, 16, 0x0f800000 /* 8xx supports max 256Mb RAM */ + rlwinm r10, r10, 0, 0x0f800000 /* 8xx supports max 256Mb RAM */ ori r10, r10, 0xf0 | MD_SPS16K | _PAGE_SHARED | _PAGE_DIRTY | \ _PAGE_PRESENT MTSPR_CPU6(SPRN_MD_RPN, r10, r11) /* Update TLB entry */ @@ -695,6 +713,22 @@ DTLBMissLinear: EXCEPTION_EPILOG_0 rfi +#ifndef CONFIG_PIN_TLB_TEXT +ITLBMissLinear: + mtcr r3 + /* Set 8M byte page and mark it valid */ + li r11, MI_PS8MEG | MI_SVALID | _PAGE_EXEC + MTSPR_CPU6(SPRN_MI_TWC, r11, r3) + rlwinm r10, r10, 0, 0x0f800000 /* 8xx supports max 256Mb RAM */ + ori r10, r10, 0xf0 | MI_SPS16K | _PAGE_SHARED | _PAGE_DIRTY | \ + _PAGE_PRESENT + MTSPR_CPU6(SPRN_MI_RPN, r10, r11) /* Update TLB entry */ + + mfspr r3, SPRN_SPRG_SCRATCH2 + EXCEPTION_EPILOG_0 + rfi +#endif + /* This is the procedure to calculate the data EA for buggy dcbx,dcbi instructions * by decoding the registers used by the dcbx instruction and adding them. * DAR is set to the calculated address. @@ -705,9 +739,10 @@ FixupDAR:/* Entry point for dcbx workaround. */ mtspr SPRN_SPRG_SCRATCH2, r10 /* fetch instruction from memory. */ mfspr r10, SPRN_SRR0 - IS_KERNEL(r11, r10) + rlwinm r11, r10, 16, 0xfff8 + cmpli cr0, r11, PAGE_OFFSET@h mfspr r11, SPRN_M_TW /* Get level 1 table */ - BRANCH_UNLESS_KERNEL(3f) + blt+ 3f rlwinm r11, r10, 16, 0xfff8 _ENTRY(FixupDAR_cmp) cmpli cr7, r11, (PAGE_OFFSET + 0x1800000)@h @@ -915,10 +950,8 @@ start_here: rfi /* Load up the kernel context */ 2: - SYNC /* Force all PTE updates to finish */ tlbia /* Clear all TLB entries */ sync /* wait for tlbia/tlbie to finish */ - TLBSYNC /* ... on all CPUs */ /* set up the PTE pointers for the Abatron bdiGDB. */ @@ -955,15 +988,14 @@ initial_mmu: mtspr SPRN_MD_CTR, r10 /* remove PINNED DTLB entries */ tlbia /* Invalidate all TLB entries */ -/* Always pin the first 8 MB ITLB to prevent ITLB - misses while mucking around with SRR0/SRR1 in asm -*/ +#ifdef CONFIG_PIN_TLB_TEXT lis r8, MI_RSV4I@h ori r8, r8, 0x1c00 mtspr SPRN_MI_CTR, r8 /* Set instruction MMU control */ +#endif -#ifdef CONFIG_PIN_TLB +#ifdef CONFIG_PIN_TLB_DATA oris r10, r10, MD_RSV4I@h mtspr SPRN_MD_CTR, r10 /* Set data TLB control */ #endif @@ -989,6 +1021,7 @@ initial_mmu: * internal registers (among other things). */ #ifdef CONFIG_PIN_TLB_IMMR + oris r10, r10, MD_RSV4I@h ori r10, r10, 0x1c00 mtspr SPRN_MD_CTR, r10 diff --git a/arch/powerpc/kernel/head_booke.h b/arch/powerpc/kernel/head_booke.h index a620203f7de3..d0862a100d29 100644 --- a/arch/powerpc/kernel/head_booke.h +++ b/arch/powerpc/kernel/head_booke.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __HEAD_BOOKE_H__ #define __HEAD_BOOKE_H__ diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S index 516ebef905c0..01e1c1997893 100644 --- a/arch/powerpc/kernel/idle_book3s.S +++ b/arch/powerpc/kernel/idle_book3s.S @@ -85,7 +85,66 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300) std r3,_WORT(r1) mfspr r3,SPRN_WORC std r3,_WORC(r1) +/* + * On POWER9, there are idle states such as stop4, invoked via cpuidle, + * that lose hypervisor resources. In such cases, we need to save + * additional SPRs before entering those idle states so that they can + * be restored to their older values on wakeup from the idle state. + * + * On POWER8, the only such deep idle state is winkle which is used + * only in the context of CPU-Hotplug, where these additional SPRs are + * reinitiazed to a sane value. Hence there is no need to save/restore + * these SPRs. + */ +BEGIN_FTR_SECTION + blr +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) +power9_save_additional_sprs: + mfspr r3, SPRN_PID + mfspr r4, SPRN_LDBAR + std r3, STOP_PID(r13) + std r4, STOP_LDBAR(r13) + + mfspr r3, SPRN_FSCR + mfspr r4, SPRN_HFSCR + std r3, STOP_FSCR(r13) + std r4, STOP_HFSCR(r13) + + mfspr r3, SPRN_MMCRA + mfspr r4, SPRN_MMCR0 + std r3, STOP_MMCRA(r13) + std r4, _MMCR0(r1) + + mfspr r3, SPRN_MMCR1 + mfspr r4, SPRN_MMCR2 + std r3, STOP_MMCR1(r13) + std r4, STOP_MMCR2(r13) + blr + +power9_restore_additional_sprs: + ld r3,_LPCR(r1) + ld r4, STOP_PID(r13) + mtspr SPRN_LPCR,r3 + mtspr SPRN_PID, r4 + + ld r3, STOP_LDBAR(r13) + ld r4, STOP_FSCR(r13) + mtspr SPRN_LDBAR, r3 + mtspr SPRN_FSCR, r4 + + ld r3, STOP_HFSCR(r13) + ld r4, STOP_MMCRA(r13) + mtspr SPRN_HFSCR, r3 + mtspr SPRN_MMCRA, r4 + + ld r3, _MMCR0(r1) + ld r4, STOP_MMCR1(r13) + mtspr SPRN_MMCR0, r3 + mtspr SPRN_MMCR1, r4 + + ld r3, STOP_MMCR2(r13) + mtspr SPRN_MMCR2, r3 blr /* @@ -141,7 +200,16 @@ pnv_powersave_common: std r5,_CCR(r1) std r1,PACAR1(r13) +BEGIN_FTR_SECTION /* + * POWER9 does not require real mode to stop, and presently does not + * set hwthread_state for KVM (threads don't share MMU context), so + * we can remain in virtual mode for this. + */ + bctr +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) + /* + * POWER8 * Go to real mode to do the nap, as required by the architecture. * Also, we need to be in real mode before setting hwthread_state, * because as soon as we do that, another thread can switch @@ -151,6 +219,20 @@ pnv_powersave_common: mtmsrd r7,0 bctr +/* + * This is the sequence required to execute idle instructions, as + * specified in ISA v2.07 (and earlier). MSR[IR] and MSR[DR] must be 0. + */ +#define IDLE_STATE_ENTER_SEQ_NORET(IDLE_INST) \ + /* Magic NAP/SLEEP/WINKLE mode enter sequence */ \ + std r0,0(r1); \ + ptesync; \ + ld r0,0(r1); \ +236: cmpd cr0,r0,r0; \ + bne 236b; \ + IDLE_INST; + + .globl pnv_enter_arch207_idle_mode pnv_enter_arch207_idle_mode: #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE @@ -255,7 +337,7 @@ power_enter_stop: andis. r4,r3,PSSCR_EC_ESL_MASK_SHIFTED clrldi r3,r3,60 /* r3 = Bits[60:63] = Requested Level (RL) */ bne .Lhandle_esl_ec_set - IDLE_STATE_ENTER_SEQ(PPC_STOP) + PPC_STOP li r3,0 /* Since we didn't lose state, return 0 */ /* @@ -273,13 +355,15 @@ power_enter_stop: b pnv_wakeup_noloss .Lhandle_esl_ec_set: +BEGIN_FTR_SECTION /* - * POWER9 DD2 can incorrectly set PMAO when waking up after a - * state-loss idle. Saving and restoring MMCR0 over idle is a + * POWER9 DD2.0 or earlier can incorrectly set PMAO when waking up after + * a state-loss idle. Saving and restoring MMCR0 over idle is a * workaround. */ mfspr r4,SPRN_MMCR0 std r4,_MMCR0(r1) +END_FTR_SECTION_IFCLR(CPU_FTR_POWER9_DD2_1) /* * Check if the requested state is a deep idle state. @@ -288,7 +372,8 @@ power_enter_stop: ld r4,ADDROFF(pnv_first_deep_stop_state)(r5) cmpd r3,r4 bge .Lhandle_deep_stop - IDLE_STATE_ENTER_SEQ_NORET(PPC_STOP) + PPC_STOP /* Does not return (system reset interrupt) */ + .Lhandle_deep_stop: /* * Entering deep idle state. @@ -310,7 +395,7 @@ lwarx_loop_stop: bl save_sprs_to_stack - IDLE_STATE_ENTER_SEQ_NORET(PPC_STOP) + PPC_STOP /* Does not return (system reset interrupt) */ /* * Entered with MSR[EE]=0 and no soft-masked interrupts pending. @@ -460,13 +545,21 @@ pnv_restore_hyp_resource_arch300: /* * Workaround for POWER9, if we lost resources, the ERAT * might have been mixed up and needs flushing. We also need - * to reload MMCR0 (see comment above). + * to reload MMCR0 (see comment above). We also need to set + * then clear bit 60 in MMCRA to ensure the PMU starts running. */ blt cr3,1f +BEGIN_FTR_SECTION PPC_INVALIDATE_ERAT ld r1,PACAR1(r13) ld r4,_MMCR0(r1) mtspr SPRN_MMCR0,r4 +END_FTR_SECTION_IFCLR(CPU_FTR_POWER9_DD2_1) + mfspr r4,SPRN_MMCRA + ori r4,r4,(1 << (63-60)) + mtspr SPRN_MMCRA,r4 + xori r4,r4,(1 << (63-60)) + mtspr SPRN_MMCRA,r4 1: /* * POWER ISA 3. Use PSSCR to determine if we @@ -803,9 +896,16 @@ no_segments: mtctr r12 bctrl +/* + * On POWER9, we can come here on wakeup from a cpuidle stop state. + * Hence restore the additional SPRs to the saved value. + * + * On POWER8, we come here only on winkle. Since winkle is used + * only in the case of CPU-Hotplug, we don't need to restore + * the additional SPRs. + */ BEGIN_FTR_SECTION - ld r4,_LPCR(r1) - mtspr SPRN_LPCR,r4 + bl power9_restore_additional_sprs END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) hypervisor_state_restored: diff --git a/arch/powerpc/kernel/io-workarounds.c b/arch/powerpc/kernel/io-workarounds.c index a582e0d42525..aa9f1b8261db 100644 --- a/arch/powerpc/kernel/io-workarounds.c +++ b/arch/powerpc/kernel/io-workarounds.c @@ -19,6 +19,8 @@ #include <asm/pgtable.h> #include <asm/ppc-pci.h> #include <asm/io-workarounds.h> +#include <asm/pte-walk.h> + #define IOWA_MAX_BUS 8 @@ -75,8 +77,7 @@ struct iowa_bus *iowa_mem_find_bus(const PCI_IO_ADDR addr) * We won't find huge pages here (iomem). Also can't hit * a page table free due to init_mm */ - ptep = __find_linux_pte_or_hugepte(init_mm.pgd, vaddr, - NULL, &hugepage_shift); + ptep = find_init_mm_pte(vaddr, &hugepage_shift); if (ptep == NULL) paddr = 0; else { @@ -192,7 +193,7 @@ void iowa_register_bus(struct pci_controller *phb, struct ppc_pci_io *ops, if (iowa_bus_count >= IOWA_MAX_BUS) { pr_err("IOWA:Too many pci bridges, " - "workarounds disabled for %s\n", np->full_name); + "workarounds disabled for %pOF\n", np); return; } @@ -207,6 +208,6 @@ void iowa_register_bus(struct pci_controller *phb, struct ppc_pci_io *ops, iowa_bus_count++; - pr_debug("IOWA:[%d]Add bus, %s.\n", iowa_bus_count-1, np->full_name); + pr_debug("IOWA:[%d]Add bus, %pOF.\n", iowa_bus_count-1, np); } diff --git a/arch/powerpc/kernel/iomap.c b/arch/powerpc/kernel/iomap.c index a1854d1ded8b..aab456ed2a00 100644 --- a/arch/powerpc/kernel/iomap.c +++ b/arch/powerpc/kernel/iomap.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * ppc64 "iomap" interface implementation. * diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 233ca3fe4754..af7a20dc6e09 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -127,8 +127,7 @@ static ssize_t fail_iommu_store(struct device *dev, return count; } -static DEVICE_ATTR(fail_iommu, S_IRUGO|S_IWUSR, fail_iommu_show, - fail_iommu_store); +static DEVICE_ATTR_RW(fail_iommu); static int fail_iommu_bus_notify(struct notifier_block *nb, unsigned long action, void *data) @@ -190,7 +189,7 @@ static unsigned long iommu_range_alloc(struct device *dev, unsigned int pool_nr; struct iommu_pool *pool; - align_mask = 0xffffffffffffffffl >> (64 - align_order); + align_mask = (1ull << align_order) - 1; /* This allocator was derived from x86_64's bit string search */ @@ -208,7 +207,7 @@ static unsigned long iommu_range_alloc(struct device *dev, * We don't need to disable preemption here because any CPU can * safely use any IOMMU pool. */ - pool_nr = __this_cpu_read(iommu_pool_hash) & (tbl->nr_pools - 1); + pool_nr = raw_cpu_read(iommu_pool_hash) & (tbl->nr_pools - 1); if (largealloc) pool = &(tbl->large_pool); diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 0bcec745a672..b7a84522e652 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -24,7 +24,7 @@ * mask register (of which only 16 are defined), hence the weird shifting * and complement of the cached_irq_mask. I want to be able to stuff * this right into the SIU SMASK register. - * Many of the prep/chrp functions are conditional compiled on CONFIG_8xx + * Many of the prep/chrp functions are conditional compiled on CONFIG_PPC_8xx * to reduce code space and undefined function references. */ @@ -143,8 +143,29 @@ notrace unsigned int __check_irq_replay(void) */ unsigned char happened = local_paca->irq_happened; - /* Clear bit 0 which we wouldn't clear otherwise */ - local_paca->irq_happened &= ~PACA_IRQ_HARD_DIS; + /* + * We are responding to the next interrupt, so interrupt-off + * latencies should be reset here. + */ + trace_hardirqs_on(); + trace_hardirqs_off(); + + if (happened & PACA_IRQ_HARD_DIS) { + /* Clear bit 0 which we wouldn't clear otherwise */ + local_paca->irq_happened &= ~PACA_IRQ_HARD_DIS; + + /* + * We may have missed a decrementer interrupt if hard disabled. + * Check the decrementer register in case we had a rollover + * while hard disabled. + */ + if (!(happened & PACA_IRQ_DEC)) { + if (decrementer_check_overflow()) { + local_paca->irq_happened |= PACA_IRQ_DEC; + happened |= PACA_IRQ_DEC; + } + } + } /* * Force the delivery of pending soft-disabled interrupts on PS3. @@ -160,41 +181,39 @@ notrace unsigned int __check_irq_replay(void) * This is a higher priority interrupt than the others, so * replay it first. */ - local_paca->irq_happened &= ~PACA_IRQ_HMI; - if (happened & PACA_IRQ_HMI) + if (happened & PACA_IRQ_HMI) { + local_paca->irq_happened &= ~PACA_IRQ_HMI; return 0xe60; + } - /* - * We may have missed a decrementer interrupt. We check the - * decrementer itself rather than the paca irq_happened field - * in case we also had a rollover while hard disabled - */ - local_paca->irq_happened &= ~PACA_IRQ_DEC; - if ((happened & PACA_IRQ_DEC) || decrementer_check_overflow()) + if (happened & PACA_IRQ_DEC) { + local_paca->irq_happened &= ~PACA_IRQ_DEC; return 0x900; + } - /* Finally check if an external interrupt happened */ - local_paca->irq_happened &= ~PACA_IRQ_EE; - if (happened & PACA_IRQ_EE) + if (happened & PACA_IRQ_EE) { + local_paca->irq_happened &= ~PACA_IRQ_EE; return 0x500; + } #ifdef CONFIG_PPC_BOOK3E - /* Finally check if an EPR external interrupt happened - * this bit is typically set if we need to handle another - * "edge" interrupt from within the MPIC "EPR" handler + /* + * Check if an EPR external interrupt happened this bit is typically + * set if we need to handle another "edge" interrupt from within the + * MPIC "EPR" handler. */ - local_paca->irq_happened &= ~PACA_IRQ_EE_EDGE; - if (happened & PACA_IRQ_EE_EDGE) + if (happened & PACA_IRQ_EE_EDGE) { + local_paca->irq_happened &= ~PACA_IRQ_EE_EDGE; return 0x500; + } - local_paca->irq_happened &= ~PACA_IRQ_DBELL; - if (happened & PACA_IRQ_DBELL) + if (happened & PACA_IRQ_DBELL) { + local_paca->irq_happened &= ~PACA_IRQ_DBELL; return 0x280; + } #else - local_paca->irq_happened &= ~PACA_IRQ_DBELL; if (happened & PACA_IRQ_DBELL) { - if (cpu_has_feature(CPU_FTR_HVMODE)) - return 0xe80; + local_paca->irq_happened &= ~PACA_IRQ_DBELL; return 0xa00; } #endif /* CONFIG_PPC_BOOK3E */ @@ -258,6 +277,7 @@ notrace void arch_local_irq_restore(unsigned long en) #endif /* CONFIG_TRACE_IRQFLAGS */ set_soft_enabled(0); + trace_hardirqs_off(); /* * Check if anything needs to be re-emitted. We haven't @@ -267,6 +287,7 @@ notrace void arch_local_irq_restore(unsigned long en) replay = __check_irq_replay(); /* We can soft-enable now */ + trace_hardirqs_on(); set_soft_enabled(1); /* @@ -382,11 +403,19 @@ bool prep_irq_for_idle_irqsoff(void) /* * Take the SRR1 wakeup reason, index into this table to find the * appropriate irq_happened bit. + * + * Sytem reset exceptions taken in idle state also come through here, + * but they are NMI interrupts so do not need to wait for IRQs to be + * restored, and should be taken as early as practical. These are marked + * with 0xff in the table. The Power ISA specifies 0100b as the system + * reset interrupt reason. */ +#define IRQ_SYSTEM_RESET 0xff + static const u8 srr1_to_lazyirq[0x10] = { 0, 0, 0, PACA_IRQ_DBELL, - 0, + IRQ_SYSTEM_RESET, PACA_IRQ_DBELL, PACA_IRQ_DEC, 0, @@ -395,15 +424,43 @@ static const u8 srr1_to_lazyirq[0x10] = { PACA_IRQ_HMI, 0, 0, 0, 0, 0 }; +void replay_system_reset(void) +{ + struct pt_regs regs; + + ppc_save_regs(®s); + regs.trap = 0x100; + get_paca()->in_nmi = 1; + system_reset_exception(®s); + get_paca()->in_nmi = 0; +} +EXPORT_SYMBOL_GPL(replay_system_reset); + void irq_set_pending_from_srr1(unsigned long srr1) { unsigned int idx = (srr1 & SRR1_WAKEMASK_P8) >> 18; + u8 reason = srr1_to_lazyirq[idx]; + + /* + * Take the system reset now, which is immediately after registers + * are restored from idle. It's an NMI, so interrupts need not be + * re-enabled before it is taken. + */ + if (unlikely(reason == IRQ_SYSTEM_RESET)) { + replay_system_reset(); + return; + } /* * The 0 index (SRR1[42:45]=b0000) must always evaluate to 0, - * so this can be called unconditionally with srr1 wake reason. + * so this can be called unconditionally with the SRR1 wake + * reason as returned by the idle code, which uses 0 to mean no + * interrupt. + * + * If a future CPU was to designate this as an interrupt reason, + * then a new index for no interrupt must be assigned. */ - local_paca->irq_happened |= srr1_to_lazyirq[idx]; + local_paca->irq_happened |= reason; } #endif /* CONFIG_PPC_BOOK3S */ @@ -470,6 +527,18 @@ int arch_show_interrupts(struct seq_file *p, int prec) seq_printf(p, " Hypervisor Maintenance Interrupts\n"); } + seq_printf(p, "%*s: ", prec, "NMI"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", per_cpu(irq_stat, j).sreset_irqs); + seq_printf(p, " System Reset interrupts\n"); + +#ifdef CONFIG_PPC_WATCHDOG + seq_printf(p, "%*s: ", prec, "WDG"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", per_cpu(irq_stat, j).soft_nmi_irqs); + seq_printf(p, " Watchdog soft-NMI interrupts\n"); +#endif + #ifdef CONFIG_PPC_DOORBELL if (cpu_has_feature(CPU_FTR_DBELL)) { seq_printf(p, "%*s: ", prec, "DBL"); @@ -494,6 +563,10 @@ u64 arch_irq_stat_cpu(unsigned int cpu) sum += per_cpu(irq_stat, cpu).spurious_irqs; sum += per_cpu(irq_stat, cpu).timer_irqs_others; sum += per_cpu(irq_stat, cpu).hmi_exceptions; + sum += per_cpu(irq_stat, cpu).sreset_irqs; +#ifdef CONFIG_PPC_WATCHDOG + sum += per_cpu(irq_stat, cpu).soft_nmi_irqs; +#endif #ifdef CONFIG_PPC_DOORBELL sum += per_cpu(irq_stat, cpu).doorbell_irqs; #endif diff --git a/arch/powerpc/kernel/isa-bridge.c b/arch/powerpc/kernel/isa-bridge.c index bb6f8993412e..1df6c74aa731 100644 --- a/arch/powerpc/kernel/isa-bridge.c +++ b/arch/powerpc/kernel/isa-bridge.c @@ -164,7 +164,7 @@ void __init isa_bridge_find_early(struct pci_controller *hose) /* Set the global ISA io base to indicate we have an ISA bridge */ isa_io_base = ISA_IO_BASE; - pr_debug("ISA bridge (early) is %s\n", np->full_name); + pr_debug("ISA bridge (early) is %pOF\n", np); } /** @@ -187,15 +187,15 @@ void __init isa_bridge_init_non_pci(struct device_node *np) pna = of_n_addr_cells(np); if (of_property_read_u32(np, "#address-cells", &na) || of_property_read_u32(np, "#size-cells", &ns)) { - pr_warn("ISA: Non-PCI bridge %s is missing address format\n", - np->full_name); + pr_warn("ISA: Non-PCI bridge %pOF is missing address format\n", + np); return; } /* Check it's a supported address format */ if (na != 2 || ns != 1) { - pr_warn("ISA: Non-PCI bridge %s has unsupported address format\n", - np->full_name); + pr_warn("ISA: Non-PCI bridge %pOF has unsupported address format\n", + np); return; } rs = na + ns + pna; @@ -203,8 +203,8 @@ void __init isa_bridge_init_non_pci(struct device_node *np) /* Grab the ranges property */ ranges = of_get_property(np, "ranges", &rlen); if (ranges == NULL || rlen < rs) { - pr_warn("ISA: Non-PCI bridge %s has absent or invalid ranges\n", - np->full_name); + pr_warn("ISA: Non-PCI bridge %pOF has absent or invalid ranges\n", + np); return; } @@ -220,8 +220,8 @@ void __init isa_bridge_init_non_pci(struct device_node *np) /* Got something ? */ if (!size || !pbasep) { - pr_warn("ISA: Non-PCI bridge %s has no usable IO range\n", - np->full_name); + pr_warn("ISA: Non-PCI bridge %pOF has no usable IO range\n", + np); return; } @@ -233,15 +233,15 @@ void __init isa_bridge_init_non_pci(struct device_node *np) /* Map pbase */ pbase = of_translate_address(np, pbasep); if (pbase == OF_BAD_ADDR) { - pr_warn("ISA: Non-PCI bridge %s failed to translate IO base\n", - np->full_name); + pr_warn("ISA: Non-PCI bridge %pOF failed to translate IO base\n", + np); return; } /* We need page alignment */ if ((cbase & ~PAGE_MASK) || (pbase & ~PAGE_MASK)) { - pr_warn("ISA: Non-PCI bridge %s has non aligned IO range\n", - np->full_name); + pr_warn("ISA: Non-PCI bridge %pOF has non aligned IO range\n", + np); return; } @@ -255,7 +255,7 @@ void __init isa_bridge_init_non_pci(struct device_node *np) __ioremap_at(pbase, (void *)ISA_IO_BASE, size, pgprot_val(pgprot_noncached(__pgprot(0)))); - pr_debug("ISA: Non-PCI bridge is %s\n", np->full_name); + pr_debug("ISA: Non-PCI bridge is %pOF\n", np); } /** @@ -277,8 +277,8 @@ static void isa_bridge_find_late(struct pci_dev *pdev, /* Set the global ISA io base to indicate we have an ISA bridge */ isa_io_base = ISA_IO_BASE; - pr_debug("ISA bridge (late) is %s on %s\n", - devnode->full_name, pci_name(pdev)); + pr_debug("ISA bridge (late) is %pOF on %s\n", + devnode, pci_name(pdev)); } /** diff --git a/arch/powerpc/kernel/kgdb.c b/arch/powerpc/kernel/kgdb.c index dbf098121ce6..35e240a0a408 100644 --- a/arch/powerpc/kernel/kgdb.c +++ b/arch/powerpc/kernel/kgdb.c @@ -67,9 +67,9 @@ static struct hard_trap_info #endif #else /* ! (defined(CONFIG_40x) || defined(CONFIG_BOOKE)) */ { 0x0d00, 0x05 /* SIGTRAP */ }, /* single-step */ -#if defined(CONFIG_8xx) +#if defined(CONFIG_PPC_8xx) { 0x1000, 0x04 /* SIGILL */ }, /* software emulation */ -#else /* ! CONFIG_8xx */ +#else /* ! CONFIG_PPC_8xx */ { 0x0f00, 0x04 /* SIGILL */ }, /* performance monitor */ { 0x0f20, 0x08 /* SIGFPE */ }, /* altivec unavailable */ { 0x1300, 0x05 /* SIGTRAP */ }, /* instruction address break */ diff --git a/arch/powerpc/kernel/kprobes-ftrace.c b/arch/powerpc/kernel/kprobes-ftrace.c index 6c089d9757c9..7a1f99f1b47f 100644 --- a/arch/powerpc/kernel/kprobes-ftrace.c +++ b/arch/powerpc/kernel/kprobes-ftrace.c @@ -25,6 +25,21 @@ #include <linux/preempt.h> #include <linux/ftrace.h> +/* + * This is called from ftrace code after invoking registered handlers to + * disambiguate regs->nip changes done by jprobes and livepatch. We check if + * there is an active jprobe at the provided address (mcount location). + */ +int __is_active_jprobe(unsigned long addr) +{ + if (!preemptible()) { + struct kprobe *p = raw_cpu_read(current_kprobe); + return (p && (unsigned long)p->addr == addr) ? 1 : 0; + } + + return 0; +} + static nokprobe_inline int __skip_singlestep(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb, unsigned long orig_nip) @@ -60,11 +75,8 @@ void kprobe_ftrace_handler(unsigned long nip, unsigned long parent_nip, { struct kprobe *p; struct kprobe_ctlblk *kcb; - unsigned long flags; - /* Disable irq for emulating a breakpoint and avoiding preempt */ - local_irq_save(flags); - hard_irq_disable(); + preempt_disable(); p = get_kprobe((kprobe_opcode_t *)nip); if (unlikely(!p) || kprobe_disabled(p)) @@ -86,13 +98,17 @@ void kprobe_ftrace_handler(unsigned long nip, unsigned long parent_nip, kcb->kprobe_status = KPROBE_HIT_ACTIVE; if (!p->pre_handler || !p->pre_handler(p, regs)) __skip_singlestep(p, regs, kcb, orig_nip); - /* - * If pre_handler returns !0, it sets regs->nip and - * resets current kprobe. - */ + else { + /* + * If pre_handler returns !0, it sets regs->nip and + * resets current kprobe. In this case, we should not + * re-enable preemption. + */ + return; + } } end: - local_irq_restore(flags); + preempt_enable_no_resched(); } NOKPROBE_SYMBOL(kprobe_ftrace_handler); diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index 367494dc67d9..ca5d5a081e75 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -43,12 +43,6 @@ DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); struct kretprobe_blackpoint kretprobe_blacklist[] = {{NULL, NULL}}; -int is_current_kprobe_addr(unsigned long addr) -{ - struct kprobe *p = kprobe_running(); - return (p && (unsigned long)p->addr == addr) ? 1 : 0; -} - bool arch_within_kprobe_blacklist(unsigned long addr) { return (addr >= (unsigned long)__kprobes_text_start && @@ -59,7 +53,7 @@ bool arch_within_kprobe_blacklist(unsigned long addr) kprobe_opcode_t *kprobe_lookup_name(const char *name, unsigned int offset) { - kprobe_opcode_t *addr; + kprobe_opcode_t *addr = NULL; #ifdef PPC64_ELF_ABI_v2 /* PPC64 ABIv2 needs local entry point */ @@ -91,36 +85,29 @@ kprobe_opcode_t *kprobe_lookup_name(const char *name, unsigned int offset) * Also handle <module:symbol> format. */ char dot_name[MODULE_NAME_LEN + 1 + KSYM_NAME_LEN]; - const char *modsym; bool dot_appended = false; - if ((modsym = strchr(name, ':')) != NULL) { - modsym++; - if (*modsym != '\0' && *modsym != '.') { - /* Convert to <module:.symbol> */ - strncpy(dot_name, name, modsym - name); - dot_name[modsym - name] = '.'; - dot_name[modsym - name + 1] = '\0'; - strncat(dot_name, modsym, - sizeof(dot_name) - (modsym - name) - 2); - dot_appended = true; - } else { - dot_name[0] = '\0'; - strncat(dot_name, name, sizeof(dot_name) - 1); - } - } else if (name[0] != '.') { - dot_name[0] = '.'; - dot_name[1] = '\0'; - strncat(dot_name, name, KSYM_NAME_LEN - 2); + const char *c; + ssize_t ret = 0; + int len = 0; + + if ((c = strnchr(name, MODULE_NAME_LEN, ':')) != NULL) { + c++; + len = c - name; + memcpy(dot_name, name, len); + } else + c = name; + + if (*c != '\0' && *c != '.') { + dot_name[len++] = '.'; dot_appended = true; - } else { - dot_name[0] = '\0'; - strncat(dot_name, name, KSYM_NAME_LEN - 1); } - addr = (kprobe_opcode_t *)kallsyms_lookup_name(dot_name); - if (!addr && dot_appended) { - /* Let's try the original non-dot symbol lookup */ + ret = strscpy(dot_name + len, c, KSYM_NAME_LEN); + if (ret > 0) + addr = (kprobe_opcode_t *)kallsyms_lookup_name(dot_name); + + /* Fallback to the original non-dot symbol lookup */ + if (!addr && dot_appended) addr = (kprobe_opcode_t *)kallsyms_lookup_name(name); - } #else addr = (kprobe_opcode_t *)kallsyms_lookup_name(name); #endif @@ -239,7 +226,7 @@ void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) } NOKPROBE_SYMBOL(arch_prepare_kretprobe); -int try_to_emulate(struct kprobe *p, struct pt_regs *regs) +static int try_to_emulate(struct kprobe *p, struct pt_regs *regs) { int ret; unsigned int insn = *p->ainsn.insn; @@ -261,9 +248,20 @@ int try_to_emulate(struct kprobe *p, struct pt_regs *regs) */ printk("Can't step on instruction %x\n", insn); BUG(); - } else if (ret == 0) - /* This instruction can't be boosted */ - p->ainsn.boostable = -1; + } else { + /* + * If we haven't previously emulated this instruction, then it + * can't be boosted. Note it down so we don't try to do so again. + * + * If, however, we had emulated this instruction in the past, + * then this is just an error with the current run (for + * instance, exceptions due to a load/store). We return 0 so + * that this is now single-stepped, but continue to try + * emulating it in subsequent probe hits. + */ + if (unlikely(p->ainsn.boostable != 1)) + p->ainsn.boostable = -1; + } return ret; } @@ -600,7 +598,12 @@ NOKPROBE_SYMBOL(kprobe_fault_handler); unsigned long arch_deref_entry_point(void *entry) { - return ppc_global_function_entry(entry); +#ifdef PPC64_ELF_ABI_v1 + if (!kernel_text_address((unsigned long)entry)) + return ppc_global_function_entry(entry); + else +#endif + return (unsigned long)entry; } NOKPROBE_SYMBOL(arch_deref_entry_point); @@ -634,24 +637,22 @@ NOKPROBE_SYMBOL(setjmp_pre_handler); void __used jprobe_return(void) { - asm volatile("trap" ::: "memory"); + asm volatile("jprobe_return_trap:\n" + "trap\n" + ::: "memory"); } NOKPROBE_SYMBOL(jprobe_return); -static void __used jprobe_return_end(void) -{ -} -NOKPROBE_SYMBOL(jprobe_return_end); - int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) { struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - /* - * FIXME - we should ideally be validating that we got here 'cos - * of the "trap" in jprobe_return() above, before restoring the - * saved regs... - */ + if (regs->nip != ppc_kallsyms_lookup_name("jprobe_return_trap")) { + pr_debug("longjmp_break_handler NIP (0x%lx) does not match jprobe_return_trap (0x%lx)\n", + regs->nip, ppc_kallsyms_lookup_name("jprobe_return_trap")); + return 0; + } + memcpy(regs, &kcb->jprobe_saved_regs, sizeof(struct pt_regs)); /* It's OK to start function graph tracing again */ unpause_graph_tracing(); diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c index 1086ea37c832..9ad37f827a97 100644 --- a/arch/powerpc/kernel/kvm.c +++ b/arch/powerpc/kernel/kvm.c @@ -25,7 +25,6 @@ #include <linux/kvm_para.h> #include <linux/slab.h> #include <linux/of.h> -#include <linux/nmi.h> /* hardlockup_detector_disable() */ #include <asm/reg.h> #include <asm/sections.h> @@ -719,12 +718,6 @@ static __init void kvm_free_tmp(void) static int __init kvm_guest_init(void) { - /* - * The hardlockup detector is likely to get false positives in - * KVM guests, so disable it by default. - */ - hardlockup_detector_disable(); - if (!kvm_para_available()) goto free_tmp; diff --git a/arch/powerpc/kernel/l2cr_6xx.S b/arch/powerpc/kernel/l2cr_6xx.S index 97ec8557f974..6408f09dbbd9 100644 --- a/arch/powerpc/kernel/l2cr_6xx.S +++ b/arch/powerpc/kernel/l2cr_6xx.S @@ -181,7 +181,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_SPEC7450) mtctr r4 li r4,0 1: - lwzx r0,r0,r4 + lwzx r0,0,r4 addi r4,r4,32 /* Go to start of next cache line */ bdnz 1b isync @@ -328,7 +328,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_L3CR) mtctr r4 li r4,0 1: - lwzx r0,r0,r4 + lwzx r0,0,r4 dcbf 0,r4 addi r4,r4,32 /* Go to start of next cache line */ bdnz 1b diff --git a/arch/powerpc/kernel/legacy_serial.c b/arch/powerpc/kernel/legacy_serial.c index 0694d20f85b6..33b34a58fc62 100644 --- a/arch/powerpc/kernel/legacy_serial.c +++ b/arch/powerpc/kernel/legacy_serial.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/serial.h> #include <linux/serial_8250.h> @@ -147,8 +148,8 @@ static int __init add_legacy_port(struct device_node *np, int want_index, legacy_serial_ports[index].serial_out = tsi_serial_out; } - printk(KERN_DEBUG "Found legacy serial port %d for %s\n", - index, np->full_name); + printk(KERN_DEBUG "Found legacy serial port %d for %pOF\n", + index, np); printk(KERN_DEBUG " %s=%llx, taddr=%llx, irq=%lx, clk=%d, speed=%d\n", (iotype == UPIO_PORT) ? "port" : "mem", (unsigned long long)base, (unsigned long long)taddr, irq, @@ -207,7 +208,7 @@ static int __init add_legacy_isa_port(struct device_node *np, int index = -1; u64 taddr; - DBG(" -> add_legacy_isa_port(%s)\n", np->full_name); + DBG(" -> add_legacy_isa_port(%pOF)\n", np); /* Get the ISA port number */ reg = of_get_property(np, "reg", NULL); @@ -256,7 +257,7 @@ static int __init add_legacy_pci_port(struct device_node *np, unsigned int flags; int iotype, index = -1, lindex = 0; - DBG(" -> add_legacy_pci_port(%s)\n", np->full_name); + DBG(" -> add_legacy_pci_port(%pOF)\n", np); /* We only support ports that have a clock frequency properly * encoded in the device-tree (that is have an fcode). Anything @@ -374,7 +375,7 @@ void __init find_legacy_serial_ports(void) if (path != NULL) { stdout = of_find_node_by_path(path); if (stdout) - DBG("stdout is %s\n", stdout->full_name); + DBG("stdout is %pOF\n", stdout); } else { DBG(" no linux,stdout-path !\n"); } @@ -603,7 +604,7 @@ static int __init check_legacy_serial_console(void) DBG(" can't find stdout package %s !\n", name); return -ENODEV; } - DBG("stdout is %s\n", prom_stdout->full_name); + DBG("stdout is %pOF\n", prom_stdout); name = of_get_property(prom_stdout, "name", NULL); if (!name) { diff --git a/arch/powerpc/kernel/machine_kexec_64.c b/arch/powerpc/kernel/machine_kexec_64.c index 5c12e21d0d1a..49d34d7271e7 100644 --- a/arch/powerpc/kernel/machine_kexec_64.c +++ b/arch/powerpc/kernel/machine_kexec_64.c @@ -360,7 +360,7 @@ void default_machine_kexec(struct kimage *image) /* NOTREACHED */ } -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 /* Values we need to export to the second kernel via the device tree. */ static unsigned long htab_base; static unsigned long htab_size; @@ -402,4 +402,4 @@ static int __init export_htab_values(void) return 0; } late_initcall(export_htab_values); -#endif /* CONFIG_PPC_STD_MMU_64 */ +#endif /* CONFIG_PPC_BOOK3S_64 */ diff --git a/arch/powerpc/kernel/machine_kexec_file_64.c b/arch/powerpc/kernel/machine_kexec_file_64.c index 992c0d258e5d..e4395f937d63 100644 --- a/arch/powerpc/kernel/machine_kexec_file_64.c +++ b/arch/powerpc/kernel/machine_kexec_file_64.c @@ -91,11 +91,13 @@ int arch_kimage_file_post_load_cleanup(struct kimage *image) * and that value will be returned. If all free regions are visited without * func returning non-zero, then zero will be returned. */ -int arch_kexec_walk_mem(struct kexec_buf *kbuf, int (*func)(u64, u64, void *)) +int arch_kexec_walk_mem(struct kexec_buf *kbuf, + int (*func)(struct resource *, void *)) { int ret = 0; u64 i; phys_addr_t mstart, mend; + struct resource res = { }; if (kbuf->top_down) { for_each_free_mem_range_reverse(i, NUMA_NO_NODE, 0, @@ -105,7 +107,9 @@ int arch_kexec_walk_mem(struct kexec_buf *kbuf, int (*func)(u64, u64, void *)) * range while in kexec, end points to the last byte * in the range. */ - ret = func(mstart, mend - 1, kbuf); + res.start = mstart; + res.end = mend - 1; + ret = func(&res, kbuf); if (ret) break; } @@ -117,7 +121,9 @@ int arch_kexec_walk_mem(struct kexec_buf *kbuf, int (*func)(u64, u64, void *)) * range while in kexec, end points to the last byte * in the range. */ - ret = func(mstart, mend - 1, kbuf); + res.start = mstart; + res.end = mend - 1; + ret = func(&res, kbuf); if (ret) break; } diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index e0e131e662ed..742e4658c5dc 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -22,11 +22,14 @@ #undef DEBUG #define pr_fmt(fmt) "mce: " fmt +#include <linux/hardirq.h> #include <linux/types.h> #include <linux/ptrace.h> #include <linux/percpu.h> #include <linux/export.h> #include <linux/irq_work.h> + +#include <asm/machdep.h> #include <asm/mce.h> static DEFINE_PER_CPU(int, mce_nest_count); @@ -36,11 +39,21 @@ static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event); static DEFINE_PER_CPU(int, mce_queue_count); static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event_queue); +/* Queue for delayed MCE UE events. */ +static DEFINE_PER_CPU(int, mce_ue_count); +static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], + mce_ue_event_queue); + static void machine_check_process_queued_event(struct irq_work *work); +void machine_check_ue_event(struct machine_check_event *evt); +static void machine_process_ue_event(struct work_struct *work); + static struct irq_work mce_event_process_work = { .func = machine_check_process_queued_event, }; +DECLARE_WORK(mce_ue_event_work, machine_process_ue_event); + static void mce_set_error_info(struct machine_check_event *mce, struct mce_error_info *mce_err) { @@ -79,7 +92,7 @@ static void mce_set_error_info(struct machine_check_event *mce, */ void save_mce_event(struct pt_regs *regs, long handled, struct mce_error_info *mce_err, - uint64_t nip, uint64_t addr) + uint64_t nip, uint64_t addr, uint64_t phys_addr) { int index = __this_cpu_inc_return(mce_nest_count) - 1; struct machine_check_event *mce = this_cpu_ptr(&mce_event[index]); @@ -137,6 +150,11 @@ void save_mce_event(struct pt_regs *regs, long handled, } else if (mce->error_type == MCE_ERROR_TYPE_UE) { mce->u.ue_error.effective_address_provided = true; mce->u.ue_error.effective_address = addr; + if (phys_addr != ULONG_MAX) { + mce->u.ue_error.physical_address_provided = true; + mce->u.ue_error.physical_address = phys_addr; + machine_check_ue_event(mce); + } } return; } @@ -190,6 +208,26 @@ void release_mce_event(void) get_mce_event(NULL, true); } + +/* + * Queue up the MCE event which then can be handled later. + */ +void machine_check_ue_event(struct machine_check_event *evt) +{ + int index; + + index = __this_cpu_inc_return(mce_ue_count) - 1; + /* If queue is full, just return for now. */ + if (index >= MAX_MC_EVT) { + __this_cpu_dec(mce_ue_count); + return; + } + memcpy(this_cpu_ptr(&mce_ue_event_queue[index]), evt, sizeof(*evt)); + + /* Queue work to process this event later. */ + schedule_work(&mce_ue_event_work); +} + /* * Queue up the MCE event which then can be handled later. */ @@ -212,7 +250,39 @@ void machine_check_queue_event(void) /* Queue irq work to process this event later. */ irq_work_queue(&mce_event_process_work); } - +/* + * process pending MCE event from the mce event queue. This function will be + * called during syscall exit. + */ +static void machine_process_ue_event(struct work_struct *work) +{ + int index; + struct machine_check_event *evt; + + while (__this_cpu_read(mce_ue_count) > 0) { + index = __this_cpu_read(mce_ue_count) - 1; + evt = this_cpu_ptr(&mce_ue_event_queue[index]); +#ifdef CONFIG_MEMORY_FAILURE + /* + * This should probably queued elsewhere, but + * oh! well + */ + if (evt->error_type == MCE_ERROR_TYPE_UE) { + if (evt->u.ue_error.physical_address_provided) { + unsigned long pfn; + + pfn = evt->u.ue_error.physical_address >> + PAGE_SHIFT; + memory_failure(pfn, SIGBUS, 0); + } else + pr_warn("Failed to identify bad address from " + "where the uncorrectable error (UE) " + "was generated\n"); + } +#endif + __this_cpu_dec(mce_ue_count); + } +} /* * process pending MCE event from the mce event queue. This function will be * called during syscall exit. @@ -220,6 +290,7 @@ void machine_check_queue_event(void) static void machine_check_process_queued_event(struct irq_work *work) { int index; + struct machine_check_event *evt; add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); @@ -229,8 +300,8 @@ static void machine_check_process_queued_event(struct irq_work *work) */ while (__this_cpu_read(mce_queue_count) > 0) { index = __this_cpu_read(mce_queue_count) - 1; - machine_check_print_event_info( - this_cpu_ptr(&mce_event_queue[index]), false); + evt = this_cpu_ptr(&mce_event_queue[index]); + machine_check_print_event_info(evt, false); __this_cpu_dec(mce_queue_count); } } @@ -337,7 +408,7 @@ void machine_check_print_event_info(struct machine_check_event *evt, printk("%s Effective address: %016llx\n", level, evt->u.ue_error.effective_address); if (evt->u.ue_error.physical_address_provided) - printk("%s Physical address: %016llx\n", + printk("%s Physical address: %016llx\n", level, evt->u.ue_error.physical_address); break; case MCE_ERROR_TYPE_SLB: @@ -408,41 +479,60 @@ void machine_check_print_event_info(struct machine_check_event *evt, } EXPORT_SYMBOL_GPL(machine_check_print_event_info); -uint64_t get_mce_fault_addr(struct machine_check_event *evt) +/* + * This function is called in real mode. Strictly no printk's please. + * + * regs->nip and regs->msr contains srr0 and ssr1. + */ +long machine_check_early(struct pt_regs *regs) { - switch (evt->error_type) { - case MCE_ERROR_TYPE_UE: - if (evt->u.ue_error.effective_address_provided) - return evt->u.ue_error.effective_address; - break; - case MCE_ERROR_TYPE_SLB: - if (evt->u.slb_error.effective_address_provided) - return evt->u.slb_error.effective_address; - break; - case MCE_ERROR_TYPE_ERAT: - if (evt->u.erat_error.effective_address_provided) - return evt->u.erat_error.effective_address; - break; - case MCE_ERROR_TYPE_TLB: - if (evt->u.tlb_error.effective_address_provided) - return evt->u.tlb_error.effective_address; - break; - case MCE_ERROR_TYPE_USER: - if (evt->u.user_error.effective_address_provided) - return evt->u.user_error.effective_address; - break; - case MCE_ERROR_TYPE_RA: - if (evt->u.ra_error.effective_address_provided) - return evt->u.ra_error.effective_address; - break; - case MCE_ERROR_TYPE_LINK: - if (evt->u.link_error.effective_address_provided) - return evt->u.link_error.effective_address; - break; - default: - case MCE_ERROR_TYPE_UNKNOWN: - break; + long handled = 0; + + __this_cpu_inc(irq_stat.mce_exceptions); + + if (cur_cpu_spec && cur_cpu_spec->machine_check_early) + handled = cur_cpu_spec->machine_check_early(regs); + return handled; +} + +long hmi_exception_realmode(struct pt_regs *regs) +{ + __this_cpu_inc(irq_stat.hmi_exceptions); + +#ifdef CONFIG_PPC_BOOK3S_64 + /* Workaround for P9 vector CI loads (see p9_hmi_special_emu) */ + if (pvr_version_is(PVR_POWER9)) { + unsigned long hmer = mfspr(SPRN_HMER); + + /* Do we have the debug bit set */ + if (hmer & PPC_BIT(17)) { + hmer &= ~PPC_BIT(17); + mtspr(SPRN_HMER, hmer); + + /* + * Now to avoid problems with soft-disable we + * only do the emulation if we are coming from + * user space + */ + if (user_mode(regs)) + local_paca->hmi_p9_special_emu = 1; + + /* + * Don't bother going to OPAL if that's the + * only relevant bit. + */ + if (!(hmer & mfspr(SPRN_HMEER))) + return local_paca->hmi_p9_special_emu; + } } - return 0; +#endif /* CONFIG_PPC_BOOK3S_64 */ + + wait_for_subcore_guest_exit(); + + if (ppc_md.hmi_exception_early) + ppc_md.hmi_exception_early(regs); + + wait_for_tb_resync(); + + return 1; } -EXPORT_SYMBOL(get_mce_fault_addr); diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c index b76ca198e09c..644f7040b91c 100644 --- a/arch/powerpc/kernel/mce_power.c +++ b/arch/powerpc/kernel/mce_power.c @@ -27,6 +27,36 @@ #include <asm/mmu.h> #include <asm/mce.h> #include <asm/machdep.h> +#include <asm/pgtable.h> +#include <asm/pte-walk.h> +#include <asm/sstep.h> +#include <asm/exception-64s.h> + +/* + * Convert an address related to an mm to a PFN. NOTE: we are in real + * mode, we could potentially race with page table updates. + */ +static unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr) +{ + pte_t *ptep; + unsigned long flags; + struct mm_struct *mm; + + if (user_mode(regs)) + mm = current->mm; + else + mm = &init_mm; + + local_irq_save(flags); + if (mm == current->mm) + ptep = find_current_mm_pte(mm->pgd, addr, NULL, NULL); + else + ptep = find_init_mm_pte(addr, NULL); + local_irq_restore(flags); + if (!ptep || pte_special(*ptep)) + return ULONG_MAX; + return pte_pfn(*ptep); +} static void flush_tlb_206(unsigned int num_sets, unsigned int action) { @@ -128,7 +158,7 @@ void __flush_tlb_power9(unsigned int action) { unsigned int num_sets; - if (radix_enabled()) + if (early_radix_enabled()) num_sets = POWER9_TLB_SETS_RADIX; else num_sets = POWER9_TLB_SETS_HASH; @@ -138,7 +168,7 @@ void __flush_tlb_power9(unsigned int action) /* flush SLBs and reload */ -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 static void flush_and_reload_slb(void) { struct slb_shadow *slb; @@ -185,7 +215,7 @@ static void flush_erat(void) static int mce_flush(int what) { -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 if (what == MCE_FLUSH_SLB) { flush_and_reload_slb(); return 1; @@ -421,9 +451,45 @@ static const struct mce_derror_table mce_p9_derror_table[] = { MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, { 0, false, 0, 0, 0, 0 } }; +static int mce_find_instr_ea_and_pfn(struct pt_regs *regs, uint64_t *addr, + uint64_t *phys_addr) +{ + /* + * Carefully look at the NIP to determine + * the instruction to analyse. Reading the NIP + * in real-mode is tricky and can lead to recursive + * faults + */ + int instr; + unsigned long pfn, instr_addr; + struct instruction_op op; + struct pt_regs tmp = *regs; + + pfn = addr_to_pfn(regs, regs->nip); + if (pfn != ULONG_MAX) { + instr_addr = (pfn << PAGE_SHIFT) + (regs->nip & ~PAGE_MASK); + instr = *(unsigned int *)(instr_addr); + if (!analyse_instr(&op, &tmp, instr)) { + pfn = addr_to_pfn(regs, op.ea); + *addr = op.ea; + *phys_addr = (pfn << PAGE_SHIFT); + return 0; + } + /* + * analyse_instr() might fail if the instruction + * is not a load/store, although this is unexpected + * for load/store errors or if we got the NIP + * wrong + */ + } + *addr = 0; + return -1; +} + static int mce_handle_ierror(struct pt_regs *regs, const struct mce_ierror_table table[], - struct mce_error_info *mce_err, uint64_t *addr) + struct mce_error_info *mce_err, uint64_t *addr, + uint64_t *phys_addr) { uint64_t srr1 = regs->msr; int handled = 0; @@ -475,8 +541,22 @@ static int mce_handle_ierror(struct pt_regs *regs, } mce_err->severity = table[i].severity; mce_err->initiator = table[i].initiator; - if (table[i].nip_valid) + if (table[i].nip_valid) { *addr = regs->nip; + if (mce_err->severity == MCE_SEV_ERROR_SYNC && + table[i].error_type == MCE_ERROR_TYPE_UE) { + unsigned long pfn; + + if (get_paca()->in_mce < MAX_MCE_DEPTH) { + pfn = addr_to_pfn(regs, regs->nip); + if (pfn != ULONG_MAX) { + *phys_addr = + (pfn << PAGE_SHIFT); + handled = 1; + } + } + } + } return handled; } @@ -489,7 +569,8 @@ static int mce_handle_ierror(struct pt_regs *regs, static int mce_handle_derror(struct pt_regs *regs, const struct mce_derror_table table[], - struct mce_error_info *mce_err, uint64_t *addr) + struct mce_error_info *mce_err, uint64_t *addr, + uint64_t *phys_addr) { uint64_t dsisr = regs->dsisr; int handled = 0; @@ -555,7 +636,17 @@ static int mce_handle_derror(struct pt_regs *regs, mce_err->initiator = table[i].initiator; if (table[i].dar_valid) *addr = regs->dar; - + else if (mce_err->severity == MCE_SEV_ERROR_SYNC && + table[i].error_type == MCE_ERROR_TYPE_UE) { + /* + * We do a maximum of 4 nested MCE calls, see + * kernel/exception-64s.h + */ + if (get_paca()->in_mce < MAX_MCE_DEPTH) + if (!mce_find_instr_ea_and_pfn(regs, addr, + phys_addr)) + handled = 1; + } found = 1; } @@ -592,19 +683,21 @@ static long mce_handle_error(struct pt_regs *regs, const struct mce_ierror_table itable[]) { struct mce_error_info mce_err = { 0 }; - uint64_t addr; + uint64_t addr, phys_addr; uint64_t srr1 = regs->msr; long handled; if (SRR1_MC_LOADSTORE(srr1)) - handled = mce_handle_derror(regs, dtable, &mce_err, &addr); + handled = mce_handle_derror(regs, dtable, &mce_err, &addr, + &phys_addr); else - handled = mce_handle_ierror(regs, itable, &mce_err, &addr); + handled = mce_handle_ierror(regs, itable, &mce_err, &addr, + &phys_addr); if (!handled && mce_err.error_type == MCE_ERROR_TYPE_UE) handled = mce_handle_ue_error(regs); - save_mce_event(regs, handled, &mce_err, regs->nip, addr); + save_mce_event(regs, handled, &mce_err, regs->nip, addr, phys_addr); return handled; } @@ -624,5 +717,18 @@ long __machine_check_early_realmode_p8(struct pt_regs *regs) long __machine_check_early_realmode_p9(struct pt_regs *regs) { + /* + * On POWER9 DD2.1 and below, it's possible to get a machine check + * caused by a paste instruction where only DSISR bit 25 is set. This + * will result in the MCE handler seeing an unknown event and the kernel + * crashing. An MCE that occurs like this is spurious, so we don't need + * to do anything in terms of servicing it. If there is something that + * needs to be serviced, the CPU will raise the MCE again with the + * correct DSISR so that it can be serviced properly. So detect this + * case and mark it as handled. + */ + if (SRR1_MC_LOADSTORE(regs->msr) && regs->dsisr == 0x02000000) + return 1; + return mce_handle_error(regs, mce_p9_derror_table, mce_p9_ierror_table); } diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c index 0b0f89685b67..759104b99f9f 100644 --- a/arch/powerpc/kernel/module_64.c +++ b/arch/powerpc/kernel/module_64.c @@ -429,7 +429,8 @@ static unsigned long stub_for_addr(const Elf64_Shdr *sechdrs, /* Find this stub, or if that fails, the next avail. entry */ stubs = (void *)sechdrs[me->arch.stubs_section].sh_addr; for (i = 0; stub_func_addr(stubs[i].funcdata); i++) { - BUG_ON(i >= num_stubs); + if (WARN_ON(i >= num_stubs)) + return 0; if (stub_func_addr(stubs[i].funcdata) == func_addr(addr)) return (unsigned long)&stubs[i]; diff --git a/arch/powerpc/kernel/of_platform.c b/arch/powerpc/kernel/of_platform.c index 34aeac54f120..becaec990140 100644 --- a/arch/powerpc/kernel/of_platform.c +++ b/arch/powerpc/kernel/of_platform.c @@ -45,7 +45,7 @@ static int of_pci_phb_probe(struct platform_device *dev) if (ppc_md.pci_setup_phb == NULL) return -ENODEV; - pr_info("Setting up PCI bus %s\n", dev->dev.of_node->full_name); + pr_info("Setting up PCI bus %pOF\n", dev->dev.of_node); /* Alloc and setup PHB data structure */ phb = pcibios_alloc_controller(dev->dev.of_node); diff --git a/arch/powerpc/kernel/optprobes.c b/arch/powerpc/kernel/optprobes.c index 6f8273f5e988..8237884ca389 100644 --- a/arch/powerpc/kernel/optprobes.c +++ b/arch/powerpc/kernel/optprobes.c @@ -104,8 +104,10 @@ static unsigned long can_optimize(struct kprobe *p) * and that can be emulated. */ if (!is_conditional_branch(*p->ainsn.insn) && - analyse_instr(&op, ®s, *p->ainsn.insn)) + analyse_instr(&op, ®s, *p->ainsn.insn) == 1) { + emulate_update_regs(®s, &op); nip = regs.nip; + } return nip; } @@ -113,32 +115,23 @@ static unsigned long can_optimize(struct kprobe *p) static void optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs) { - struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - unsigned long flags; - /* This is possible if op is under delayed unoptimizing */ if (kprobe_disabled(&op->kp)) return; - local_irq_save(flags); - hard_irq_disable(); + preempt_disable(); if (kprobe_running()) { kprobes_inc_nmissed_count(&op->kp); } else { __this_cpu_write(current_kprobe, &op->kp); regs->nip = (unsigned long)op->kp.addr; - kcb->kprobe_status = KPROBE_HIT_ACTIVE; + get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE; opt_pre_handler(&op->kp, regs); __this_cpu_write(current_kprobe, NULL); } - /* - * No need for an explicit __hard_irq_enable() here. - * local_irq_restore() will re-enable interrupts, - * if they were hard disabled. - */ - local_irq_restore(flags); + preempt_enable_no_resched(); } NOKPROBE_SYMBOL(optimized_callback); diff --git a/arch/powerpc/kernel/optprobes_head.S b/arch/powerpc/kernel/optprobes_head.S index 4937bef7652f..52fc864cdec4 100644 --- a/arch/powerpc/kernel/optprobes_head.S +++ b/arch/powerpc/kernel/optprobes_head.S @@ -60,10 +60,6 @@ optprobe_template_entry: std r5,_CCR(r1) lbz r5,PACASOFTIRQEN(r13) std r5,SOFTE(r1) - mfdar r5 - std r5,_DAR(r1) - mfdsisr r5 - std r5,_DSISR(r1) /* * We may get here from a module, so load the kernel TOC in r2. @@ -122,10 +118,6 @@ optprobe_template_call_emulate: mtxer r5 ld r5,_CCR(r1) mtcr r5 - ld r5,_DAR(r1) - mtdar r5 - ld r5,_DSISR(r1) - mtdsisr r5 REST_GPR(0,r1) REST_10GPRS(2,r1) REST_10GPRS(12,r1) diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c index 8d63627e067f..d6597038931d 100644 --- a/arch/powerpc/kernel/paca.c +++ b/arch/powerpc/kernel/paca.c @@ -90,7 +90,7 @@ static inline void free_lppacas(void) { } #endif /* CONFIG_PPC_BOOK3S */ -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 /* * 3 persistent SLBs are registered here. The buffer will be zero @@ -99,18 +99,27 @@ static inline void free_lppacas(void) { } * If you make the number of persistent SLB entries dynamic, please also * update PR KVM to flush and restore them accordingly. */ -static struct slb_shadow *slb_shadow; +static struct slb_shadow * __initdata slb_shadow; static void __init allocate_slb_shadows(int nr_cpus, int limit) { int size = PAGE_ALIGN(sizeof(struct slb_shadow) * nr_cpus); + + if (early_radix_enabled()) + return; + slb_shadow = __va(memblock_alloc_base(size, PAGE_SIZE, limit)); memset(slb_shadow, 0, size); } static struct slb_shadow * __init init_slb_shadow(int cpu) { - struct slb_shadow *s = &slb_shadow[cpu]; + struct slb_shadow *s; + + if (early_radix_enabled()) + return NULL; + + s = &slb_shadow[cpu]; /* * When we come through here to initialise boot_paca, the slb_shadow @@ -126,11 +135,11 @@ static struct slb_shadow * __init init_slb_shadow(int cpu) return s; } -#else /* CONFIG_PPC_STD_MMU_64 */ +#else /* !CONFIG_PPC_BOOK3S_64 */ static void __init allocate_slb_shadows(int nr_cpus, int limit) { } -#endif /* CONFIG_PPC_STD_MMU_64 */ +#endif /* CONFIG_PPC_BOOK3S_64 */ /* The Paca is an array with one entry per processor. Each contains an * lppaca, which contains the information shared between the @@ -161,9 +170,9 @@ void __init initialise_paca(struct paca_struct *new_paca, int cpu) new_paca->kexec_state = KEXEC_STATE_NONE; new_paca->__current = &init_task; new_paca->data_offset = 0xfeeeeeeeeeeeeeeeULL; -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 new_paca->slb_shadow_ptr = init_slb_shadow(cpu); -#endif /* CONFIG_PPC_STD_MMU_64 */ +#endif #ifdef CONFIG_PPC_BOOK3E /* For now -- if we have threads this will be adjusted later */ @@ -215,7 +224,7 @@ void __init allocate_pacas(void) paca = __va(memblock_alloc_base(paca_size, PAGE_SIZE, limit)); memset(paca, 0, paca_size); - printk(KERN_DEBUG "Allocated %u bytes for %d pacas at %p\n", + printk(KERN_DEBUG "Allocated %u bytes for %u pacas at %p\n", paca_size, nr_cpu_ids, paca); allocate_lppacas(nr_cpu_ids, limit); @@ -253,8 +262,8 @@ void copy_mm_to_paca(struct mm_struct *mm) get_paca()->mm_ctx_id = context->id; #ifdef CONFIG_PPC_MM_SLICES - VM_BUG_ON(!mm->context.addr_limit); - get_paca()->addr_limit = mm->context.addr_limit; + VM_BUG_ON(!mm->context.slb_addr_limit); + get_paca()->mm_ctx_slb_addr_limit = mm->context.slb_addr_limit; get_paca()->mm_ctx_low_slices_psize = context->low_slices_psize; memcpy(&get_paca()->mm_ctx_high_slices_psize, &context->high_slices_psize, TASK_SLICE_ARRAY_SZ(mm)); @@ -262,7 +271,7 @@ void copy_mm_to_paca(struct mm_struct *mm) get_paca()->mm_ctx_user_psize = context->user_psize; get_paca()->mm_ctx_sllp = context->sllp; #endif -#else /* CONFIG_PPC_BOOK3S */ +#else /* !CONFIG_PPC_BOOK3S */ return; #endif } diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index 341a7469cab8..0ac7aa346c69 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -373,9 +373,8 @@ static int pci_read_irq_line(struct pci_dev *pci_dev) if (virq) irq_set_irq_type(virq, IRQ_TYPE_LEVEL_LOW); } else { - pr_debug(" Got one, spec %d cells (0x%08x 0x%08x...) on %s\n", - oirq.args_count, oirq.args[0], oirq.args[1], - of_node_full_name(oirq.np)); + pr_debug(" Got one, spec %d cells (0x%08x 0x%08x...) on %pOF\n", + oirq.args_count, oirq.args[0], oirq.args[1], oirq.np); virq = irq_create_of_mapping(&oirq); } @@ -741,8 +740,8 @@ void pci_process_bridge_OF_ranges(struct pci_controller *hose, struct of_pci_range range; struct of_pci_range_parser parser; - printk(KERN_INFO "PCI host bridge %s %s ranges:\n", - dev->full_name, primary ? "(primary)" : ""); + printk(KERN_INFO "PCI host bridge %pOF %s ranges:\n", + dev, primary ? "(primary)" : ""); /* Check for ranges property */ if (of_pci_range_parser_init(&parser, dev)) @@ -1556,8 +1555,8 @@ static void pcibios_setup_phb_resources(struct pci_controller *hose, if (!res->flags) { pr_debug("PCI: I/O resource not set for host" - " bridge %s (domain %d)\n", - hose->dn->full_name, hose->global_number); + " bridge %pOF (domain %d)\n", + hose->dn, hose->global_number); } else { offset = pcibios_io_space_offset(hose); @@ -1668,7 +1667,7 @@ void pcibios_scan_phb(struct pci_controller *hose) struct device_node *node = hose->dn; int mode; - pr_debug("PCI: Scanning PHB %s\n", of_node_full_name(node)); + pr_debug("PCI: Scanning PHB %pOF\n", node); /* Get some IO space for the new PHB */ pcibios_setup_phb_io_space(hose); @@ -1741,15 +1740,3 @@ static void fixup_hide_host_resource_fsl(struct pci_dev *dev) } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_MOTOROLA, PCI_ANY_ID, fixup_hide_host_resource_fsl); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_FREESCALE, PCI_ANY_ID, fixup_hide_host_resource_fsl); - -static void fixup_vga(struct pci_dev *pdev) -{ - u16 cmd; - - pci_read_config_word(pdev, PCI_COMMAND, &cmd); - if ((cmd & (PCI_COMMAND_IO | PCI_COMMAND_MEMORY)) || !vga_default_device()) - vga_set_default_device(pdev); - -} -DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_ANY_ID, PCI_ANY_ID, - PCI_CLASS_DISPLAY_VGA, 8, fixup_vga); diff --git a/arch/powerpc/kernel/pci_32.c b/arch/powerpc/kernel/pci_32.c index 41c86c6b6e4d..1d817f4d97d9 100644 --- a/arch/powerpc/kernel/pci_32.c +++ b/arch/powerpc/kernel/pci_32.c @@ -79,8 +79,8 @@ make_one_node_map(struct device_node* node, u8 pci_bus) return; bus_range = of_get_property(node, "bus-range", &len); if (bus_range == NULL || len < 2 * sizeof(int)) { - printk(KERN_WARNING "Can't get bus-range for %s, " - "assuming it starts at 0\n", node->full_name); + printk(KERN_WARNING "Can't get bus-range for %pOF, " + "assuming it starts at 0\n", node); pci_to_OF_bus_map[pci_bus] = 0; } else pci_to_OF_bus_map[pci_bus] = bus_range[0]; diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c index ed5e9ff61a68..15ce0306b092 100644 --- a/arch/powerpc/kernel/pci_64.c +++ b/arch/powerpc/kernel/pci_64.c @@ -90,14 +90,14 @@ int pcibios_unmap_io_space(struct pci_bus *bus) * to do an appropriate TLB flush here too */ if (bus->self) { -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 struct resource *res = bus->resource[0]; #endif pr_debug("IO unmapping for PCI-PCI bridge %s\n", pci_name(bus->self)); -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 __flush_hash_table_range(&init_mm, res->start + _IO_BASE, res->end + _IO_BASE + 1); #endif @@ -111,7 +111,7 @@ int pcibios_unmap_io_space(struct pci_bus *bus) if (hose->io_base_alloc == NULL) return 0; - pr_debug("IO unmapping for PHB %s\n", hose->dn->full_name); + pr_debug("IO unmapping for PHB %pOF\n", hose->dn); pr_debug(" alloc=0x%p\n", hose->io_base_alloc); /* This is a PHB, we fully unmap the IO area */ @@ -151,7 +151,7 @@ static int pcibios_map_phb_io_space(struct pci_controller *hose) hose->io_base_virt = (void __iomem *)(area->addr + hose->io_base_phys - phys_page); - pr_debug("IO mapping for PHB %s\n", hose->dn->full_name); + pr_debug("IO mapping for PHB %pOF\n", hose->dn); pr_debug(" phys=0x%016llx, virt=0x%p (alloc=0x%p)\n", hose->io_base_phys, hose->io_base_virt, hose->io_base_alloc); pr_debug(" size=0x%016llx (alloc=0x%016lx)\n", diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c index 592693437070..0e395afbf0f4 100644 --- a/arch/powerpc/kernel/pci_dn.c +++ b/arch/powerpc/kernel/pci_dn.c @@ -139,7 +139,6 @@ struct pci_dn *pci_get_pdn(struct pci_dev *pdev) #ifdef CONFIG_PCI_IOV static struct pci_dn *add_one_dev_pci_data(struct pci_dn *parent, - struct pci_dev *pdev, int vf_index, int busno, int devfn) { @@ -150,10 +149,8 @@ static struct pci_dn *add_one_dev_pci_data(struct pci_dn *parent, return NULL; pdn = kzalloc(sizeof(*pdn), GFP_KERNEL); - if (!pdn) { - dev_warn(&pdev->dev, "%s: Out of memory!\n", __func__); + if (!pdn) return NULL; - } pdn->phb = parent->phb; pdn->parent = parent; @@ -167,13 +164,6 @@ static struct pci_dn *add_one_dev_pci_data(struct pci_dn *parent, INIT_LIST_HEAD(&pdn->list); list_add_tail(&pdn->list, &parent->child_list); - /* - * If we already have PCI device instance, lets - * bind them. - */ - if (pdev) - pdev->dev.archdata.pci_data = pdn; - return pdn; } #endif @@ -201,7 +191,7 @@ struct pci_dn *add_dev_pci_data(struct pci_dev *pdev) for (i = 0; i < pci_sriov_get_totalvfs(pdev); i++) { struct eeh_dev *edev __maybe_unused; - pdn = add_one_dev_pci_data(parent, NULL, i, + pdn = add_one_dev_pci_data(parent, i, pci_iov_virtfn_bus(pdev, i), pci_iov_virtfn_devfn(pdev, i)); if (!pdn) { @@ -303,7 +293,6 @@ struct pci_dn *pci_add_device_node_info(struct pci_controller *hose, if (pdn == NULL) return NULL; dn->data = pdn; - pdn->node = dn; pdn->phb = hose; #ifdef CONFIG_PPC_POWERNV pdn->pe_number = IODA_INVALID_PE; @@ -352,6 +341,7 @@ EXPORT_SYMBOL_GPL(pci_add_device_node_info); void pci_remove_device_node_info(struct device_node *dn) { struct pci_dn *pdn = dn ? PCI_DN(dn) : NULL; + struct device_node *parent; #ifdef CONFIG_EEH struct eeh_dev *edev = pdn_to_eeh_dev(pdn); @@ -364,8 +354,10 @@ void pci_remove_device_node_info(struct device_node *dn) WARN_ON(!list_empty(&pdn->child_list)); list_del(&pdn->list); - if (pdn->parent) - of_node_put(pdn->parent->node); + + parent = of_get_parent(dn); + if (parent) + of_node_put(parent); dn->data = NULL; kfree(pdn); diff --git a/arch/powerpc/kernel/pci_of_scan.c b/arch/powerpc/kernel/pci_of_scan.c index ea3d98115b88..0d790f8432d2 100644 --- a/arch/powerpc/kernel/pci_of_scan.c +++ b/arch/powerpc/kernel/pci_of_scan.c @@ -211,19 +211,19 @@ void of_scan_pci_bridge(struct pci_dev *dev) unsigned int flags; u64 size; - pr_debug("of_scan_pci_bridge(%s)\n", node->full_name); + pr_debug("of_scan_pci_bridge(%pOF)\n", node); /* parse bus-range property */ busrange = of_get_property(node, "bus-range", &len); if (busrange == NULL || len != 8) { - printk(KERN_DEBUG "Can't get bus-range for PCI-PCI bridge %s\n", - node->full_name); + printk(KERN_DEBUG "Can't get bus-range for PCI-PCI bridge %pOF\n", + node); return; } ranges = of_get_property(node, "ranges", &len); if (ranges == NULL) { - printk(KERN_DEBUG "Can't get ranges for PCI-PCI bridge %s\n", - node->full_name); + printk(KERN_DEBUG "Can't get ranges for PCI-PCI bridge %pOF\n", + node); return; } @@ -233,8 +233,8 @@ void of_scan_pci_bridge(struct pci_dev *dev) bus = pci_add_new_bus(dev->bus, dev, of_read_number(busrange, 1)); if (!bus) { - printk(KERN_ERR "Failed to create pci bus for %s\n", - node->full_name); + printk(KERN_ERR "Failed to create pci bus for %pOF\n", + node); return; } } @@ -262,13 +262,13 @@ void of_scan_pci_bridge(struct pci_dev *dev) res = bus->resource[0]; if (res->flags) { printk(KERN_ERR "PCI: ignoring extra I/O range" - " for bridge %s\n", node->full_name); + " for bridge %pOF\n", node); continue; } } else { if (i >= PCI_NUM_RESOURCES - PCI_BRIDGE_RESOURCES) { printk(KERN_ERR "PCI: too many memory ranges" - " for bridge %s\n", node->full_name); + " for bridge %pOF\n", node); continue; } res = bus->resource[i]; @@ -307,7 +307,7 @@ static struct pci_dev *of_scan_pci_dev(struct pci_bus *bus, struct eeh_dev *edev = pdn_to_eeh_dev(PCI_DN(dn)); #endif - pr_debug(" * %s\n", dn->full_name); + pr_debug(" * %pOF\n", dn); if (!of_device_is_available(dn)) return NULL; @@ -350,8 +350,8 @@ static void __of_scan_bus(struct device_node *node, struct pci_bus *bus, struct device_node *child; struct pci_dev *dev; - pr_debug("of_scan_bus(%s) bus no %d...\n", - node->full_name, bus->number); + pr_debug("of_scan_bus(%pOF) bus no %d...\n", + node, bus->number); /* Scan direct children */ for_each_child_of_node(node, child) { diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 9f3e2c932dcc..bfdd783e3916 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -77,6 +77,13 @@ extern unsigned long _get_SP(void); #ifdef CONFIG_PPC_TRANSACTIONAL_MEM +/* + * Are we running in "Suspend disabled" mode? If so we have to block any + * sigreturn that would get us into suspended state, and we also warn in some + * other paths that we should never reach with suspend disabled. + */ +bool tm_suspend_disabled __ro_after_init = false; + static void check_if_tm_restore_required(struct task_struct *tsk) { /* @@ -97,9 +104,23 @@ static inline bool msr_tm_active(unsigned long msr) { return MSR_TM_ACTIVE(msr); } + +static bool tm_active_with_fp(struct task_struct *tsk) +{ + return msr_tm_active(tsk->thread.regs->msr) && + (tsk->thread.ckpt_regs.msr & MSR_FP); +} + +static bool tm_active_with_altivec(struct task_struct *tsk) +{ + return msr_tm_active(tsk->thread.regs->msr) && + (tsk->thread.ckpt_regs.msr & MSR_VEC); +} #else static inline bool msr_tm_active(unsigned long msr) { return false; } static inline void check_if_tm_restore_required(struct task_struct *tsk) { } +static inline bool tm_active_with_fp(struct task_struct *tsk) { return false; } +static inline bool tm_active_with_altivec(struct task_struct *tsk) { return false; } #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ bool strict_msr_control; @@ -230,8 +251,9 @@ void enable_kernel_fp(void) } EXPORT_SYMBOL(enable_kernel_fp); -static int restore_fp(struct task_struct *tsk) { - if (tsk->thread.load_fp || msr_tm_active(tsk->thread.regs->msr)) { +static int restore_fp(struct task_struct *tsk) +{ + if (tsk->thread.load_fp || tm_active_with_fp(tsk)) { load_fp_state(¤t->thread.fp_state); current->thread.load_fp++; return 1; @@ -313,7 +335,7 @@ EXPORT_SYMBOL_GPL(flush_altivec_to_thread); static int restore_altivec(struct task_struct *tsk) { if (cpu_has_feature(CPU_FTR_ALTIVEC) && - (tsk->thread.load_vec || msr_tm_active(tsk->thread.regs->msr))) { + (tsk->thread.load_vec || tm_active_with_altivec(tsk))) { load_vr_state(&tsk->thread.vr_state); tsk->thread.used_vr = 1; tsk->thread.load_vec++; @@ -330,11 +352,19 @@ static inline int restore_altivec(struct task_struct *tsk) { return 0; } #ifdef CONFIG_VSX static void __giveup_vsx(struct task_struct *tsk) { - if (tsk->thread.regs->msr & MSR_FP) + unsigned long msr = tsk->thread.regs->msr; + + /* + * We should never be ssetting MSR_VSX without also setting + * MSR_FP and MSR_VEC + */ + WARN_ON((msr & MSR_VSX) && !((msr & MSR_FP) && (msr & MSR_VEC))); + + /* __giveup_fpu will clear MSR_VSX */ + if (msr & MSR_FP) __giveup_fpu(tsk); - if (tsk->thread.regs->msr & MSR_VEC) + if (msr & MSR_VEC) __giveup_altivec(tsk); - tsk->thread.regs->msr &= ~MSR_VSX; } static void giveup_vsx(struct task_struct *tsk) @@ -346,14 +376,6 @@ static void giveup_vsx(struct task_struct *tsk) msr_check_and_clear(MSR_FP|MSR_VEC|MSR_VSX); } -static void save_vsx(struct task_struct *tsk) -{ - if (tsk->thread.regs->msr & MSR_FP) - save_fpu(tsk); - if (tsk->thread.regs->msr & MSR_VEC) - save_altivec(tsk); -} - void enable_kernel_vsx(void) { unsigned long cpumsr; @@ -362,7 +384,8 @@ void enable_kernel_vsx(void) cpumsr = msr_check_and_set(MSR_FP|MSR_VEC|MSR_VSX); - if (current->thread.regs && (current->thread.regs->msr & MSR_VSX)) { + if (current->thread.regs && + (current->thread.regs->msr & (MSR_VSX|MSR_VEC|MSR_FP))) { check_if_tm_restore_required(current); /* * If a thread has already been reclaimed then the @@ -373,10 +396,6 @@ void enable_kernel_vsx(void) */ if(!msr_tm_active(cpumsr) && msr_tm_active(current->thread.regs->msr)) return; - if (current->thread.regs->msr & MSR_FP) - __giveup_fpu(current); - if (current->thread.regs->msr & MSR_VEC) - __giveup_altivec(current); __giveup_vsx(current); } } @@ -386,7 +405,7 @@ void flush_vsx_to_thread(struct task_struct *tsk) { if (tsk->thread.regs) { preempt_disable(); - if (tsk->thread.regs->msr & MSR_VSX) { + if (tsk->thread.regs->msr & (MSR_VSX|MSR_VEC|MSR_FP)) { BUG_ON(tsk != current); giveup_vsx(tsk); } @@ -406,7 +425,6 @@ static int restore_vsx(struct task_struct *tsk) } #else static inline int restore_vsx(struct task_struct *tsk) { return 0; } -static inline void save_vsx(struct task_struct *tsk) { } #endif /* CONFIG_VSX */ #ifdef CONFIG_SPE @@ -486,6 +504,8 @@ void giveup_all(struct task_struct *tsk) msr_check_and_set(msr_all_available); check_if_tm_restore_required(tsk); + WARN_ON((usermsr & MSR_VSX) && !((usermsr & MSR_FP) && (usermsr & MSR_VEC))); + #ifdef CONFIG_PPC_FPU if (usermsr & MSR_FP) __giveup_fpu(tsk); @@ -494,10 +514,6 @@ void giveup_all(struct task_struct *tsk) if (usermsr & MSR_VEC) __giveup_altivec(tsk); #endif -#ifdef CONFIG_VSX - if (usermsr & MSR_VSX) - __giveup_vsx(tsk); -#endif #ifdef CONFIG_SPE if (usermsr & MSR_SPE) __giveup_spe(tsk); @@ -511,10 +527,6 @@ void restore_math(struct pt_regs *regs) { unsigned long msr; - /* - * Syscall exit makes a similar initial check before branching - * to restore_math. Keep them in synch. - */ if (!msr_tm_active(regs->msr) && !current->thread.load_fp && !loadvec(current->thread)) return; @@ -556,19 +568,13 @@ void save_all(struct task_struct *tsk) msr_check_and_set(msr_all_available); - /* - * Saving the way the register space is in hardware, save_vsx boils - * down to a save_fpu() and save_altivec() - */ - if (usermsr & MSR_VSX) { - save_vsx(tsk); - } else { - if (usermsr & MSR_FP) - save_fpu(tsk); + WARN_ON((usermsr & MSR_VSX) && !((usermsr & MSR_FP) && (usermsr & MSR_VEC))); - if (usermsr & MSR_VEC) - save_altivec(tsk); - } + if (usermsr & MSR_FP) + save_fpu(tsk); + + if (usermsr & MSR_VEC) + save_altivec(tsk); if (usermsr & MSR_SPE) __giveup_spe(tsk); @@ -868,6 +874,10 @@ static void tm_reclaim_thread(struct thread_struct *thr, if (!MSR_TM_SUSPENDED(mfmsr())) return; + giveup_all(container_of(thr, struct task_struct, thread)); + + tm_reclaim(thr, cause); + /* * If we are in a transaction and FP is off then we can't have * used FP inside that transaction. Hence the checkpointed @@ -886,10 +896,6 @@ static void tm_reclaim_thread(struct thread_struct *thr, if ((thr->ckpt_regs.msr & MSR_VEC) == 0) memcpy(&thr->ckvr_state, &thr->vr_state, sizeof(struct thread_vr_state)); - - giveup_all(container_of(thr, struct task_struct, thread)); - - tm_reclaim(thr, thr->ckpt_regs.msr, cause); } void tm_reclaim_current(uint8_t cause) @@ -918,6 +924,8 @@ static inline void tm_reclaim_task(struct task_struct *tsk) if (!MSR_TM_ACTIVE(thr->regs->msr)) goto out_and_saveregs; + WARN_ON(tm_suspend_disabled); + TM_DEBUG("--- tm_reclaim on pid %d (NIP=%lx, " "ccr=%lx, msr=%lx, trap=%lx)\n", tsk->pid, thr->regs->nip, @@ -938,11 +946,9 @@ out_and_saveregs: tm_save_sprs(thr); } -extern void __tm_recheckpoint(struct thread_struct *thread, - unsigned long orig_msr); +extern void __tm_recheckpoint(struct thread_struct *thread); -void tm_recheckpoint(struct thread_struct *thread, - unsigned long orig_msr) +void tm_recheckpoint(struct thread_struct *thread) { unsigned long flags; @@ -961,15 +967,13 @@ void tm_recheckpoint(struct thread_struct *thread, */ tm_restore_sprs(thread); - __tm_recheckpoint(thread, orig_msr); + __tm_recheckpoint(thread); local_irq_restore(flags); } static inline void tm_recheckpoint_new_task(struct task_struct *new) { - unsigned long msr; - if (!cpu_has_feature(CPU_FTR_TM)) return; @@ -988,13 +992,11 @@ static inline void tm_recheckpoint_new_task(struct task_struct *new) tm_restore_sprs(&new->thread); return; } - msr = new->thread.ckpt_regs.msr; /* Recheckpoint to restore original checkpointed register state. */ - TM_DEBUG("*** tm_recheckpoint of pid %d " - "(new->msr 0x%lx, new->origmsr 0x%lx)\n", - new->pid, new->thread.regs->msr, msr); + TM_DEBUG("*** tm_recheckpoint of pid %d (new->msr 0x%lx)\n", + new->pid, new->thread.regs->msr); - tm_recheckpoint(&new->thread, msr); + tm_recheckpoint(&new->thread); /* * The checkpointed state has been restored but the live state has @@ -1134,6 +1136,10 @@ static inline void restore_sprs(struct thread_struct *old_thread, if (old_thread->tar != new_thread->tar) mtspr(SPRN_TAR, new_thread->tar); } + + if (cpu_has_feature(CPU_FTR_ARCH_300) && + old_thread->tidr != new_thread->tidr) + mtspr(SPRN_TIDR, new_thread->tidr); #endif } @@ -1170,7 +1176,7 @@ struct task_struct *__switch_to(struct task_struct *prev, } #endif /* CONFIG_PPC64 */ -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 batch = this_cpu_ptr(&ppc64_tlb_batch); if (batch->active) { current_thread_info()->local_flags |= _TLF_LAZY_MMU; @@ -1178,7 +1184,7 @@ struct task_struct *__switch_to(struct task_struct *prev, __flush_tlb_pending(batch); batch->active = 0; } -#endif /* CONFIG_PPC_STD_MMU_64 */ +#endif /* CONFIG_PPC_BOOK3S_64 */ #ifdef CONFIG_PPC_ADV_DEBUG_REGS switch_booke_debug_regs(&new->thread.debug); @@ -1224,7 +1230,7 @@ struct task_struct *__switch_to(struct task_struct *prev, last = _switch(old_thread, new_thread); -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 if (current_thread_info()->local_flags & _TLF_LAZY_MMU) { current_thread_info()->local_flags &= ~_TLF_LAZY_MMU; batch = this_cpu_ptr(&ppc64_tlb_batch); @@ -1238,22 +1244,22 @@ struct task_struct *__switch_to(struct task_struct *prev, * The copy-paste buffer can only store into foreign real * addresses, so unprivileged processes can not see the * data or use it in any way unless they have foreign real - * mappings. We don't have a VAS driver that allocates those - * yet, so no cpabort is required. + * mappings. If the new process has the foreign real address + * mappings, we must issue a cp_abort to clear any state and + * prevent snooping, corruption or a covert channel. + * + * DD1 allows paste into normal system memory so we do an + * unpaired copy, rather than cp_abort, to clear the buffer, + * since cp_abort is quite expensive. */ - if (cpu_has_feature(CPU_FTR_POWER9_DD1)) { - /* - * DD1 allows paste into normal system memory, so we - * do an unpaired copy here to clear the buffer and - * prevent a covert channel being set up. - * - * cpabort is not used because it is quite expensive. - */ + if (current_thread_info()->task->thread.used_vas) { + asm volatile(PPC_CP_ABORT); + } else if (cpu_has_feature(CPU_FTR_POWER9_DD1)) { asm volatile(PPC_COPY(%0, %1) : : "r"(dummy_copy_buffer), "r"(0)); } } -#endif /* CONFIG_PPC_STD_MMU_64 */ +#endif /* CONFIG_PPC_BOOK3S_64 */ return last; } @@ -1395,13 +1401,13 @@ void show_regs(struct pt_regs * regs) show_regs_print_info(KERN_DEFAULT); - printk("NIP: "REG" LR: "REG" CTR: "REG"\n", + printk("NIP: "REG" LR: "REG" CTR: "REG"\n", regs->nip, regs->link, regs->ctr); printk("REGS: %p TRAP: %04lx %s (%s)\n", regs, regs->trap, print_tainted(), init_utsname()->release); - printk("MSR: "REG" ", regs->msr); + printk("MSR: "REG" ", regs->msr); print_msr_bits(regs->msr); - printk(" CR: %08lx XER: %08lx\n", regs->ccr, regs->xer); + pr_cont(" CR: %08lx XER: %08lx\n", regs->ccr, regs->xer); trap = TRAP(regs); if ((regs->trap != 0xc00) && cpu_has_feature(CPU_FTR_CFAR)) pr_cont("CFAR: "REG" ", regs->orig_gpr3); @@ -1449,6 +1455,137 @@ void flush_thread(void) #endif /* CONFIG_HAVE_HW_BREAKPOINT */ } +int set_thread_uses_vas(void) +{ +#ifdef CONFIG_PPC_BOOK3S_64 + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + return -EINVAL; + + current->thread.used_vas = 1; + + /* + * Even a process that has no foreign real address mapping can use + * an unpaired COPY instruction (to no real effect). Issue CP_ABORT + * to clear any pending COPY and prevent a covert channel. + * + * __switch_to() will issue CP_ABORT on future context switches. + */ + asm volatile(PPC_CP_ABORT); + +#endif /* CONFIG_PPC_BOOK3S_64 */ + return 0; +} + +#ifdef CONFIG_PPC64 +static DEFINE_SPINLOCK(vas_thread_id_lock); +static DEFINE_IDA(vas_thread_ida); + +/* + * We need to assign a unique thread id to each thread in a process. + * + * This thread id, referred to as TIDR, and separate from the Linux's tgid, + * is intended to be used to direct an ASB_Notify from the hardware to the + * thread, when a suitable event occurs in the system. + * + * One such event is a "paste" instruction in the context of Fast Thread + * Wakeup (aka Core-to-core wake up in the Virtual Accelerator Switchboard + * (VAS) in POWER9. + * + * To get a unique TIDR per process we could simply reuse task_pid_nr() but + * the problem is that task_pid_nr() is not yet available copy_thread() is + * called. Fixing that would require changing more intrusive arch-neutral + * code in code path in copy_process()?. + * + * Further, to assign unique TIDRs within each process, we need an atomic + * field (or an IDR) in task_struct, which again intrudes into the arch- + * neutral code. So try to assign globally unique TIDRs for now. + * + * NOTE: TIDR 0 indicates that the thread does not need a TIDR value. + * For now, only threads that expect to be notified by the VAS + * hardware need a TIDR value and we assign values > 0 for those. + */ +#define MAX_THREAD_CONTEXT ((1 << 16) - 1) +static int assign_thread_tidr(void) +{ + int index; + int err; + +again: + if (!ida_pre_get(&vas_thread_ida, GFP_KERNEL)) + return -ENOMEM; + + spin_lock(&vas_thread_id_lock); + err = ida_get_new_above(&vas_thread_ida, 1, &index); + spin_unlock(&vas_thread_id_lock); + + if (err == -EAGAIN) + goto again; + else if (err) + return err; + + if (index > MAX_THREAD_CONTEXT) { + spin_lock(&vas_thread_id_lock); + ida_remove(&vas_thread_ida, index); + spin_unlock(&vas_thread_id_lock); + return -ENOMEM; + } + + return index; +} + +static void free_thread_tidr(int id) +{ + spin_lock(&vas_thread_id_lock); + ida_remove(&vas_thread_ida, id); + spin_unlock(&vas_thread_id_lock); +} + +/* + * Clear any TIDR value assigned to this thread. + */ +void clear_thread_tidr(struct task_struct *t) +{ + if (!t->thread.tidr) + return; + + if (!cpu_has_feature(CPU_FTR_ARCH_300)) { + WARN_ON_ONCE(1); + return; + } + + mtspr(SPRN_TIDR, 0); + free_thread_tidr(t->thread.tidr); + t->thread.tidr = 0; +} + +void arch_release_task_struct(struct task_struct *t) +{ + clear_thread_tidr(t); +} + +/* + * Assign a unique TIDR (thread id) for task @t and set it in the thread + * structure. For now, we only support setting TIDR for 'current' task. + */ +int set_thread_tidr(struct task_struct *t) +{ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + return -EINVAL; + + if (t != current) + return -EINVAL; + + t->thread.tidr = assign_thread_tidr(); + if (t->thread.tidr < 0) + return t->thread.tidr; + + mtspr(SPRN_TIDR, t->thread.tidr); + + return 0; +} + +#endif /* CONFIG_PPC64 */ + void release_thread(struct task_struct *t) { @@ -1482,7 +1619,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) static void setup_ksp_vsid(struct task_struct *p, unsigned long sp) { -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 unsigned long sp_vsid; unsigned long llp = mmu_psize_defs[mmu_linear_psize].sllp; @@ -1595,6 +1732,8 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, } if (cpu_has_feature(CPU_FTR_HAS_PPR)) p->thread.ppr = INIT_PPR; + + p->thread.tidr = 0; #endif kregs->nip = ppc_function_entry(f); return 0; @@ -1913,7 +2052,8 @@ unsigned long get_wchan(struct task_struct *p) do { sp = *(unsigned long *)sp; - if (!validate_sp(sp, p, STACK_FRAME_OVERHEAD)) + if (!validate_sp(sp, p, STACK_FRAME_OVERHEAD) || + p->state == TASK_RUNNING) return 0; if (count > 0) { ip = ((unsigned long *)sp)[STACK_FRAME_LR_SAVE]; @@ -1994,11 +2134,25 @@ void show_stack(struct task_struct *tsk, unsigned long *stack) void notrace __ppc64_runlatch_on(void) { struct thread_info *ti = current_thread_info(); - unsigned long ctrl; - ctrl = mfspr(SPRN_CTRLF); - ctrl |= CTRL_RUNLATCH; - mtspr(SPRN_CTRLT, ctrl); + if (cpu_has_feature(CPU_FTR_ARCH_206)) { + /* + * Least significant bit (RUN) is the only writable bit of + * the CTRL register, so we can avoid mfspr. 2.06 is not the + * earliest ISA where this is the case, but it's convenient. + */ + mtspr(SPRN_CTRLT, CTRL_RUNLATCH); + } else { + unsigned long ctrl; + + /* + * Some architectures (e.g., Cell) have writable fields other + * than RUN, so do the read-modify-write. + */ + ctrl = mfspr(SPRN_CTRLF); + ctrl |= CTRL_RUNLATCH; + mtspr(SPRN_CTRLT, ctrl); + } ti->local_flags |= _TLF_RUNLATCH; } @@ -2007,13 +2161,18 @@ void notrace __ppc64_runlatch_on(void) void notrace __ppc64_runlatch_off(void) { struct thread_info *ti = current_thread_info(); - unsigned long ctrl; ti->local_flags &= ~_TLF_RUNLATCH; - ctrl = mfspr(SPRN_CTRLF); - ctrl &= ~CTRL_RUNLATCH; - mtspr(SPRN_CTRLT, ctrl); + if (cpu_has_feature(CPU_FTR_ARCH_206)) { + mtspr(SPRN_CTRLT, 0); + } else { + unsigned long ctrl; + + ctrl = mfspr(SPRN_CTRLF); + ctrl &= ~CTRL_RUNLATCH; + mtspr(SPRN_CTRLT, ctrl); + } } #endif /* CONFIG_PPC64 */ @@ -2042,7 +2201,7 @@ unsigned long arch_randomize_brk(struct mm_struct *mm) unsigned long base = mm->brk; unsigned long ret; -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 /* * If we are using 1TB segments and we are allowed to randomise * the heap, we can put it above 1TB so it is backed by a 1TB diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index f83056297441..b15bae265c90 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -47,6 +47,7 @@ #include <asm/mmu.h> #include <asm/paca.h> #include <asm/pgtable.h> +#include <asm/powernv.h> #include <asm/iommu.h> #include <asm/btext.h> #include <asm/sections.h> @@ -228,7 +229,7 @@ static void __init check_cpu_pa_features(unsigned long node) ibm_pa_features, ARRAY_SIZE(ibm_pa_features)); } -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 static void __init init_mmu_slb_size(unsigned long node) { const __be32 *slb_size_ptr; @@ -658,6 +659,38 @@ static void __init early_reserve_mem(void) #endif } +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +static bool tm_disabled __initdata; + +static int __init parse_ppc_tm(char *str) +{ + bool res; + + if (kstrtobool(str, &res)) + return -EINVAL; + + tm_disabled = !res; + + return 0; +} +early_param("ppc_tm", parse_ppc_tm); + +static void __init tm_init(void) +{ + if (tm_disabled) { + pr_info("Disabling hardware transactional memory (HTM)\n"); + cur_cpu_spec->cpu_user_features2 &= + ~(PPC_FEATURE2_HTM_NOSC | PPC_FEATURE2_HTM); + cur_cpu_spec->cpu_features &= ~CPU_FTR_TM; + return; + } + + pnv_tm_init(); +} +#else +static void tm_init(void) { } +#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ + void __init early_init_devtree(void *params) { phys_addr_t limit; @@ -767,6 +800,8 @@ void __init early_init_devtree(void *params) powerpc_firmware_features |= FW_FEATURE_PS3_POSSIBLE; #endif + tm_init(); + DBG(" <- early_init_devtree()\n"); } diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index 613f79f03877..02190e90c7ae 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -177,6 +177,7 @@ struct platform_support { bool hash_mmu; bool radix_mmu; bool radix_gtse; + bool xive; }; /* Platforms codes are now obsolete in the kernel. Now only used within this @@ -1041,6 +1042,27 @@ static void __init prom_parse_mmu_model(u8 val, } } +static void __init prom_parse_xive_model(u8 val, + struct platform_support *support) +{ + switch (val) { + case OV5_FEAT(OV5_XIVE_EITHER): /* Either Available */ + prom_debug("XIVE - either mode supported\n"); + support->xive = true; + break; + case OV5_FEAT(OV5_XIVE_EXPLOIT): /* Only Exploitation mode */ + prom_debug("XIVE - exploitation mode supported\n"); + support->xive = true; + break; + case OV5_FEAT(OV5_XIVE_LEGACY): /* Only Legacy mode */ + prom_debug("XIVE - legacy mode supported\n"); + break; + default: + prom_debug("Unknown xive support option: 0x%x\n", val); + break; + } +} + static void __init prom_parse_platform_support(u8 index, u8 val, struct platform_support *support) { @@ -1054,6 +1076,10 @@ static void __init prom_parse_platform_support(u8 index, u8 val, support->radix_gtse = true; } break; + case OV5_INDX(OV5_XIVE_SUPPORT): /* Interrupt mode */ + prom_parse_xive_model(val & OV5_FEAT(OV5_XIVE_SUPPORT), + support); + break; } } @@ -1062,7 +1088,8 @@ static void __init prom_check_platform_support(void) struct platform_support supported = { .hash_mmu = false, .radix_mmu = false, - .radix_gtse = false + .radix_gtse = false, + .xive = false }; int prop_len = prom_getproplen(prom.chosen, "ibm,arch-vec-5-platform-support"); @@ -1095,6 +1122,11 @@ static void __init prom_check_platform_support(void) /* We're probably on a legacy hypervisor */ prom_debug("Assuming legacy hash support\n"); } + + if (supported.xive) { + prom_debug("Asking for XIVE\n"); + ibm_architecture_vec.vec5.intarch = OV5_FEAT(OV5_XIVE_EXPLOIT); + } } static void __init prom_send_capabilities(void) diff --git a/arch/powerpc/kernel/prom_parse.c b/arch/powerpc/kernel/prom_parse.c index 6295e646f78c..9cb7f88df563 100644 --- a/arch/powerpc/kernel/prom_parse.c +++ b/arch/powerpc/kernel/prom_parse.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #undef DEBUG #include <linux/kernel.h> diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c index 925a4ef90559..f52ad5bb7109 100644 --- a/arch/powerpc/kernel/ptrace.c +++ b/arch/powerpc/kernel/ptrace.c @@ -127,12 +127,19 @@ static void flush_tmregs_to_thread(struct task_struct *tsk) * If task is not current, it will have been flushed already to * it's thread_struct during __switch_to(). * - * A reclaim flushes ALL the state. + * A reclaim flushes ALL the state or if not in TM save TM SPRs + * in the appropriate thread structures from live. */ - if (tsk == current && MSR_TM_SUSPENDED(mfmsr())) - tm_reclaim_current(TM_CAUSE_SIGNAL); + if ((!cpu_has_feature(CPU_FTR_TM)) || (tsk != current)) + return; + if (MSR_TM_SUSPENDED(mfmsr())) { + tm_reclaim_current(TM_CAUSE_SIGNAL); + } else { + tm_enable(); + tm_save_sprs(&(tsk->thread)); + } } #else static inline void flush_tmregs_to_thread(struct task_struct *tsk) { } @@ -1587,11 +1594,8 @@ static int ppr_get(struct task_struct *target, unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf) { - int ret; - - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.ppr, 0, sizeof(u64)); - return ret; + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &target->thread.ppr, 0, sizeof(u64)); } static int ppr_set(struct task_struct *target, @@ -1599,11 +1603,8 @@ static int ppr_set(struct task_struct *target, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { - int ret; - - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &target->thread.ppr, 0, sizeof(u64)); - return ret; + return user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.ppr, 0, sizeof(u64)); } static int dscr_get(struct task_struct *target, @@ -1611,22 +1612,16 @@ static int dscr_get(struct task_struct *target, unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf) { - int ret; - - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.dscr, 0, sizeof(u64)); - return ret; + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &target->thread.dscr, 0, sizeof(u64)); } static int dscr_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { - int ret; - - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &target->thread.dscr, 0, sizeof(u64)); - return ret; + return user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.dscr, 0, sizeof(u64)); } #endif #ifdef CONFIG_PPC_BOOK3S_64 @@ -1635,22 +1630,16 @@ static int tar_get(struct task_struct *target, unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf) { - int ret; - - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.tar, 0, sizeof(u64)); - return ret; + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &target->thread.tar, 0, sizeof(u64)); } static int tar_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { - int ret; - - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &target->thread.tar, 0, sizeof(u64)); - return ret; + return user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.tar, 0, sizeof(u64)); } static int ebb_active(struct task_struct *target, diff --git a/arch/powerpc/kernel/reloc_64.S b/arch/powerpc/kernel/reloc_64.S index d88736fbece6..e8cfc69f59ae 100644 --- a/arch/powerpc/kernel/reloc_64.S +++ b/arch/powerpc/kernel/reloc_64.S @@ -82,7 +82,7 @@ _GLOBAL(relocate) 6: blr .balign 8 -p_dyn: .llong __dynamic_start - 0b -p_rela: .llong __rela_dyn_start - 0b -p_st: .llong _stext - 0b +p_dyn: .8byte __dynamic_start - 0b +p_rela: .8byte __rela_dyn_start - 0b +p_st: .8byte _stext - 0b diff --git a/arch/powerpc/kernel/rtas-proc.c b/arch/powerpc/kernel/rtas-proc.c index df56dfc4b681..c8c5f3a550c2 100644 --- a/arch/powerpc/kernel/rtas-proc.c +++ b/arch/powerpc/kernel/rtas-proc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2000 Tilmann Bitterberg * (tilmann@bitterberg.de) diff --git a/arch/powerpc/kernel/rtas-rtc.c b/arch/powerpc/kernel/rtas-rtc.c index c57c19358a26..49600985c7ef 100644 --- a/arch/powerpc/kernel/rtas-rtc.c +++ b/arch/powerpc/kernel/rtas-rtc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/time.h> #include <linux/timer.h> diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index b8a4987f58cf..3f1c4fcbe0aa 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -78,7 +78,7 @@ static unsigned long lock_rtas(void) local_irq_save(flags); preempt_disable(); - arch_spin_lock_flags(&rtas.lock, flags); + arch_spin_lock(&rtas.lock); return flags; } @@ -914,7 +914,7 @@ int rtas_online_cpus_mask(cpumask_var_t cpus) if (ret) { cpumask_var_t tmp_mask; - if (!alloc_cpumask_var(&tmp_mask, GFP_TEMPORARY)) + if (!alloc_cpumask_var(&tmp_mask, GFP_KERNEL)) return ret; /* Use tmp_mask to preserve cpus mask from first failure */ @@ -962,7 +962,7 @@ int rtas_ibm_suspend_me(u64 handle) return -EIO; } - if (!alloc_cpumask_var(&offline_mask, GFP_TEMPORARY)) + if (!alloc_cpumask_var(&offline_mask, GFP_KERNEL)) return -ENOMEM; atomic_set(&data.working, 0); diff --git a/arch/powerpc/kernel/rtas_pci.c b/arch/powerpc/kernel/rtas_pci.c index 73f1934582c2..c2b148b1634a 100644 --- a/arch/powerpc/kernel/rtas_pci.c +++ b/arch/powerpc/kernel/rtas_pci.c @@ -91,26 +91,14 @@ static int rtas_pci_read_config(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 *val) { - struct device_node *busdn, *dn; struct pci_dn *pdn; - bool found = false; int ret; - /* Search only direct children of the bus */ *val = 0xFFFFFFFF; - busdn = pci_bus_to_OF_node(bus); - for (dn = busdn->child; dn; dn = dn->sibling) { - pdn = PCI_DN(dn); - if (pdn && pdn->devfn == devfn - && of_device_is_available(dn)) { - found = true; - break; - } - } - if (!found) - return PCIBIOS_DEVICE_NOT_FOUND; + pdn = pci_get_pdn_by_devfn(bus, devfn); + /* Validity of pdn is checked in here */ ret = rtas_read_config(pdn, where, size, val); if (*val == EEH_IO_ERROR_VALUE(size) && eeh_dev_check_failure(pdn_to_eeh_dev(pdn))) @@ -153,24 +141,11 @@ static int rtas_pci_write_config(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 val) { - struct device_node *busdn, *dn; struct pci_dn *pdn; - bool found = false; - - /* Search only direct children of the bus */ - busdn = pci_bus_to_OF_node(bus); - for (dn = busdn->child; dn; dn = dn->sibling) { - pdn = PCI_DN(dn); - if (pdn && pdn->devfn == devfn - && of_device_is_available(dn)) { - found = true; - break; - } - } - if (!found) - return PCIBIOS_DEVICE_NOT_FOUND; + pdn = pci_get_pdn_by_devfn(bus, devfn); + /* Validity of pdn is checked in here. */ return rtas_write_config(pdn, where, size, val); } diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 94a948207cd2..2075322cd225 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -481,7 +481,7 @@ void __init smp_setup_cpu_maps(void) __be32 cpu_be; int j, len; - DBG(" * %s...\n", dn->full_name); + DBG(" * %pOF...\n", dn); intserv = of_get_property(dn, "ibm,ppc-interrupt-server#s", &len); @@ -551,7 +551,7 @@ void __init smp_setup_cpu_maps(void) if (maxcpus > nr_cpu_ids) { printk(KERN_WARNING "Partition configured for %d cpus, " - "operating system maximum is %d.\n", + "operating system maximum is %u.\n", maxcpus, nr_cpu_ids); maxcpus = nr_cpu_ids; } else @@ -704,30 +704,6 @@ int check_legacy_ioport(unsigned long base_port) } EXPORT_SYMBOL(check_legacy_ioport); -static int ppc_panic_event(struct notifier_block *this, - unsigned long event, void *ptr) -{ - /* - * If firmware-assisted dump has been registered then trigger - * firmware-assisted dump and let firmware handle everything else. - */ - crash_fadump(NULL, ptr); - ppc_md.panic(ptr); /* May not return */ - return NOTIFY_DONE; -} - -static struct notifier_block ppc_panic_block = { - .notifier_call = ppc_panic_event, - .priority = INT_MIN /* may not return; must be done last */ -}; - -void __init setup_panic(void) -{ - if (!ppc_md.panic) - return; - atomic_notifier_chain_register(&panic_notifier_list, &ppc_panic_block); -} - #ifdef CONFIG_CHECK_CACHE_COHERENCY /* * For platforms that have configurable cache-coherency. This function @@ -797,7 +773,7 @@ void arch_setup_pdev_archdata(struct platform_device *pdev) static __init void print_system_info(void) { pr_info("-----------------------------------------------------\n"); -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 pr_info("ppc64_pft_size = 0x%llx\n", ppc64_pft_size); #endif #ifdef CONFIG_PPC_STD_MMU_32 @@ -824,7 +800,7 @@ static __init void print_system_info(void) pr_info("firmware_features = 0x%016lx\n", powerpc_firmware_features); #endif -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 if (htab_address) pr_info("htab_address = 0x%p\n", htab_address); if (htab_hash_mask) @@ -872,9 +848,6 @@ void __init setup_arch(char **cmdline_p) /* Probe the machine type, establish ppc_md. */ probe_machine(); - /* Setup panic notifier if requested by the platform. */ - setup_panic(); - /* * Configure ppc_md.power_save (ppc32 only, 64-bit machines do * it from their respective probe() function. @@ -916,13 +889,6 @@ void __init setup_arch(char **cmdline_p) /* Reserve large chunks of memory for use by CMA for KVM. */ kvm_cma_reserve(); - /* - * Reserve any gigantic pages requested on the command line. - * memblock needs to have been initialized by the time this is - * called since this will reserve memory. - */ - reserve_hugetlb_gpages(); - klp_init_thread_info(&init_thread_info); init_mm.start_code = (unsigned long)_stext; @@ -932,15 +898,13 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_PPC_MM_SLICES #ifdef CONFIG_PPC64 - init_mm.context.addr_limit = DEFAULT_MAP_WINDOW_USER64; + if (!radix_enabled()) + init_mm.context.slb_addr_limit = DEFAULT_MAP_WINDOW_USER64; #else #error "context.addr_limit not initialized." #endif #endif -#ifdef CONFIG_PPC_64K_PAGES - init_mm.context.pte_frag = NULL; -#endif #ifdef CONFIG_SPAPR_TCE_IOMMU mm_iommu_init(&init_mm); #endif diff --git a/arch/powerpc/kernel/setup.h b/arch/powerpc/kernel/setup.h index cfba134b3024..21c18071d9d5 100644 --- a/arch/powerpc/kernel/setup.h +++ b/arch/powerpc/kernel/setup.h @@ -45,6 +45,12 @@ void emergency_stack_init(void); static inline void emergency_stack_init(void) { }; #endif +#ifdef CONFIG_PPC64 +void record_spr_defaults(void); +#else +static inline void record_spr_defaults(void) { }; +#endif + /* * Having this in kvm_ppc.h makes include dependencies too * tricky to solve for setup-common.c so have it here. diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c index 2f88f6cf1a42..51ebc01fff52 100644 --- a/arch/powerpc/kernel/setup_32.c +++ b/arch/powerpc/kernel/setup_32.c @@ -98,6 +98,9 @@ extern unsigned int memset_nocache_branch; /* Insn to be replaced by NOP */ notrace void __init machine_init(u64 dt_ptr) { + unsigned int *addr = &memset_nocache_branch; + unsigned long insn; + /* Configure static keys first, now that we're relocated. */ setup_feature_keys(); @@ -105,7 +108,9 @@ notrace void __init machine_init(u64 dt_ptr) udbg_early_init(); patch_instruction((unsigned int *)&memcpy, PPC_INST_NOP); - patch_instruction(&memset_nocache_branch, PPC_INST_NOP); + + insn = create_cond_branch(addr, branch_target(addr), 0x820000); + patch_instruction(addr, insn); /* replace b by bne cr0 */ /* Do some early initialization based on the flat device tree */ early_init_devtree(__va(dt_ptr)); diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index af23d4b576ec..8956a9856604 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -69,6 +69,8 @@ #include <asm/opal.h> #include <asm/cputhreads.h> +#include "setup.h" + #ifdef DEBUG #define DBG(fmt...) udbg_printf(fmt) #else @@ -317,6 +319,13 @@ void __init early_setup(unsigned long dt_ptr) early_init_mmu(); /* + * After firmware and early platform setup code has set things up, + * we note the SPR values for configurable control/performance + * registers, and use those as initial defaults. + */ + record_spr_defaults(); + + /* * At this point, we can let interrupts switch to virtual mode * (the MMU has been setup), so adjust the MSR in the PACA to * have IR and DR set and enable AIL if it exists @@ -360,8 +369,16 @@ void early_setup_secondary(void) #if defined(CONFIG_SMP) || defined(CONFIG_KEXEC_CORE) static bool use_spinloop(void) { - if (!IS_ENABLED(CONFIG_PPC_BOOK3E)) + if (IS_ENABLED(CONFIG_PPC_BOOK3S)) { + /* + * See comments in head_64.S -- not all platforms insert + * secondaries at __secondary_hold and wait at the spin + * loop. + */ + if (firmware_has_feature(FW_FEATURE_OPAL)) + return false; return true; + } /* * When book3e boots from kexec, the ePAPR spin table does @@ -564,6 +581,9 @@ static __init u64 safe_stack_limit(void) /* Other BookE, we assume the first GB is bolted */ return 1ul << 30; #else + if (early_radix_enabled()) + return ULONG_MAX; + /* BookS, the first segment is bolted */ if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) return 1UL << SID_SHIFT_1T; @@ -578,7 +598,8 @@ void __init irqstack_early_init(void) /* * Interrupt stacks must be in the first segment since we - * cannot afford to take SLB misses on them. + * cannot afford to take SLB misses on them. They are not + * accessed in realmode. */ for_each_possible_cpu(i) { softirq_ctx[i] = (struct thread_info *) @@ -649,8 +670,9 @@ void __init emergency_stack_init(void) * aligned. * * Since we use these as temporary stacks during secondary CPU - * bringup, we need to get at them in real mode. This means they - * must also be within the RMO region. + * bringup, machine check, system reset, and HMI, we need to get + * at them in real mode. This means they must also be within the RMO + * region. * * The IRQ stacks allocated elsewhere in this file are zeroed and * initialized in kernel/irq.c. These are initialized here in order @@ -751,3 +773,31 @@ unsigned long memory_block_size_bytes(void) struct ppc_pci_io ppc_pci_io; EXPORT_SYMBOL(ppc_pci_io); #endif + +#ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF +u64 hw_nmi_get_sample_period(int watchdog_thresh) +{ + return ppc_proc_freq * watchdog_thresh; +} +#endif + +/* + * The perf based hardlockup detector breaks PMU event based branches, so + * disable it by default. Book3S has a soft-nmi hardlockup detector based + * on the decrementer interrupt, so it does not suffer from this problem. + * + * It is likely to get false positives in VM guests, so disable it there + * by default too. + */ +static int __init disable_hardlockup_detector(void) +{ +#ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF + hardlockup_detector_disable(); +#else + if (firmware_has_feature(FW_FEATURE_LPAR)) + hardlockup_detector_disable(); +#endif + + return 0; +} +early_initcall(disable_hardlockup_detector); diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c index e9436c5e1e09..3d7539b90010 100644 --- a/arch/powerpc/kernel/signal.c +++ b/arch/powerpc/kernel/signal.c @@ -103,7 +103,7 @@ static void check_syscall_restart(struct pt_regs *regs, struct k_sigaction *ka, static void do_signal(struct task_struct *tsk) { sigset_t *oldset = sigmask_to_save(); - struct ksignal ksig; + struct ksignal ksig = { .sig = 0 }; int ret; int is32 = is_32bit_task(); diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c index 97bb1385e771..9ffd73296f64 100644 --- a/arch/powerpc/kernel/signal_32.c +++ b/arch/powerpc/kernel/signal_32.c @@ -94,40 +94,13 @@ */ static inline int put_sigset_t(compat_sigset_t __user *uset, sigset_t *set) { - compat_sigset_t cset; - - switch (_NSIG_WORDS) { - case 4: cset.sig[6] = set->sig[3] & 0xffffffffull; - cset.sig[7] = set->sig[3] >> 32; - case 3: cset.sig[4] = set->sig[2] & 0xffffffffull; - cset.sig[5] = set->sig[2] >> 32; - case 2: cset.sig[2] = set->sig[1] & 0xffffffffull; - cset.sig[3] = set->sig[1] >> 32; - case 1: cset.sig[0] = set->sig[0] & 0xffffffffull; - cset.sig[1] = set->sig[0] >> 32; - } - return copy_to_user(uset, &cset, sizeof(*uset)); + return put_compat_sigset(uset, set, sizeof(*uset)); } static inline int get_sigset_t(sigset_t *set, const compat_sigset_t __user *uset) { - compat_sigset_t s32; - - if (copy_from_user(&s32, uset, sizeof(*uset))) - return -EFAULT; - - /* - * Swap the 2 words of the 64-bit sigset_t (they are stored - * in the "wrong" endian in 32-bit user storage). - */ - switch (_NSIG_WORDS) { - case 4: set->sig[3] = s32.sig[6] | (((long)s32.sig[7]) << 32); - case 3: set->sig[2] = s32.sig[4] | (((long)s32.sig[5]) << 32); - case 2: set->sig[1] = s32.sig[2] | (((long)s32.sig[3]) << 32); - case 1: set->sig[0] = s32.sig[0] | (((long)s32.sig[1]) << 32); - } - return 0; + return get_compat_sigset(set, uset); } #define to_user_ptr(p) ptr_to_compat(p) @@ -519,6 +492,8 @@ static int save_tm_user_regs(struct pt_regs *regs, { unsigned long msr = regs->msr; + WARN_ON(tm_suspend_disabled); + /* Remove TM bits from thread's MSR. The MSR in the sigcontext * just indicates to userland that we were doing a transaction, but we * don't want to return in transactional state. This also ensures @@ -769,6 +744,8 @@ static long restore_tm_user_regs(struct pt_regs *regs, int i; #endif + if (tm_suspend_disabled) + return 1; /* * restore general registers but not including MSR or SOFTE. Also * take care of keeping r2 (TLS) intact if not a signal. @@ -876,7 +853,7 @@ static long restore_tm_user_regs(struct pt_regs *regs, /* Make sure the transaction is marked as failed */ current->thread.tm_texasr |= TEXASR_FS; /* This loads the checkpointed FP/VEC state, if used */ - tm_recheckpoint(¤t->thread, msr); + tm_recheckpoint(¤t->thread); /* This loads the speculative FP/VEC state, if used */ msr_check_and_set(msr & (MSR_FP | MSR_VEC)); @@ -913,42 +890,40 @@ int copy_siginfo_to_user32(struct compat_siginfo __user *d, const siginfo_t *s) */ err = __put_user(s->si_signo, &d->si_signo); err |= __put_user(s->si_errno, &d->si_errno); - err |= __put_user((short)s->si_code, &d->si_code); + err |= __put_user(s->si_code, &d->si_code); if (s->si_code < 0) err |= __copy_to_user(&d->_sifields._pad, &s->_sifields._pad, SI_PAD_SIZE32); - else switch(s->si_code >> 16) { - case __SI_CHLD >> 16: + else switch(siginfo_layout(s->si_signo, s->si_code)) { + case SIL_CHLD: err |= __put_user(s->si_pid, &d->si_pid); err |= __put_user(s->si_uid, &d->si_uid); err |= __put_user(s->si_utime, &d->si_utime); err |= __put_user(s->si_stime, &d->si_stime); err |= __put_user(s->si_status, &d->si_status); break; - case __SI_FAULT >> 16: + case SIL_FAULT: err |= __put_user((unsigned int)(unsigned long)s->si_addr, &d->si_addr); break; - case __SI_POLL >> 16: + case SIL_POLL: err |= __put_user(s->si_band, &d->si_band); err |= __put_user(s->si_fd, &d->si_fd); break; - case __SI_TIMER >> 16: + case SIL_TIMER: err |= __put_user(s->si_tid, &d->si_tid); err |= __put_user(s->si_overrun, &d->si_overrun); err |= __put_user(s->si_int, &d->si_int); break; - case __SI_SYS >> 16: + case SIL_SYS: err |= __put_user(ptr_to_compat(s->si_call_addr), &d->si_call_addr); err |= __put_user(s->si_syscall, &d->si_syscall); err |= __put_user(s->si_arch, &d->si_arch); break; - case __SI_RT >> 16: /* This is not generated by the kernel as of now. */ - case __SI_MESGQ >> 16: + case SIL_RT: err |= __put_user(s->si_int, &d->si_int); /* fallthrough */ - case __SI_KILL >> 16: - default: + case SIL_KILL: err |= __put_user(s->si_pid, &d->si_pid); err |= __put_user(s->si_uid, &d->si_uid); break; diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c index c83c115858c1..4b9ca3570344 100644 --- a/arch/powerpc/kernel/signal_64.c +++ b/arch/powerpc/kernel/signal_64.c @@ -214,6 +214,8 @@ static long setup_tm_sigcontexts(struct sigcontext __user *sc, BUG_ON(!MSR_TM_ACTIVE(regs->msr)); + WARN_ON(tm_suspend_disabled); + /* Remove TM bits from thread's MSR. The MSR in the sigcontext * just indicates to userland that we were doing a transaction, but we * don't want to return in transactional state. This also ensures @@ -430,6 +432,9 @@ static long restore_tm_sigcontexts(struct task_struct *tsk, BUG_ON(tsk != current); + if (tm_suspend_disabled) + return -EINVAL; + /* copy the GPRs */ err |= __copy_from_user(regs->gpr, tm_sc->gp_regs, sizeof(regs->gpr)); err |= __copy_from_user(&tsk->thread.ckpt_regs, sc->gp_regs, @@ -452,9 +457,20 @@ static long restore_tm_sigcontexts(struct task_struct *tsk, if (MSR_TM_RESV(msr)) return -EINVAL; - /* pull in MSR TM from user context */ + /* pull in MSR TS bits from user context */ regs->msr = (regs->msr & ~MSR_TS_MASK) | (msr & MSR_TS_MASK); + /* + * Ensure that TM is enabled in regs->msr before we leave the signal + * handler. It could be the case that (a) user disabled the TM bit + * through the manipulation of the MSR bits in uc_mcontext or (b) the + * TM bit was disabled because a sufficient number of context switches + * happened whilst in the signal handler and load_tm overflowed, + * disabling the TM bit. In either case we can end up with an illegal + * TM state leading to a TM Bad Thing when we return to userspace. + */ + regs->msr |= MSR_TM; + /* pull in MSR LE from user context */ regs->msr = (regs->msr & ~MSR_LE) | (msr & MSR_LE); @@ -547,7 +563,7 @@ static long restore_tm_sigcontexts(struct task_struct *tsk, /* Make sure the transaction is marked as failed */ tsk->thread.tm_texasr |= TEXASR_FS; /* This loads the checkpointed FP/VEC state, if used */ - tm_recheckpoint(&tsk->thread, msr); + tm_recheckpoint(&tsk->thread); msr_check_and_set(msr & (MSR_FP | MSR_VEC)); if (msr & MSR_FP) { diff --git a/arch/powerpc/kernel/smp-tbsync.c b/arch/powerpc/kernel/smp-tbsync.c index 7a37ecd3afa3..21c39355b25e 100644 --- a/arch/powerpc/kernel/smp-tbsync.c +++ b/arch/powerpc/kernel/smp-tbsync.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Smp timebase synchronization for ppc. * diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 997c88d54acf..e0a4c1f82e25 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -75,9 +75,11 @@ static DEFINE_PER_CPU(int, cpu_state) = { 0 }; struct thread_info *secondary_ti; DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map); +DEFINE_PER_CPU(cpumask_var_t, cpu_l2_cache_map); DEFINE_PER_CPU(cpumask_var_t, cpu_core_map); EXPORT_PER_CPU_SYMBOL(cpu_sibling_map); +EXPORT_PER_CPU_SYMBOL(cpu_l2_cache_map); EXPORT_PER_CPU_SYMBOL(cpu_core_map); /* SMP operations for this machine */ @@ -351,7 +353,7 @@ static void nmi_ipi_lock_start(unsigned long *flags) hard_irq_disable(); while (atomic_cmpxchg(&__nmi_ipi_lock, 0, 1) == 1) { raw_local_irq_restore(*flags); - cpu_relax(); + spin_until_cond(atomic_read(&__nmi_ipi_lock) == 0); raw_local_irq_save(*flags); hard_irq_disable(); } @@ -360,7 +362,7 @@ static void nmi_ipi_lock_start(unsigned long *flags) static void nmi_ipi_lock(void) { while (atomic_cmpxchg(&__nmi_ipi_lock, 0, 1) == 1) - cpu_relax(); + spin_until_cond(atomic_read(&__nmi_ipi_lock) == 0); } static void nmi_ipi_unlock(void) @@ -475,7 +477,7 @@ int smp_send_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us) nmi_ipi_lock_start(&flags); while (nmi_ipi_busy_count) { nmi_ipi_unlock_end(&flags); - cpu_relax(); + spin_until_cond(nmi_ipi_busy_count == 0); nmi_ipi_lock_start(&flags); } @@ -571,6 +573,26 @@ static void smp_store_cpu_info(int id) #endif } +/* + * Relationships between CPUs are maintained in a set of per-cpu cpumasks so + * rather than just passing around the cpumask we pass around a function that + * returns the that cpumask for the given CPU. + */ +static void set_cpus_related(int i, int j, struct cpumask *(*get_cpumask)(int)) +{ + cpumask_set_cpu(i, get_cpumask(j)); + cpumask_set_cpu(j, get_cpumask(i)); +} + +#ifdef CONFIG_HOTPLUG_CPU +static void set_cpus_unrelated(int i, int j, + struct cpumask *(*get_cpumask)(int)) +{ + cpumask_clear_cpu(i, get_cpumask(j)); + cpumask_clear_cpu(j, get_cpumask(i)); +} +#endif + void __init smp_prepare_cpus(unsigned int max_cpus) { unsigned int cpu; @@ -590,6 +612,8 @@ void __init smp_prepare_cpus(unsigned int max_cpus) for_each_possible_cpu(cpu) { zalloc_cpumask_var_node(&per_cpu(cpu_sibling_map, cpu), GFP_KERNEL, cpu_to_node(cpu)); + zalloc_cpumask_var_node(&per_cpu(cpu_l2_cache_map, cpu), + GFP_KERNEL, cpu_to_node(cpu)); zalloc_cpumask_var_node(&per_cpu(cpu_core_map, cpu), GFP_KERNEL, cpu_to_node(cpu)); /* @@ -602,7 +626,9 @@ void __init smp_prepare_cpus(unsigned int max_cpus) } } + /* Init the cpumasks so the boot CPU is related to itself */ cpumask_set_cpu(boot_cpuid, cpu_sibling_mask(boot_cpuid)); + cpumask_set_cpu(boot_cpuid, cpu_l2_cache_mask(boot_cpuid)); cpumask_set_cpu(boot_cpuid, cpu_core_mask(boot_cpuid)); if (smp_ops && smp_ops->probe) @@ -828,33 +854,6 @@ int cpu_first_thread_of_core(int core) } EXPORT_SYMBOL_GPL(cpu_first_thread_of_core); -static void traverse_siblings_chip_id(int cpu, bool add, int chipid) -{ - const struct cpumask *mask; - struct device_node *np; - int i, plen; - const __be32 *prop; - - mask = add ? cpu_online_mask : cpu_present_mask; - for_each_cpu(i, mask) { - np = of_get_cpu_node(i, NULL); - if (!np) - continue; - prop = of_get_property(np, "ibm,chip-id", &plen); - if (prop && plen == sizeof(int) && - of_read_number(prop, 1) == chipid) { - if (add) { - cpumask_set_cpu(cpu, cpu_core_mask(i)); - cpumask_set_cpu(i, cpu_core_mask(cpu)); - } else { - cpumask_clear_cpu(cpu, cpu_core_mask(i)); - cpumask_clear_cpu(i, cpu_core_mask(cpu)); - } - } - of_node_put(np); - } -} - /* Must be called when no change can occur to cpu_present_mask, * i.e. during cpu online or offline. */ @@ -877,52 +876,93 @@ static struct device_node *cpu_to_l2cache(int cpu) return cache; } -static void traverse_core_siblings(int cpu, bool add) +static bool update_mask_by_l2(int cpu, struct cpumask *(*mask_fn)(int)) { struct device_node *l2_cache, *np; - const struct cpumask *mask; - int i, chip, plen; - const __be32 *prop; - - /* First see if we have ibm,chip-id properties in cpu nodes */ - np = of_get_cpu_node(cpu, NULL); - if (np) { - chip = -1; - prop = of_get_property(np, "ibm,chip-id", &plen); - if (prop && plen == sizeof(int)) - chip = of_read_number(prop, 1); - of_node_put(np); - if (chip >= 0) { - traverse_siblings_chip_id(cpu, add, chip); - return; - } - } + int i; l2_cache = cpu_to_l2cache(cpu); - mask = add ? cpu_online_mask : cpu_present_mask; - for_each_cpu(i, mask) { + if (!l2_cache) + return false; + + for_each_cpu(i, cpu_online_mask) { + /* + * when updating the marks the current CPU has not been marked + * online, but we need to update the cache masks + */ np = cpu_to_l2cache(i); if (!np) continue; - if (np == l2_cache) { - if (add) { - cpumask_set_cpu(cpu, cpu_core_mask(i)); - cpumask_set_cpu(i, cpu_core_mask(cpu)); - } else { - cpumask_clear_cpu(cpu, cpu_core_mask(i)); - cpumask_clear_cpu(i, cpu_core_mask(cpu)); - } - } + + if (np == l2_cache) + set_cpus_related(cpu, i, mask_fn); + of_node_put(np); } of_node_put(l2_cache); + + return true; } +#ifdef CONFIG_HOTPLUG_CPU +static void remove_cpu_from_masks(int cpu) +{ + int i; + + /* NB: cpu_core_mask is a superset of the others */ + for_each_cpu(i, cpu_core_mask(cpu)) { + set_cpus_unrelated(cpu, i, cpu_core_mask); + set_cpus_unrelated(cpu, i, cpu_l2_cache_mask); + set_cpus_unrelated(cpu, i, cpu_sibling_mask); + } +} +#endif + +static void add_cpu_to_masks(int cpu) +{ + int first_thread = cpu_first_thread_sibling(cpu); + int chipid = cpu_to_chip_id(cpu); + int i; + + /* + * This CPU will not be in the online mask yet so we need to manually + * add it to it's own thread sibling mask. + */ + cpumask_set_cpu(cpu, cpu_sibling_mask(cpu)); + + for (i = first_thread; i < first_thread + threads_per_core; i++) + if (cpu_online(i)) + set_cpus_related(i, cpu, cpu_sibling_mask); + + /* + * Copy the thread sibling mask into the cache sibling mask + * and mark any CPUs that share an L2 with this CPU. + */ + for_each_cpu(i, cpu_sibling_mask(cpu)) + set_cpus_related(cpu, i, cpu_l2_cache_mask); + update_mask_by_l2(cpu, cpu_l2_cache_mask); + + /* + * Copy the cache sibling mask into core sibling mask and mark + * any CPUs on the same chip as this CPU. + */ + for_each_cpu(i, cpu_l2_cache_mask(cpu)) + set_cpus_related(cpu, i, cpu_core_mask); + + if (chipid == -1) + return; + + for_each_cpu(i, cpu_online_mask) + if (cpu_to_chip_id(i) == chipid) + set_cpus_related(cpu, i, cpu_core_mask); +} + +static bool shared_caches; + /* Activate a secondary processor. */ void start_secondary(void *unused) { unsigned int cpu = smp_processor_id(); - int i, base; mmgrab(&init_mm); current->active_mm = &init_mm; @@ -945,22 +985,15 @@ void start_secondary(void *unused) vdso_getcpu_init(); #endif - /* Update sibling maps */ - base = cpu_first_thread_sibling(cpu); - for (i = 0; i < threads_per_core; i++) { - if (cpu_is_offline(base + i) && (cpu != base + i)) - continue; - cpumask_set_cpu(cpu, cpu_sibling_mask(base + i)); - cpumask_set_cpu(base + i, cpu_sibling_mask(cpu)); + /* Update topology CPU masks */ + add_cpu_to_masks(cpu); - /* cpu_core_map should be a superset of - * cpu_sibling_map even if we don't have cache - * information, so update the former here, too. - */ - cpumask_set_cpu(cpu, cpu_core_mask(base + i)); - cpumask_set_cpu(base + i, cpu_core_mask(cpu)); - } - traverse_core_siblings(cpu, true); + /* + * Check for any shared caches. Note that this must be done on a + * per-core basis because one core in the pair might be disabled. + */ + if (!cpumask_equal(cpu_l2_cache_mask(cpu), cpu_sibling_mask(cpu))) + shared_caches = true; set_numa_node(numa_cpu_lookup_table[cpu]); set_numa_mem(local_memory_node(numa_cpu_lookup_table[cpu])); @@ -1003,35 +1036,65 @@ static struct sched_domain_topology_level powerpc_topology[] = { { NULL, }, }; -static __init long smp_setup_cpu_workfn(void *data __always_unused) +/* + * P9 has a slightly odd architecture where pairs of cores share an L2 cache. + * This topology makes it *much* cheaper to migrate tasks between adjacent cores + * since the migrated task remains cache hot. We want to take advantage of this + * at the scheduler level so an extra topology level is required. + */ +static int powerpc_shared_cache_flags(void) { - smp_ops->setup_cpu(boot_cpuid); - return 0; + return SD_SHARE_PKG_RESOURCES; } +/* + * We can't just pass cpu_l2_cache_mask() directly because + * returns a non-const pointer and the compiler barfs on that. + */ +static const struct cpumask *shared_cache_mask(int cpu) +{ + return cpu_l2_cache_mask(cpu); +} + +static struct sched_domain_topology_level power9_topology[] = { +#ifdef CONFIG_SCHED_SMT + { cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) }, +#endif + { shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) }, + { cpu_cpu_mask, SD_INIT_NAME(DIE) }, + { NULL, }, +}; + void __init smp_cpus_done(unsigned int max_cpus) { /* - * We want the setup_cpu() here to be called on the boot CPU, but - * init might run on any CPU, so make sure it's invoked on the boot - * CPU. + * We are running pinned to the boot CPU, see rest_init(). */ if (smp_ops && smp_ops->setup_cpu) - work_on_cpu_safe(boot_cpuid, smp_setup_cpu_workfn, NULL); + smp_ops->setup_cpu(boot_cpuid); if (smp_ops && smp_ops->bringup_done) smp_ops->bringup_done(); dump_numa_cpu_topology(); - set_sched_topology(powerpc_topology); + /* + * If any CPU detects that it's sharing a cache with another CPU then + * use the deeper topology that is aware of this sharing. + */ + if (shared_caches) { + pr_info("Using shared cache scheduler topology\n"); + set_sched_topology(power9_topology); + } else { + pr_info("Using standard scheduler topology\n"); + set_sched_topology(powerpc_topology); + } } #ifdef CONFIG_HOTPLUG_CPU int __cpu_disable(void) { int cpu = smp_processor_id(); - int base, i; int err; if (!smp_ops->cpu_disable) @@ -1042,14 +1105,7 @@ int __cpu_disable(void) return err; /* Update sibling maps */ - base = cpu_first_thread_sibling(cpu); - for (i = 0; i < threads_per_core && base + i < nr_cpu_ids; i++) { - cpumask_clear_cpu(cpu, cpu_sibling_mask(base + i)); - cpumask_clear_cpu(base + i, cpu_sibling_mask(cpu)); - cpumask_clear_cpu(cpu, cpu_core_mask(base + i)); - cpumask_clear_cpu(base + i, cpu_core_mask(cpu)); - } - traverse_core_siblings(cpu, false); + remove_cpu_from_masks(cpu); return 0; } diff --git a/arch/powerpc/kernel/swsusp_32.S b/arch/powerpc/kernel/swsusp_32.S index ba4dee3d233f..34b73a262709 100644 --- a/arch/powerpc/kernel/swsusp_32.S +++ b/arch/powerpc/kernel/swsusp_32.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #include <linux/threads.h> #include <asm/processor.h> #include <asm/page.h> diff --git a/arch/powerpc/kernel/swsusp_asm64.S b/arch/powerpc/kernel/swsusp_asm64.S index 988f38dced0f..82d8aae81c6a 100644 --- a/arch/powerpc/kernel/swsusp_asm64.S +++ b/arch/powerpc/kernel/swsusp_asm64.S @@ -179,7 +179,7 @@ nothing_to_copy: sld r3, r3, r0 li r0, 0 1: - dcbf r0,r3 + dcbf 0,r3 addi r3,r3,0x20 bdnz 1b diff --git a/arch/powerpc/kernel/swsusp_booke.S b/arch/powerpc/kernel/swsusp_booke.S index 553c1405ee05..88cfdbd530f1 100644 --- a/arch/powerpc/kernel/swsusp_booke.S +++ b/arch/powerpc/kernel/swsusp_booke.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Based on swsusp_32.S, modified for FSL BookE by * Anton Vorontsov <avorontsov@ru.mvista.com> diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c index 4437c70c7c2b..b8d4a1dac39f 100644 --- a/arch/powerpc/kernel/sysfs.c +++ b/arch/powerpc/kernel/sysfs.c @@ -590,6 +590,17 @@ static void sysfs_create_dscr_default(void) if (cpu_has_feature(CPU_FTR_DSCR)) err = device_create_file(cpu_subsys.dev_root, &dev_attr_dscr_default); } + +void __init record_spr_defaults(void) +{ + int cpu; + + if (cpu_has_feature(CPU_FTR_DSCR)) { + dscr_default = mfspr(SPRN_DSCR); + for (cpu = 0; cpu < nr_cpu_ids; cpu++) + paca[cpu].dscr_default = dscr_default; + } +} #endif /* CONFIG_PPC64 */ #ifdef HAS_PPC_PMC_PA6T diff --git a/arch/powerpc/kernel/systbl.S b/arch/powerpc/kernel/systbl.S index 4d6b1d3a747f..7ccb7f81f8db 100644 --- a/arch/powerpc/kernel/systbl.S +++ b/arch/powerpc/kernel/systbl.S @@ -17,13 +17,13 @@ #include <asm/ppc_asm.h> #ifdef CONFIG_PPC64 -#define SYSCALL(func) .llong DOTSYM(sys_##func),DOTSYM(sys_##func) -#define COMPAT_SYS(func) .llong DOTSYM(sys_##func),DOTSYM(compat_sys_##func) -#define PPC_SYS(func) .llong DOTSYM(ppc_##func),DOTSYM(ppc_##func) -#define OLDSYS(func) .llong DOTSYM(sys_ni_syscall),DOTSYM(sys_ni_syscall) -#define SYS32ONLY(func) .llong DOTSYM(sys_ni_syscall),DOTSYM(compat_sys_##func) -#define PPC64ONLY(func) .llong DOTSYM(ppc_##func),DOTSYM(sys_ni_syscall) -#define SYSX(f, f3264, f32) .llong DOTSYM(f),DOTSYM(f3264) +#define SYSCALL(func) .8byte DOTSYM(sys_##func),DOTSYM(sys_##func) +#define COMPAT_SYS(func) .8byte DOTSYM(sys_##func),DOTSYM(compat_sys_##func) +#define PPC_SYS(func) .8byte DOTSYM(ppc_##func),DOTSYM(ppc_##func) +#define OLDSYS(func) .8byte DOTSYM(sys_ni_syscall),DOTSYM(sys_ni_syscall) +#define SYS32ONLY(func) .8byte DOTSYM(sys_ni_syscall),DOTSYM(compat_sys_##func) +#define PPC64ONLY(func) .8byte DOTSYM(ppc_##func),DOTSYM(sys_ni_syscall) +#define SYSX(f, f3264, f32) .8byte DOTSYM(f),DOTSYM(f3264) #else #define SYSCALL(func) .long sys_##func #define COMPAT_SYS(func) .long sys_##func diff --git a/arch/powerpc/kernel/tau_6xx.c b/arch/powerpc/kernel/tau_6xx.c index a753b72efbc0..8cdd852aedd1 100644 --- a/arch/powerpc/kernel/tau_6xx.c +++ b/arch/powerpc/kernel/tau_6xx.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * temp.c Thermal management for cpu's with Thermal Assist Units * @@ -187,7 +188,7 @@ static void tau_timeout(void * info) local_irq_restore(flags); } -static void tau_timeout_smp(unsigned long unused) +static void tau_timeout_smp(struct timer_list *unused) { /* schedule ourselves to be run again */ @@ -229,8 +230,7 @@ int __init TAU_init(void) /* first, set up the window shrinking timer */ - init_timer(&tau_timer); - tau_timer.function = tau_timeout_smp; + timer_setup(&tau_timer, tau_timeout_smp, 0); tau_timer.expires = jiffies + shrink_timer; add_timer(&tau_timer); diff --git a/arch/powerpc/kernel/tm.S b/arch/powerpc/kernel/tm.S index c4ba37822ba0..b92ac8e711db 100644 --- a/arch/powerpc/kernel/tm.S +++ b/arch/powerpc/kernel/tm.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Transactional memory support routines to reclaim and recheckpoint * transactional process state. @@ -79,15 +80,12 @@ _GLOBAL(tm_abort) blr /* void tm_reclaim(struct thread_struct *thread, - * unsigned long orig_msr, * uint8_t cause) * * - Performs a full reclaim. This destroys outstanding * transactions and updates thread->regs.tm_ckpt_* with the * original checkpointed state. Note that thread->regs is * unchanged. - * - FP regs are written back to thread->transact_fpr before - * reclaiming. These are the transactional (current) versions. * * Purpose is to both abort transactions of, and preserve the state of, * a transactions at a context switch. We preserve/restore both sets of process @@ -98,9 +96,9 @@ _GLOBAL(tm_abort) * Call with IRQs off, stacks get all out of sync for some periods in here! */ _GLOBAL(tm_reclaim) - mfcr r6 + mfcr r5 mflr r0 - stw r6, 8(r1) + stw r5, 8(r1) std r0, 16(r1) std r2, STK_GOT(r1) stdu r1, -TM_FRAME_SIZE(r1) @@ -108,7 +106,6 @@ _GLOBAL(tm_reclaim) /* We've a struct pt_regs at [r1+STACK_FRAME_OVERHEAD]. */ std r3, STK_PARAM(R3)(r1) - std r4, STK_PARAM(R4)(r1) SAVE_NVGPRS(r1) /* We need to setup MSR for VSX register save instructions. */ @@ -138,8 +135,8 @@ _GLOBAL(tm_reclaim) std r1, PACAR1(r13) /* Clear MSR RI since we are about to change r1, EE is already off. */ - li r4, 0 - mtmsrd r4, 1 + li r5, 0 + mtmsrd r5, 1 /* * BE CAREFUL HERE: @@ -151,7 +148,7 @@ _GLOBAL(tm_reclaim) * to user register state. (FPRs, CCR etc. also!) * Use an sprg and a tm_scratch in the PACA to shuffle. */ - TRECLAIM(R5) /* Cause in r5 */ + TRECLAIM(R4) /* Cause in r4 */ /* ******************** GPRs ******************** */ /* Stash the checkpointed r13 away in the scratch SPR and get the real @@ -242,40 +239,30 @@ _GLOBAL(tm_reclaim) /* ******************** FPR/VR/VSRs ************ - * After reclaiming, capture the checkpointed FPRs/VRs /if used/. - * - * (If VSX used, FP and VMX are implied. Or, we don't need to look - * at MSR.VSX as copying FP regs if .FP, vector regs if .VMX covers it.) - * - * We're passed the thread's MSR as the second parameter + * After reclaiming, capture the checkpointed FPRs/VRs. * * We enabled VEC/FP/VSX in the msr above, so we can execute these * instructions! */ - ld r4, STK_PARAM(R4)(r1) /* Second parameter, MSR * */ mr r3, r12 - andis. r0, r4, MSR_VEC@h - beq dont_backup_vec + /* Altivec (VEC/VMX/VR)*/ addi r7, r3, THREAD_CKVRSTATE SAVE_32VRS(0, r6, r7) /* r6 scratch, r7 transact vr state */ mfvscr v0 li r6, VRSTATE_VSCR stvx v0, r7, r6 -dont_backup_vec: + + /* VRSAVE */ mfspr r0, SPRN_VRSAVE std r0, THREAD_CKVRSAVE(r3) - andi. r0, r4, MSR_FP - beq dont_backup_fp - + /* Floating Point (FP) */ addi r7, r3, THREAD_CKFPSTATE SAVE_32FPRS_VSRS(0, R6, R7) /* r6 scratch, r7 transact fp state */ - mffs fr0 stfd fr0,FPSTATE_FPSCR(r7) -dont_backup_fp: /* TM regs, incl TEXASR -- these live in thread_struct. Note they've * been updated by the treclaim, to explain to userland the failure @@ -343,22 +330,19 @@ _GLOBAL(__tm_recheckpoint) */ subi r7, r7, STACK_FRAME_OVERHEAD + /* We need to setup MSR for FP/VMX/VSX register save instructions. */ mfmsr r6 - /* R4 = original MSR to indicate whether thread used FP/Vector etc. */ - - /* Enable FP/vec in MSR if necessary! */ - lis r5, MSR_VEC@h + mr r5, r6 ori r5, r5, MSR_FP - and. r5, r4, r5 - beq restore_gprs /* if neither, skip both */ - +#ifdef CONFIG_ALTIVEC + oris r5, r5, MSR_VEC@h +#endif #ifdef CONFIG_VSX BEGIN_FTR_SECTION - oris r5, r5, MSR_VSX@h + oris r5,r5, MSR_VSX@h END_FTR_SECTION_IFSET(CPU_FTR_VSX) #endif - or r5, r6, r5 /* Set MSR.FP+.VSX/.VEC */ - mtmsr r5 + mtmsrd r5 #ifdef CONFIG_ALTIVEC /* @@ -367,28 +351,20 @@ _GLOBAL(__tm_recheckpoint) * thread.fp_state[] version holds the 'live' (transactional) * and will be loaded subsequently by any FPUnavailable trap. */ - andis. r0, r4, MSR_VEC@h - beq dont_restore_vec - addi r8, r3, THREAD_CKVRSTATE li r5, VRSTATE_VSCR lvx v0, r8, r5 mtvscr v0 REST_32VRS(0, r5, r8) /* r5 scratch, r8 ptr */ -dont_restore_vec: ld r5, THREAD_CKVRSAVE(r3) mtspr SPRN_VRSAVE, r5 #endif - andi. r0, r4, MSR_FP - beq dont_restore_fp - addi r8, r3, THREAD_CKFPSTATE lfd fr0, FPSTATE_FPSCR(r8) MTFSF_L(fr0) REST_32FPRS_VSRS(0, R4, R8) -dont_restore_fp: mtmsr r6 /* FP/Vec off again! */ restore_gprs: diff --git a/arch/powerpc/kernel/trace/Makefile b/arch/powerpc/kernel/trace/Makefile index 729dffc5f7bc..d22d8bafb643 100644 --- a/arch/powerpc/kernel/trace/Makefile +++ b/arch/powerpc/kernel/trace/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 # # Makefile for the powerpc trace subsystem # diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c index 32509de6ce4c..4741fe112f05 100644 --- a/arch/powerpc/kernel/trace/ftrace.c +++ b/arch/powerpc/kernel/trace/ftrace.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Code for replacing ftrace calls with jumps. * diff --git a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S index c98e90b4ea7b..3f3e81852422 100644 --- a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S +++ b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S @@ -110,9 +110,9 @@ ftrace_call: /* NIP has not been altered, skip over further checks */ beq 1f - /* Check if there is an active kprobe on us */ + /* Check if there is an active jprobe on us */ subi r3, r14, 4 - bl is_current_kprobe_addr + bl __is_active_jprobe nop /* @@ -181,34 +181,25 @@ _GLOBAL(ftrace_stub) * - we have no stack frame and can not allocate one * - LR points back to the original caller (in A) * - CTR holds the new NIP in C - * - r0 & r12 are free - * - * r0 can't be used as the base register for a DS-form load or store, so - * we temporarily shuffle r1 (stack pointer) into r0 and then put it back. + * - r0, r11 & r12 are free */ livepatch_handler: CURRENT_THREAD_INFO(r12, r1) - /* Save stack pointer into r0 */ - mr r0, r1 - /* Allocate 3 x 8 bytes */ - ld r1, TI_livepatch_sp(r12) - addi r1, r1, 24 - std r1, TI_livepatch_sp(r12) + ld r11, TI_livepatch_sp(r12) + addi r11, r11, 24 + std r11, TI_livepatch_sp(r12) /* Save toc & real LR on livepatch stack */ - std r2, -24(r1) + std r2, -24(r11) mflr r12 - std r12, -16(r1) + std r12, -16(r11) /* Store stack end marker */ lis r12, STACK_END_MAGIC@h ori r12, r12, STACK_END_MAGIC@l - std r12, -8(r1) - - /* Restore real stack pointer */ - mr r1, r0 + std r12, -8(r11) /* Put ctr in r12 for global entry and branch there */ mfctr r12 @@ -216,36 +207,30 @@ livepatch_handler: /* * Now we are returning from the patched function to the original - * caller A. We are free to use r0 and r12, and we can use r2 until we + * caller A. We are free to use r11, r12 and we can use r2 until we * restore it. */ CURRENT_THREAD_INFO(r12, r1) - /* Save stack pointer into r0 */ - mr r0, r1 - - ld r1, TI_livepatch_sp(r12) + ld r11, TI_livepatch_sp(r12) /* Check stack marker hasn't been trashed */ lis r2, STACK_END_MAGIC@h ori r2, r2, STACK_END_MAGIC@l - ld r12, -8(r1) + ld r12, -8(r11) 1: tdne r12, r2 EMIT_BUG_ENTRY 1b, __FILE__, __LINE__ - 1, 0 /* Restore LR & toc from livepatch stack */ - ld r12, -16(r1) + ld r12, -16(r11) mtlr r12 - ld r2, -24(r1) + ld r2, -24(r11) /* Pop livepatch stack frame */ - CURRENT_THREAD_INFO(r12, r0) - subi r1, r1, 24 - std r1, TI_livepatch_sp(r12) - - /* Restore real stack pointer */ - mr r1, r0 + CURRENT_THREAD_INFO(r12, r1) + subi r11, r11, 24 + std r11, TI_livepatch_sp(r12) /* Return to original caller of live patched function */ blr diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index bfcfd9ef09f2..f3eb61be0d30 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -37,6 +37,7 @@ #include <linux/kdebug.h> #include <linux/ratelimit.h> #include <linux/context_tracking.h> +#include <linux/smp.h> #include <asm/emulated_ops.h> #include <asm/pgtable.h> @@ -114,6 +115,28 @@ static void pmac_backlight_unblank(void) static inline void pmac_backlight_unblank(void) { } #endif +/* + * If oops/die is expected to crash the machine, return true here. + * + * This should not be expected to be 100% accurate, there may be + * notifiers registered or other unexpected conditions that may bring + * down the kernel. Or if the current process in the kernel is holding + * locks or has other critical state, the kernel may become effectively + * unusable anyway. + */ +bool die_will_crash(void) +{ + if (should_fadump_crash()) + return true; + if (kexec_should_crash(current)) + return true; + if (in_interrupt() || panic_on_oops || + !current->pid || is_global_init(current)) + return true; + + return false; +} + static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED; static int die_owner = -1; static unsigned int die_nest_count; @@ -162,21 +185,9 @@ static void oops_end(unsigned long flags, struct pt_regs *regs, crash_fadump(regs, "die oops"); - /* - * A system reset (0x100) is a request to dump, so we always send - * it through the crashdump code. - */ - if (kexec_should_crash(current) || (TRAP(regs) == 0x100)) { + if (kexec_should_crash(current)) crash_kexec(regs); - /* - * We aren't the primary crash CPU. We need to send it - * to a holding pattern to avoid it ending up in the panic - * code. - */ - crash_kexec_secondary(regs); - } - if (!signr) return; @@ -202,18 +213,25 @@ NOKPROBE_SYMBOL(oops_end); static int __die(const char *str, struct pt_regs *regs, long err) { printk("Oops: %s, sig: %ld [#%d]\n", str, err, ++die_counter); -#ifdef CONFIG_PREEMPT - printk("PREEMPT "); -#endif -#ifdef CONFIG_SMP - printk("SMP NR_CPUS=%d ", NR_CPUS); -#endif + + if (IS_ENABLED(CONFIG_CPU_LITTLE_ENDIAN)) + printk("LE "); + else + printk("BE "); + + if (IS_ENABLED(CONFIG_PREEMPT)) + pr_cont("PREEMPT "); + + if (IS_ENABLED(CONFIG_SMP)) + pr_cont("SMP NR_CPUS=%d ", NR_CPUS); + if (debug_pagealloc_enabled()) - printk("DEBUG_PAGEALLOC "); -#ifdef CONFIG_NUMA - printk("NUMA "); -#endif - printk("%s\n", ppc_md.name ? ppc_md.name : ""); + pr_cont("DEBUG_PAGEALLOC "); + + if (IS_ENABLED(CONFIG_NUMA)) + pr_cont("NUMA "); + + pr_cont("%s\n", ppc_md.name ? ppc_md.name : ""); if (notify_die(DIE_OOPS, str, regs, err, 255, SIGSEGV) == NOTIFY_STOP) return 1; @@ -288,23 +306,52 @@ void system_reset_exception(struct pt_regs *regs) if (!nested) nmi_enter(); + __this_cpu_inc(irq_stat.sreset_irqs); + /* See if any machine dependent calls */ if (ppc_md.system_reset_exception) { if (ppc_md.system_reset_exception(regs)) goto out; } - die("System Reset", regs, SIGABRT); + if (debugger(regs)) + goto out; + + /* + * A system reset is a request to dump, so we always send + * it through the crashdump code (if fadump or kdump are + * registered). + */ + crash_fadump(regs, "System Reset"); + + crash_kexec(regs); + + /* + * We aren't the primary crash CPU. We need to send it + * to a holding pattern to avoid it ending up in the panic + * code. + */ + crash_kexec_secondary(regs); + + /* + * No debugger or crash dump registered, print logs then + * panic. + */ + __die("System Reset", regs, SIGABRT); + + mdelay(2*MSEC_PER_SEC); /* Wait a little while for others to print */ + add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE); + nmi_panic(regs, "System Reset"); out: #ifdef CONFIG_PPC_BOOK3S_64 BUG_ON(get_paca()->in_nmi == 0); if (get_paca()->in_nmi > 1) - panic("Unrecoverable nested System Reset"); + nmi_panic(regs, "Unrecoverable nested System Reset"); #endif /* Must die if the interrupt is not recoverable */ if (!(regs->msr & MSR_RI)) - panic("Unrecoverable System Reset"); + nmi_panic(regs, "Unrecoverable System Reset"); if (!nested) nmi_exit(); @@ -312,39 +359,6 @@ out: /* What should we do here? We could issue a shutdown or hard reset. */ } -#ifdef CONFIG_PPC64 -/* - * This function is called in real mode. Strictly no printk's please. - * - * regs->nip and regs->msr contains srr0 and ssr1. - */ -long machine_check_early(struct pt_regs *regs) -{ - long handled = 0; - - __this_cpu_inc(irq_stat.mce_exceptions); - - if (cur_cpu_spec && cur_cpu_spec->machine_check_early) - handled = cur_cpu_spec->machine_check_early(regs); - return handled; -} - -long hmi_exception_realmode(struct pt_regs *regs) -{ - __this_cpu_inc(irq_stat.hmi_exceptions); - - wait_for_subcore_guest_exit(); - - if (ppc_md.hmi_exception_early) - ppc_md.hmi_exception_early(regs); - - wait_for_tb_resync(); - - return 0; -} - -#endif - /* * I/O accesses can cause machine checks on powermacs. * Check if the NIP corresponds to the address of a sync @@ -397,11 +411,6 @@ static inline int check_io_access(struct pt_regs *regs) /* On 4xx, the reason for the machine check or program exception is in the ESR. */ #define get_reason(regs) ((regs)->dsisr) -#ifndef CONFIG_FSL_BOOKE -#define get_mc_reason(regs) ((regs)->dsisr) -#else -#define get_mc_reason(regs) (mfspr(SPRN_MCSR)) -#endif #define REASON_FP ESR_FP #define REASON_ILLEGAL (ESR_PIL | ESR_PUO) #define REASON_PRIVILEGED ESR_PPR @@ -415,111 +424,21 @@ static inline int check_io_access(struct pt_regs *regs) /* On non-4xx, the reason for the machine check or program exception is in the MSR. */ #define get_reason(regs) ((regs)->msr) -#define get_mc_reason(regs) ((regs)->msr) -#define REASON_TM 0x200000 -#define REASON_FP 0x100000 -#define REASON_ILLEGAL 0x80000 -#define REASON_PRIVILEGED 0x40000 -#define REASON_TRAP 0x20000 +#define REASON_TM SRR1_PROGTM +#define REASON_FP SRR1_PROGFPE +#define REASON_ILLEGAL SRR1_PROGILL +#define REASON_PRIVILEGED SRR1_PROGPRIV +#define REASON_TRAP SRR1_PROGTRAP #define single_stepping(regs) ((regs)->msr & MSR_SE) #define clear_single_step(regs) ((regs)->msr &= ~MSR_SE) #endif -#if defined(CONFIG_4xx) -int machine_check_4xx(struct pt_regs *regs) -{ - unsigned long reason = get_mc_reason(regs); - - if (reason & ESR_IMCP) { - printk("Instruction"); - mtspr(SPRN_ESR, reason & ~ESR_IMCP); - } else - printk("Data"); - printk(" machine check in kernel mode.\n"); - - return 0; -} - -int machine_check_440A(struct pt_regs *regs) -{ - unsigned long reason = get_mc_reason(regs); - - printk("Machine check in kernel mode.\n"); - if (reason & ESR_IMCP){ - printk("Instruction Synchronous Machine Check exception\n"); - mtspr(SPRN_ESR, reason & ~ESR_IMCP); - } - else { - u32 mcsr = mfspr(SPRN_MCSR); - if (mcsr & MCSR_IB) - printk("Instruction Read PLB Error\n"); - if (mcsr & MCSR_DRB) - printk("Data Read PLB Error\n"); - if (mcsr & MCSR_DWB) - printk("Data Write PLB Error\n"); - if (mcsr & MCSR_TLBP) - printk("TLB Parity Error\n"); - if (mcsr & MCSR_ICP){ - flush_instruction_cache(); - printk("I-Cache Parity Error\n"); - } - if (mcsr & MCSR_DCSP) - printk("D-Cache Search Parity Error\n"); - if (mcsr & MCSR_DCFP) - printk("D-Cache Flush Parity Error\n"); - if (mcsr & MCSR_IMPE) - printk("Machine Check exception is imprecise\n"); - - /* Clear MCSR */ - mtspr(SPRN_MCSR, mcsr); - } - return 0; -} - -int machine_check_47x(struct pt_regs *regs) -{ - unsigned long reason = get_mc_reason(regs); - u32 mcsr; - - printk(KERN_ERR "Machine check in kernel mode.\n"); - if (reason & ESR_IMCP) { - printk(KERN_ERR - "Instruction Synchronous Machine Check exception\n"); - mtspr(SPRN_ESR, reason & ~ESR_IMCP); - return 0; - } - mcsr = mfspr(SPRN_MCSR); - if (mcsr & MCSR_IB) - printk(KERN_ERR "Instruction Read PLB Error\n"); - if (mcsr & MCSR_DRB) - printk(KERN_ERR "Data Read PLB Error\n"); - if (mcsr & MCSR_DWB) - printk(KERN_ERR "Data Write PLB Error\n"); - if (mcsr & MCSR_TLBP) - printk(KERN_ERR "TLB Parity Error\n"); - if (mcsr & MCSR_ICP) { - flush_instruction_cache(); - printk(KERN_ERR "I-Cache Parity Error\n"); - } - if (mcsr & MCSR_DCSP) - printk(KERN_ERR "D-Cache Search Parity Error\n"); - if (mcsr & PPC47x_MCSR_GPR) - printk(KERN_ERR "GPR Parity Error\n"); - if (mcsr & PPC47x_MCSR_FPR) - printk(KERN_ERR "FPR Parity Error\n"); - if (mcsr & PPC47x_MCSR_IPR) - printk(KERN_ERR "Machine Check exception is imprecise\n"); - - /* Clear MCSR */ - mtspr(SPRN_MCSR, mcsr); - - return 0; -} -#elif defined(CONFIG_E500) +#if defined(CONFIG_E500) int machine_check_e500mc(struct pt_regs *regs) { unsigned long mcsr = mfspr(SPRN_MCSR); + unsigned long pvr = mfspr(SPRN_PVR); unsigned long reason = mcsr; int recoverable = 1; @@ -561,8 +480,15 @@ int machine_check_e500mc(struct pt_regs *regs) * may still get logged and cause a machine check. We should * only treat the non-write shadow case as non-recoverable. */ - if (!(mfspr(SPRN_L1CSR2) & L1CSR2_DCWS)) - recoverable = 0; + /* On e6500 core, L1 DCWS (Data cache write shadow mode) bit + * is not implemented but L1 data cache always runs in write + * shadow mode. Hence on data cache parity errors HW will + * automatically invalidate the L1 Data Cache. + */ + if (PVR_VER(pvr) != PVR_VER_E6500) { + if (!(mfspr(SPRN_L1CSR2) & L1CSR2_DCWS)) + recoverable = 0; + } } if (reason & MCSR_L2MMU_MHIT) { @@ -618,7 +544,7 @@ silent_out: int machine_check_e500(struct pt_regs *regs) { - unsigned long reason = get_mc_reason(regs); + unsigned long reason = mfspr(SPRN_MCSR); if (reason & MCSR_BUS_RBERR) { if (fsl_rio_mcheck_exception(regs)) @@ -665,7 +591,7 @@ int machine_check_generic(struct pt_regs *regs) #elif defined(CONFIG_E200) int machine_check_e200(struct pt_regs *regs) { - unsigned long reason = get_mc_reason(regs); + unsigned long reason = mfspr(SPRN_MCSR); printk("Machine check in kernel mode.\n"); printk("Caused by (from MCSR=%lx): ", reason); @@ -687,35 +613,10 @@ int machine_check_e200(struct pt_regs *regs) return 0; } -#elif defined(CONFIG_PPC_8xx) -int machine_check_8xx(struct pt_regs *regs) -{ - unsigned long reason = get_mc_reason(regs); - - pr_err("Machine check in kernel mode.\n"); - pr_err("Caused by (from SRR1=%lx): ", reason); - if (reason & 0x40000000) - pr_err("Fetch error at address %lx\n", regs->nip); - else - pr_err("Data access error at address %lx\n", regs->dar); - -#ifdef CONFIG_PCI - /* the qspan pci read routines can cause machine checks -- Cort - * - * yuck !!! that totally needs to go away ! There are better ways - * to deal with that than having a wart in the mcheck handler. - * -- BenH - */ - bad_page_fault(regs, regs->dar, SIGBUS); - return 1; -#else - return 0; -#endif -} -#else +#elif defined(CONFIG_PPC32) int machine_check_generic(struct pt_regs *regs) { - unsigned long reason = get_mc_reason(regs); + unsigned long reason = regs->msr; printk("Machine check in kernel mode.\n"); printk("Caused by (from SRR1=%lx): ", reason); @@ -752,10 +653,14 @@ int machine_check_generic(struct pt_regs *regs) void machine_check_exception(struct pt_regs *regs) { - enum ctx_state prev_state = exception_enter(); int recover = 0; + bool nested = in_nmi(); + if (!nested) + nmi_enter(); - __this_cpu_inc(irq_stat.mce_exceptions); + /* 64s accounts the mce in machine_check_early when in HVMODE */ + if (!IS_ENABLED(CONFIG_PPC_BOOK3S_64) || !cpu_has_feature(CPU_FTR_HVMODE)) + __this_cpu_inc(irq_stat.mce_exceptions); add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); @@ -783,10 +688,11 @@ void machine_check_exception(struct pt_regs *regs) /* Must die if the interrupt is not recoverable */ if (!(regs->msr & MSR_RI)) - panic("Unrecoverable Machine check"); + nmi_panic(regs, "Unrecoverable Machine check"); bail: - exception_exit(prev_state); + if (!nested) + nmi_exit(); } void SMIException(struct pt_regs *regs) @@ -794,6 +700,187 @@ void SMIException(struct pt_regs *regs) die("System Management Interrupt", regs, SIGABRT); } +#ifdef CONFIG_VSX +static void p9_hmi_special_emu(struct pt_regs *regs) +{ + unsigned int ra, rb, t, i, sel, instr, rc; + const void __user *addr; + u8 vbuf[16], *vdst; + unsigned long ea, msr, msr_mask; + bool swap; + + if (__get_user_inatomic(instr, (unsigned int __user *)regs->nip)) + return; + + /* + * lxvb16x opcode: 0x7c0006d8 + * lxvd2x opcode: 0x7c000698 + * lxvh8x opcode: 0x7c000658 + * lxvw4x opcode: 0x7c000618 + */ + if ((instr & 0xfc00073e) != 0x7c000618) { + pr_devel("HMI vec emu: not vector CI %i:%s[%d] nip=%016lx" + " instr=%08x\n", + smp_processor_id(), current->comm, current->pid, + regs->nip, instr); + return; + } + + /* Grab vector registers into the task struct */ + msr = regs->msr; /* Grab msr before we flush the bits */ + flush_vsx_to_thread(current); + enable_kernel_altivec(); + + /* + * Is userspace running with a different endian (this is rare but + * not impossible) + */ + swap = (msr & MSR_LE) != (MSR_KERNEL & MSR_LE); + + /* Decode the instruction */ + ra = (instr >> 16) & 0x1f; + rb = (instr >> 11) & 0x1f; + t = (instr >> 21) & 0x1f; + if (instr & 1) + vdst = (u8 *)¤t->thread.vr_state.vr[t]; + else + vdst = (u8 *)¤t->thread.fp_state.fpr[t][0]; + + /* Grab the vector address */ + ea = regs->gpr[rb] + (ra ? regs->gpr[ra] : 0); + if (is_32bit_task()) + ea &= 0xfffffffful; + addr = (__force const void __user *)ea; + + /* Check it */ + if (!access_ok(VERIFY_READ, addr, 16)) { + pr_devel("HMI vec emu: bad access %i:%s[%d] nip=%016lx" + " instr=%08x addr=%016lx\n", + smp_processor_id(), current->comm, current->pid, + regs->nip, instr, (unsigned long)addr); + return; + } + + /* Read the vector */ + rc = 0; + if ((unsigned long)addr & 0xfUL) + /* unaligned case */ + rc = __copy_from_user_inatomic(vbuf, addr, 16); + else + __get_user_atomic_128_aligned(vbuf, addr, rc); + if (rc) { + pr_devel("HMI vec emu: page fault %i:%s[%d] nip=%016lx" + " instr=%08x addr=%016lx\n", + smp_processor_id(), current->comm, current->pid, + regs->nip, instr, (unsigned long)addr); + return; + } + + pr_devel("HMI vec emu: emulated vector CI %i:%s[%d] nip=%016lx" + " instr=%08x addr=%016lx\n", + smp_processor_id(), current->comm, current->pid, regs->nip, + instr, (unsigned long) addr); + + /* Grab instruction "selector" */ + sel = (instr >> 6) & 3; + + /* + * Check to make sure the facility is actually enabled. This + * could happen if we get a false positive hit. + * + * lxvd2x/lxvw4x always check MSR VSX sel = 0,2 + * lxvh8x/lxvb16x check MSR VSX or VEC depending on VSR used sel = 1,3 + */ + msr_mask = MSR_VSX; + if ((sel & 1) && (instr & 1)) /* lxvh8x & lxvb16x + VSR >= 32 */ + msr_mask = MSR_VEC; + if (!(msr & msr_mask)) { + pr_devel("HMI vec emu: MSR fac clear %i:%s[%d] nip=%016lx" + " instr=%08x msr:%016lx\n", + smp_processor_id(), current->comm, current->pid, + regs->nip, instr, msr); + return; + } + + /* Do logging here before we modify sel based on endian */ + switch (sel) { + case 0: /* lxvw4x */ + PPC_WARN_EMULATED(lxvw4x, regs); + break; + case 1: /* lxvh8x */ + PPC_WARN_EMULATED(lxvh8x, regs); + break; + case 2: /* lxvd2x */ + PPC_WARN_EMULATED(lxvd2x, regs); + break; + case 3: /* lxvb16x */ + PPC_WARN_EMULATED(lxvb16x, regs); + break; + } + +#ifdef __LITTLE_ENDIAN__ + /* + * An LE kernel stores the vector in the task struct as an LE + * byte array (effectively swapping both the components and + * the content of the components). Those instructions expect + * the components to remain in ascending address order, so we + * swap them back. + * + * If we are running a BE user space, the expectation is that + * of a simple memcpy, so forcing the emulation to look like + * a lxvb16x should do the trick. + */ + if (swap) + sel = 3; + + switch (sel) { + case 0: /* lxvw4x */ + for (i = 0; i < 4; i++) + ((u32 *)vdst)[i] = ((u32 *)vbuf)[3-i]; + break; + case 1: /* lxvh8x */ + for (i = 0; i < 8; i++) + ((u16 *)vdst)[i] = ((u16 *)vbuf)[7-i]; + break; + case 2: /* lxvd2x */ + for (i = 0; i < 2; i++) + ((u64 *)vdst)[i] = ((u64 *)vbuf)[1-i]; + break; + case 3: /* lxvb16x */ + for (i = 0; i < 16; i++) + vdst[i] = vbuf[15-i]; + break; + } +#else /* __LITTLE_ENDIAN__ */ + /* On a big endian kernel, a BE userspace only needs a memcpy */ + if (!swap) + sel = 3; + + /* Otherwise, we need to swap the content of the components */ + switch (sel) { + case 0: /* lxvw4x */ + for (i = 0; i < 4; i++) + ((u32 *)vdst)[i] = cpu_to_le32(((u32 *)vbuf)[i]); + break; + case 1: /* lxvh8x */ + for (i = 0; i < 8; i++) + ((u16 *)vdst)[i] = cpu_to_le16(((u16 *)vbuf)[i]); + break; + case 2: /* lxvd2x */ + for (i = 0; i < 2; i++) + ((u64 *)vdst)[i] = cpu_to_le64(((u64 *)vbuf)[i]); + break; + case 3: /* lxvb16x */ + memcpy(vdst, vbuf, 16); + break; + } +#endif /* !__LITTLE_ENDIAN__ */ + + /* Go to next instruction */ + regs->nip += 4; +} +#endif /* CONFIG_VSX */ + void handle_hmi_exception(struct pt_regs *regs) { struct pt_regs *old_regs; @@ -801,6 +888,21 @@ void handle_hmi_exception(struct pt_regs *regs) old_regs = set_irq_regs(regs); irq_enter(); +#ifdef CONFIG_VSX + /* Real mode flagged P9 special emu is needed */ + if (local_paca->hmi_p9_special_emu) { + local_paca->hmi_p9_special_emu = 0; + + /* + * We don't want to take page faults while doing the + * emulation, we just replay the instruction if necessary. + */ + pagefault_disable(); + p9_hmi_special_emu(regs); + pagefault_enable(); + } +#endif /* CONFIG_VSX */ + if (ppc_md.handle_hmi_exception) ppc_md.handle_hmi_exception(regs); @@ -1235,13 +1337,8 @@ void program_check_exception(struct pt_regs *regs) * - A treclaim is attempted when non transactional. * - A tend is illegally attempted. * - writing a TM SPR when transactional. - */ - if (!user_mode(regs) && - report_bug(regs->nip, regs) == BUG_TRAP_TYPE_WARN) { - regs->nip += 4; - goto bail; - } - /* If usermode caused this, it's done something illegal and + * + * If usermode caused this, it's done something illegal and * gets a SIGILL slap on the wrist. We call it an illegal * operand to distinguish from the instruction just being bad * (e.g. executing a 'tend' on a CPU without TM!); it's an @@ -1582,7 +1679,7 @@ void fp_unavailable_tm(struct pt_regs *regs) /* Reclaim didn't save out any FPRs to transact_fprs. */ /* Enable FP for the task: */ - regs->msr |= (MSR_FP | current->thread.fpexc_mode); + current->thread.load_fp = 1; /* This loads and recheckpoints the FP registers from * thread.fpr[]. They will remain in registers after the @@ -1590,15 +1687,7 @@ void fp_unavailable_tm(struct pt_regs *regs) * If VMX is in use, the VRs now hold checkpointed values, * so we don't want to load the VRs from the thread_struct. */ - tm_recheckpoint(¤t->thread, MSR_FP); - - /* If VMX is in use, get the transactional values back */ - if (regs->msr & MSR_VEC) { - msr_check_and_set(MSR_VEC); - load_vr_state(¤t->thread.vr_state); - /* At this point all the VSX state is loaded, so enable it */ - regs->msr |= MSR_VSX; - } + tm_recheckpoint(¤t->thread); } void altivec_unavailable_tm(struct pt_regs *regs) @@ -1611,21 +1700,13 @@ void altivec_unavailable_tm(struct pt_regs *regs) "MSR=%lx\n", regs->nip, regs->msr); tm_reclaim_current(TM_CAUSE_FAC_UNAV); - regs->msr |= MSR_VEC; - tm_recheckpoint(¤t->thread, MSR_VEC); + current->thread.load_vec = 1; + tm_recheckpoint(¤t->thread); current->thread.used_vr = 1; - - if (regs->msr & MSR_FP) { - msr_check_and_set(MSR_FP); - load_fp_state(¤t->thread.fp_state); - regs->msr |= MSR_VSX; - } } void vsx_unavailable_tm(struct pt_regs *regs) { - unsigned long orig_msr = regs->msr; - /* See the comments in fp_unavailable_tm(). This works similarly, * though we're loading both FP and VEC registers in here. * @@ -1639,29 +1720,13 @@ void vsx_unavailable_tm(struct pt_regs *regs) current->thread.used_vsr = 1; - /* If FP and VMX are already loaded, we have all the state we need */ - if ((orig_msr & (MSR_FP | MSR_VEC)) == (MSR_FP | MSR_VEC)) { - regs->msr |= MSR_VSX; - return; - } - /* This reclaims FP and/or VR regs if they're already enabled */ tm_reclaim_current(TM_CAUSE_FAC_UNAV); - regs->msr |= MSR_VEC | MSR_FP | current->thread.fpexc_mode | - MSR_VSX; - - /* This loads & recheckpoints FP and VRs; but we have - * to be sure not to overwrite previously-valid state. - */ - tm_recheckpoint(¤t->thread, regs->msr & ~orig_msr); - - msr_check_and_set(orig_msr & (MSR_FP | MSR_VEC)); + current->thread.load_vec = 1; + current->thread.load_fp = 1; - if (orig_msr & MSR_FP) - load_fp_state(¤t->thread.fp_state); - if (orig_msr & MSR_VEC) - load_vr_state(¤t->thread.vr_state); + tm_recheckpoint(¤t->thread); } #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ @@ -1672,24 +1737,6 @@ void performance_monitor_exception(struct pt_regs *regs) perf_irq(regs); } -#ifdef CONFIG_8xx -void SoftwareEmulation(struct pt_regs *regs) -{ - CHECK_FULL_REGS(regs); - - if (!user_mode(regs)) { - debugger(regs); - die("Kernel Mode Unimplemented Instruction or SW FPU Emulation", - regs, SIGFPE); - } - - if (!emulate_math(regs)) - return; - - _exception(SIGILL, regs, ILL_ILLOPC, regs->nip); -} -#endif /* CONFIG_8xx */ - #ifdef CONFIG_PPC_ADV_DEBUG_REGS static void handle_debug(struct pt_regs *regs, unsigned long debug_status) { @@ -2037,6 +2084,10 @@ struct ppc_emulated ppc_emulated = { WARN_EMULATED_SETUP(mfdscr), WARN_EMULATED_SETUP(mtdscr), WARN_EMULATED_SETUP(lq_stq), + WARN_EMULATED_SETUP(lxvw4x), + WARN_EMULATED_SETUP(lxvh8x), + WARN_EMULATED_SETUP(lxvd2x), + WARN_EMULATED_SETUP(lxvb16x), #endif }; diff --git a/arch/powerpc/kernel/uprobes.c b/arch/powerpc/kernel/uprobes.c index 003b20964ea0..5d105b8eeece 100644 --- a/arch/powerpc/kernel/uprobes.c +++ b/arch/powerpc/kernel/uprobes.c @@ -205,3 +205,12 @@ arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs return orig_ret_vaddr; } + +bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx, + struct pt_regs *regs) +{ + if (ctx == RP_CHECK_CHAIN_CALL) + return regs->gpr[1] <= ret->stack; + else + return regs->gpr[1] < ret->stack; +} diff --git a/arch/powerpc/kernel/vdso32/Makefile b/arch/powerpc/kernel/vdso32/Makefile index 78a7449bf489..b8c434d1d459 100644 --- a/arch/powerpc/kernel/vdso32/Makefile +++ b/arch/powerpc/kernel/vdso32/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 # List of files in the vdso, has to be asm only for now diff --git a/arch/powerpc/kernel/vdso32/gettimeofday.S b/arch/powerpc/kernel/vdso32/gettimeofday.S index 6b2b69616e77..769c2624e0a6 100644 --- a/arch/powerpc/kernel/vdso32/gettimeofday.S +++ b/arch/powerpc/kernel/vdso32/gettimeofday.S @@ -232,15 +232,9 @@ __do_get_tspec: lwz r6,(CFG_TB_ORIG_STAMP+4)(r9) /* Get a stable TB value */ -#ifdef CONFIG_8xx -2: mftbu r3 - mftbl r4 - mftbu r0 -#else -2: mfspr r3, SPRN_TBRU - mfspr r4, SPRN_TBRL - mfspr r0, SPRN_TBRU -#endif +2: MFTBU(r3) + MFTBL(r4) + MFTBU(r0) cmplw cr0,r3,r0 bne- 2b diff --git a/arch/powerpc/kernel/vdso32/vdso32.lds.S b/arch/powerpc/kernel/vdso32/vdso32.lds.S index e58ee10fa5c0..099a6db14e67 100644 --- a/arch/powerpc/kernel/vdso32/vdso32.lds.S +++ b/arch/powerpc/kernel/vdso32/vdso32.lds.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * This is the infamous ld script for the 32 bits vdso * library diff --git a/arch/powerpc/kernel/vdso32/vdso32_wrapper.S b/arch/powerpc/kernel/vdso32/vdso32_wrapper.S index 6ac107ac402a..3f5ef035b0a9 100644 --- a/arch/powerpc/kernel/vdso32/vdso32_wrapper.S +++ b/arch/powerpc/kernel/vdso32/vdso32_wrapper.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #include <linux/linkage.h> #include <asm/page.h> diff --git a/arch/powerpc/kernel/vdso64/Makefile b/arch/powerpc/kernel/vdso64/Makefile index 31107bf5a61f..69cecb346269 100644 --- a/arch/powerpc/kernel/vdso64/Makefile +++ b/arch/powerpc/kernel/vdso64/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 # List of files in the vdso, has to be asm only for now obj-vdso64 = sigtramp.o gettimeofday.o datapage.o cacheflush.o note.o getcpu.o diff --git a/arch/powerpc/kernel/vdso64/vdso64.lds.S b/arch/powerpc/kernel/vdso64/vdso64.lds.S index 64fb183a47c2..256fb9720298 100644 --- a/arch/powerpc/kernel/vdso64/vdso64.lds.S +++ b/arch/powerpc/kernel/vdso64/vdso64.lds.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * This is the infamous ld script for the 64 bits vdso * library diff --git a/arch/powerpc/kernel/vdso64/vdso64_wrapper.S b/arch/powerpc/kernel/vdso64/vdso64_wrapper.S index df60fca6a13d..1d56d81fe3b3 100644 --- a/arch/powerpc/kernel/vdso64/vdso64_wrapper.S +++ b/arch/powerpc/kernel/vdso64/vdso64_wrapper.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #include <linux/linkage.h> #include <asm/page.h> diff --git a/arch/powerpc/kernel/vecemu.c b/arch/powerpc/kernel/vecemu.c index 2d8f6d8ccafc..8812085883fd 100644 --- a/arch/powerpc/kernel/vecemu.c +++ b/arch/powerpc/kernel/vecemu.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Routines to emulate some Altivec/VMX instructions, specifically * those that can trap when given denormalized operands in Java mode. diff --git a/arch/powerpc/kernel/vector.S b/arch/powerpc/kernel/vector.S index 0c123f3406cd..f314fd475491 100644 --- a/arch/powerpc/kernel/vector.S +++ b/arch/powerpc/kernel/vector.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #include <asm/processor.h> #include <asm/ppc_asm.h> #include <asm/reg.h> diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index b1a250560198..0494e1566ee2 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifdef CONFIG_PPC64 #define PROVIDE32(x) PROVIDE(__unused__##x) #else @@ -8,7 +9,7 @@ #include <asm/cache.h> #include <asm/thread_info.h> -#ifdef CONFIG_STRICT_KERNEL_RWX +#if defined(CONFIG_STRICT_KERNEL_RWX) && !defined(CONFIG_PPC32) #define STRICT_ALIGN_SIZE (1 << 24) #else #define STRICT_ALIGN_SIZE PAGE_SIZE diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c index b67f8b03a32d..87da80ccced1 100644 --- a/arch/powerpc/kernel/watchdog.c +++ b/arch/powerpc/kernel/watchdog.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Watchdog support on powerpc systems. * @@ -71,15 +72,20 @@ static inline void wd_smp_lock(unsigned long *flags) * This may be called from low level interrupt handlers at some * point in future. */ - local_irq_save(*flags); - while (unlikely(test_and_set_bit_lock(0, &__wd_smp_lock))) - cpu_relax(); + raw_local_irq_save(*flags); + hard_irq_disable(); /* Make it soft-NMI safe */ + while (unlikely(test_and_set_bit_lock(0, &__wd_smp_lock))) { + raw_local_irq_restore(*flags); + spin_until_cond(!test_bit(0, &__wd_smp_lock)); + raw_local_irq_save(*flags); + hard_irq_disable(); + } } static inline void wd_smp_unlock(unsigned long *flags) { clear_bit_unlock(0, &__wd_smp_lock); - local_irq_restore(*flags); + raw_local_irq_restore(*flags); } static void wd_lockup_ipi(struct pt_regs *regs) @@ -92,14 +98,13 @@ static void wd_lockup_ipi(struct pt_regs *regs) else dump_stack(); - if (hardlockup_panic) - nmi_panic(regs, "Hard LOCKUP"); + /* Do not panic from here because that can recurse into NMI IPI layer */ } -static void set_cpu_stuck(int cpu, u64 tb) +static void set_cpumask_stuck(const struct cpumask *cpumask, u64 tb) { - cpumask_set_cpu(cpu, &wd_smp_cpus_stuck); - cpumask_clear_cpu(cpu, &wd_smp_cpus_pending); + cpumask_or(&wd_smp_cpus_stuck, &wd_smp_cpus_stuck, cpumask); + cpumask_andnot(&wd_smp_cpus_pending, &wd_smp_cpus_pending, cpumask); if (cpumask_empty(&wd_smp_cpus_pending)) { wd_smp_last_reset_tb = tb; cpumask_andnot(&wd_smp_cpus_pending, @@ -107,6 +112,10 @@ static void set_cpu_stuck(int cpu, u64 tb) &wd_smp_cpus_stuck); } } +static void set_cpu_stuck(int cpu, u64 tb) +{ + set_cpumask_stuck(cpumask_of(cpu), tb); +} static void watchdog_smp_panic(int cpu, u64 tb) { @@ -125,21 +134,22 @@ static void watchdog_smp_panic(int cpu, u64 tb) pr_emerg("Watchdog CPU:%d detected Hard LOCKUP other CPUS:%*pbl\n", cpu, cpumask_pr_args(&wd_smp_cpus_pending)); - /* - * Try to trigger the stuck CPUs. - */ - for_each_cpu(c, &wd_smp_cpus_pending) { - if (c == cpu) - continue; - smp_send_nmi_ipi(c, wd_lockup_ipi, 1000000); + if (!sysctl_hardlockup_all_cpu_backtrace) { + /* + * Try to trigger the stuck CPUs, unless we are going to + * get a backtrace on all of them anyway. + */ + for_each_cpu(c, &wd_smp_cpus_pending) { + if (c == cpu) + continue; + smp_send_nmi_ipi(c, wd_lockup_ipi, 1000000); + } + smp_flush_nmi_ipi(1000000); } - smp_flush_nmi_ipi(1000000); - /* Take the stuck CPU out of the watch group */ - for_each_cpu(c, &wd_smp_cpus_pending) - set_cpu_stuck(c, tb); + /* Take the stuck CPUs out of the watch group */ + set_cpumask_stuck(&wd_smp_cpus_pending, tb); -out: wd_smp_unlock(&flags); printk_safe_flush(); @@ -152,6 +162,11 @@ out: if (hardlockup_panic) nmi_panic(NULL, "Hard LOCKUP"); + + return; + +out: + wd_smp_unlock(&flags); } static void wd_smp_clear_cpu_pending(int cpu, u64 tb) @@ -204,6 +219,9 @@ void soft_nmi_interrupt(struct pt_regs *regs) return; nmi_enter(); + + __this_cpu_inc(irq_stat.soft_nmi_irqs); + tb = get_tb(); if (tb - per_cpu(wd_timer_tb, cpu) >= wd_panic_timeout_tb) { per_cpu(wd_timer_tb, cpu) = tb; @@ -246,9 +264,8 @@ static void wd_timer_reset(unsigned int cpu, struct timer_list *t) add_timer_on(t, cpu); } -static void wd_timer_fn(unsigned long data) +static void wd_timer_fn(struct timer_list *t) { - struct timer_list *t = this_cpu_ptr(&wd_timer); int cpu = smp_processor_id(); watchdog_timer_interrupt(cpu); @@ -258,9 +275,14 @@ static void wd_timer_fn(unsigned long data) void arch_touch_nmi_watchdog(void) { + unsigned long ticks = tb_ticks_per_usec * wd_timer_period_ms * 1000; int cpu = smp_processor_id(); + u64 tb = get_tb(); - watchdog_timer_interrupt(cpu); + if (tb - per_cpu(wd_timer_tb, cpu) >= ticks) { + per_cpu(wd_timer_tb, cpu) = tb; + wd_smp_clear_cpu_pending(cpu, tb); + } } EXPORT_SYMBOL(arch_touch_nmi_watchdog); @@ -270,7 +292,7 @@ static void start_watchdog_timer_on(unsigned int cpu) per_cpu(wd_timer_tb, cpu) = get_tb(); - setup_pinned_timer(t, wd_timer_fn, 0); + timer_setup(t, wd_timer_fn, TIMER_PINNED); wd_timer_reset(cpu, t); } @@ -283,6 +305,8 @@ static void stop_watchdog_timer_on(unsigned int cpu) static int start_wd_on_cpu(unsigned int cpu) { + unsigned long flags; + if (cpumask_test_cpu(cpu, &wd_cpus_enabled)) { WARN_ON(1); return 0; @@ -291,18 +315,17 @@ static int start_wd_on_cpu(unsigned int cpu) if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) return 0; - if (watchdog_suspended) - return 0; - if (!cpumask_test_cpu(cpu, &watchdog_cpumask)) return 0; + wd_smp_lock(&flags); cpumask_set_cpu(cpu, &wd_cpus_enabled); if (cpumask_weight(&wd_cpus_enabled) == 1) { cpumask_set_cpu(cpu, &wd_smp_cpus_pending); wd_smp_last_reset_tb = get_tb(); } - smp_wmb(); + wd_smp_unlock(&flags); + start_watchdog_timer_on(cpu); return 0; @@ -310,12 +333,17 @@ static int start_wd_on_cpu(unsigned int cpu) static int stop_wd_on_cpu(unsigned int cpu) { + unsigned long flags; + if (!cpumask_test_cpu(cpu, &wd_cpus_enabled)) return 0; /* Can happen in CPU unplug case */ stop_watchdog_timer_on(cpu); + wd_smp_lock(&flags); cpumask_clear_cpu(cpu, &wd_cpus_enabled); + wd_smp_unlock(&flags); + wd_smp_clear_cpu_pending(cpu, get_tb()); return 0; @@ -332,36 +360,39 @@ static void watchdog_calc_timeouts(void) wd_timer_period_ms = watchdog_thresh * 1000 * 2 / 5; } -void watchdog_nmi_reconfigure(void) +void watchdog_nmi_stop(void) { int cpu; - watchdog_calc_timeouts(); - for_each_cpu(cpu, &wd_cpus_enabled) stop_wd_on_cpu(cpu); +} + +void watchdog_nmi_start(void) +{ + int cpu; + watchdog_calc_timeouts(); for_each_cpu_and(cpu, cpu_online_mask, &watchdog_cpumask) start_wd_on_cpu(cpu); } /* - * This runs after lockup_detector_init() which sets up watchdog_cpumask. + * Invoked from core watchdog init. */ -static int __init powerpc_watchdog_init(void) +int __init watchdog_nmi_probe(void) { int err; - watchdog_calc_timeouts(); - - err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "powerpc/watchdog:online", - start_wd_on_cpu, stop_wd_on_cpu); - if (err < 0) + err = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, + "powerpc/watchdog:online", + start_wd_on_cpu, stop_wd_on_cpu); + if (err < 0) { pr_warn("Watchdog could not be initialized"); - + return err; + } return 0; } -arch_initcall(powerpc_watchdog_init); static void handle_backtrace_ipi(struct pt_regs *regs) { |

