diff options
Diffstat (limited to 'arch/powerpc/kernel')
103 files changed, 4345 insertions, 6006 deletions
diff --git a/arch/powerpc/kernel/.gitignore b/arch/powerpc/kernel/.gitignore index c5f676c3c224..67ebd3003c05 100644 --- a/arch/powerpc/kernel/.gitignore +++ b/arch/powerpc/kernel/.gitignore @@ -1 +1,2 @@ +prom_init_check vmlinux.lds diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 56dfa7a2a6f2..78a1b22d4fd8 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -5,9 +5,6 @@ CFLAGS_ptrace.o += -DUTS_MACHINE='"$(UTS_MACHINE)"' -# Disable clang warning for using setjmp without setjmp.h header -CFLAGS_crash.o += $(call cc-disable-warning, builtin-requires-header) - ifdef CONFIG_PPC64 CFLAGS_prom_init.o += $(NO_MINIMAL_TOC) endif @@ -22,6 +19,8 @@ CFLAGS_btext.o += $(DISABLE_LATENT_ENTROPY_PLUGIN) CFLAGS_prom.o += $(DISABLE_LATENT_ENTROPY_PLUGIN) CFLAGS_prom_init.o += $(call cc-option, -fno-stack-protector) +CFLAGS_prom_init.o += -DDISABLE_BRANCH_PROFILING +CFLAGS_prom_init.o += -ffreestanding ifdef CONFIG_FUNCTION_TRACER # Do not trace early boot code @@ -39,7 +38,6 @@ KASAN_SANITIZE_btext.o := n ifdef CONFIG_KASAN CFLAGS_early_32.o += -DDISABLE_BRANCH_PROFILING CFLAGS_cputable.o += -DDISABLE_BRANCH_PROFILING -CFLAGS_prom_init.o += -DDISABLE_BRANCH_PROFILING CFLAGS_btext.o += -DDISABLE_BRANCH_PROFILING endif @@ -52,7 +50,7 @@ obj-y := cputable.o ptrace.o syscalls.o \ of_platform.o prom_parse.o obj-$(CONFIG_PPC64) += setup_64.o sys_ppc32.o \ signal_64.o ptrace32.o \ - paca.o nvram_64.o firmware.o + paca.o nvram_64.o firmware.o note.o obj-$(CONFIG_VDSO32) += vdso32/ obj-$(CONFIG_PPC_WATCHDOG) += watchdog.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o @@ -64,8 +62,7 @@ obj-$(CONFIG_PPC_BOOK3E_64) += exceptions-64e.o idle_book3e.o obj-$(CONFIG_PPC_BARRIER_NOSPEC) += security.o obj-$(CONFIG_PPC64) += vdso64/ obj-$(CONFIG_ALTIVEC) += vecemu.o -obj-$(CONFIG_PPC_970_NAP) += idle_power4.o -obj-$(CONFIG_PPC_P7_NAP) += idle_book3s.o +obj-$(CONFIG_PPC_BOOK3S_IDLE) += idle_book3s.o procfs-y := proc_powerpc.o 
obj-$(CONFIG_PROC_FS) += $(procfs-y) rtaspci-$(CONFIG_PPC64)-$(CONFIG_PCI) := rtas_pci.o @@ -79,6 +76,7 @@ obj-$(CONFIG_EEH) += eeh.o eeh_pe.o eeh_dev.o eeh_cache.o \ obj-$(CONFIG_GENERIC_TBSYNC) += smp-tbsync.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_FA_DUMP) += fadump.o +obj-$(CONFIG_PRESERVE_FA_DUMP) += fadump.o ifdef CONFIG_PPC32 obj-$(CONFIG_E500) += idle_e500.o endif @@ -124,14 +122,6 @@ pci64-$(CONFIG_PPC64) += pci_dn.o pci-hotplug.o isa-bridge.o obj-$(CONFIG_PCI) += pci_$(BITS).o $(pci64-y) \ pci-common.o pci_of_scan.o obj-$(CONFIG_PCI_MSI) += msi.o -obj-$(CONFIG_KEXEC_CORE) += machine_kexec.o crash.o \ - machine_kexec_$(BITS).o -obj-$(CONFIG_KEXEC_FILE) += machine_kexec_file_$(BITS).o kexec_elf_$(BITS).o -ifdef CONFIG_HAVE_IMA_KEXEC -ifdef CONFIG_IMA -obj-y += ima_kexec.o -endif -endif obj-$(CONFIG_AUDIT) += audit.o obj64-$(CONFIG_AUDIT) += compat_audit.o @@ -155,17 +145,17 @@ endif obj-$(CONFIG_EPAPR_PARAVIRT) += epapr_paravirt.o epapr_hcalls.o obj-$(CONFIG_KVM_GUEST) += kvm.o kvm_emul.o +ifneq ($(CONFIG_PPC_POWERNV)$(CONFIG_PPC_SVM),) +obj-y += ucall.o +endif + +obj-$(CONFIG_PPC_SECURE_BOOT) += secure_boot.o ima_arch.o secvar-ops.o +obj-$(CONFIG_PPC_SECVAR_SYSFS) += secvar-sysfs.o # Disable GCOV, KCOV & sanitizers in odd or sensitive code GCOV_PROFILE_prom_init.o := n KCOV_INSTRUMENT_prom_init.o := n UBSAN_SANITIZE_prom_init.o := n -GCOV_PROFILE_machine_kexec_64.o := n -KCOV_INSTRUMENT_machine_kexec_64.o := n -UBSAN_SANITIZE_machine_kexec_64.o := n -GCOV_PROFILE_machine_kexec_32.o := n -KCOV_INSTRUMENT_machine_kexec_32.o := n -UBSAN_SANITIZE_machine_kexec_32.o := n GCOV_PROFILE_kprobes.o := n KCOV_INSTRUMENT_kprobes.o := n UBSAN_SANITIZE_kprobes.o := n @@ -184,15 +174,13 @@ extra-$(CONFIG_ALTIVEC) += vector.o extra-$(CONFIG_PPC64) += entry_64.o extra-$(CONFIG_PPC_OF_BOOT_TRAMPOLINE) += prom_init.o -ifdef CONFIG_PPC_OF_BOOT_TRAMPOLINE -$(obj)/built-in.a: prom_init_check +extra-$(CONFIG_PPC_OF_BOOT_TRAMPOLINE) += prom_init_check 
-quiet_cmd_prom_init_check = CALL $< - cmd_prom_init_check = $(CONFIG_SHELL) $< "$(NM)" "$(obj)/prom_init.o" +quiet_cmd_prom_init_check = PROMCHK $@ + cmd_prom_init_check = $(CONFIG_SHELL) $< "$(NM)" $(obj)/prom_init.o; touch $@ -PHONY += prom_init_check -prom_init_check: $(src)/prom_init_check.sh $(obj)/prom_init.o - $(call cmd,prom_init_check) -endif +$(obj)/prom_init_check: $(src)/prom_init_check.sh $(obj)/prom_init.o FORCE + $(call if_changed,prom_init_check) +targets += prom_init_check clean-files := vmlinux.lds diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 4ccb6b3a7fbd..c25e562f1cd9 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -127,6 +127,12 @@ int main(void) OFFSET(KSP_VSID, thread_struct, ksp_vsid); #else /* CONFIG_PPC64 */ OFFSET(PGDIR, thread_struct, pgdir); +#ifdef CONFIG_VMAP_STACK + OFFSET(SRR0, thread_struct, srr0); + OFFSET(SRR1, thread_struct, srr1); + OFFSET(DAR, thread_struct, dar); + OFFSET(DSISR, thread_struct, dsisr); +#endif #ifdef CONFIG_SPE OFFSET(THREAD_EVR0, thread_struct, evr[0]); OFFSET(THREAD_ACC, thread_struct, acc); @@ -385,28 +391,25 @@ int main(void) OFFSET(CFG_SYSCALL_MAP32, vdso_data, syscall_map_32); OFFSET(WTOM_CLOCK_SEC, vdso_data, wtom_clock_sec); OFFSET(WTOM_CLOCK_NSEC, vdso_data, wtom_clock_nsec); - OFFSET(STAMP_XTIME, vdso_data, stamp_xtime); + OFFSET(STAMP_XTIME_SEC, vdso_data, stamp_xtime_sec); + OFFSET(STAMP_XTIME_NSEC, vdso_data, stamp_xtime_nsec); OFFSET(STAMP_SEC_FRAC, vdso_data, stamp_sec_fraction); + OFFSET(CLOCK_HRTIMER_RES, vdso_data, hrtimer_res); +#ifdef CONFIG_PPC64 OFFSET(CFG_ICACHE_BLOCKSZ, vdso_data, icache_block_size); OFFSET(CFG_DCACHE_BLOCKSZ, vdso_data, dcache_block_size); OFFSET(CFG_ICACHE_LOGBLOCKSZ, vdso_data, icache_log_block_size); OFFSET(CFG_DCACHE_LOGBLOCKSZ, vdso_data, dcache_log_block_size); -#ifdef CONFIG_PPC64 OFFSET(CFG_SYSCALL_MAP64, vdso_data, syscall_map_64); - OFFSET(TVAL64_TV_SEC, timeval, tv_sec); - 
OFFSET(TVAL64_TV_USEC, timeval, tv_usec); + OFFSET(TVAL64_TV_SEC, __kernel_old_timeval, tv_sec); + OFFSET(TVAL64_TV_USEC, __kernel_old_timeval, tv_usec); +#endif + OFFSET(TSPC64_TV_SEC, __kernel_timespec, tv_sec); + OFFSET(TSPC64_TV_NSEC, __kernel_timespec, tv_nsec); OFFSET(TVAL32_TV_SEC, old_timeval32, tv_sec); OFFSET(TVAL32_TV_USEC, old_timeval32, tv_usec); - OFFSET(TSPC64_TV_SEC, timespec, tv_sec); - OFFSET(TSPC64_TV_NSEC, timespec, tv_nsec); OFFSET(TSPC32_TV_SEC, old_timespec32, tv_sec); OFFSET(TSPC32_TV_NSEC, old_timespec32, tv_nsec); -#else - OFFSET(TVAL32_TV_SEC, timeval, tv_sec); - OFFSET(TVAL32_TV_USEC, timeval, tv_usec); - OFFSET(TSPC32_TV_SEC, timespec, tv_sec); - OFFSET(TSPC32_TV_NSEC, timespec, tv_nsec); -#endif /* timeval/timezone offsets for use by vdso */ OFFSET(TZONE_TZ_MINWEST, timezone, tz_minuteswest); OFFSET(TZONE_TZ_DSTTIME, timezone, tz_dsttime); @@ -416,8 +419,10 @@ int main(void) DEFINE(CLOCK_MONOTONIC, CLOCK_MONOTONIC); DEFINE(CLOCK_REALTIME_COARSE, CLOCK_REALTIME_COARSE); DEFINE(CLOCK_MONOTONIC_COARSE, CLOCK_MONOTONIC_COARSE); + DEFINE(CLOCK_MAX, CLOCK_TAI); DEFINE(NSEC_PER_SEC, NSEC_PER_SEC); - DEFINE(CLOCK_REALTIME_RES, MONOTONIC_RES_NSEC); + DEFINE(EINVAL, EINVAL); + DEFINE(KTIME_LOW_RES, KTIME_LOW_RES); #ifdef CONFIG_BUG DEFINE(BUG_ENTRY_SIZE, sizeof(struct bug_entry)); @@ -506,6 +511,7 @@ int main(void) OFFSET(KVM_VRMA_SLB_V, kvm, arch.vrma_slb_v); OFFSET(KVM_RADIX, kvm, arch.radix); OFFSET(KVM_FWNMI, kvm, arch.fwnmi_enabled); + OFFSET(KVM_SECURE_GUEST, kvm, arch.secure_guest); OFFSET(VCPU_DSISR, kvm_vcpu, arch.shregs.dsisr); OFFSET(VCPU_DAR, kvm_vcpu, arch.shregs.dar); OFFSET(VCPU_VPA, kvm_vcpu, arch.vpa.pinned_addr); diff --git a/arch/powerpc/kernel/cpu_setup_fsl_booke.S b/arch/powerpc/kernel/cpu_setup_fsl_booke.S index 2b4f3ec0acf7..1d308780e0d3 100644 --- a/arch/powerpc/kernel/cpu_setup_fsl_booke.S +++ b/arch/powerpc/kernel/cpu_setup_fsl_booke.S @@ -231,7 +231,7 @@ _GLOBAL(__setup_cpu_e5500) blr #endif -/* flush L1 date cache, it 
can apply to e500v2, e500mc and e5500 */ +/* flush L1 data cache, it can apply to e500v2, e500mc and e5500 */ _GLOBAL(flush_dcache_L1) mfmsr r10 wrteei 0 diff --git a/arch/powerpc/kernel/cpu_setup_power.S b/arch/powerpc/kernel/cpu_setup_power.S index 3239a9fe6c1c..a460298c7ddb 100644 --- a/arch/powerpc/kernel/cpu_setup_power.S +++ b/arch/powerpc/kernel/cpu_setup_power.S @@ -23,6 +23,7 @@ _GLOBAL(__setup_cpu_power7) beqlr li r0,0 mtspr SPRN_LPID,r0 + LOAD_REG_IMMEDIATE(r0, PCR_MASK) mtspr SPRN_PCR,r0 mfspr r3,SPRN_LPCR li r4,(LPCR_LPES1 >> LPCR_LPES_SH) @@ -37,6 +38,7 @@ _GLOBAL(__restore_cpu_power7) beqlr li r0,0 mtspr SPRN_LPID,r0 + LOAD_REG_IMMEDIATE(r0, PCR_MASK) mtspr SPRN_PCR,r0 mfspr r3,SPRN_LPCR li r4,(LPCR_LPES1 >> LPCR_LPES_SH) @@ -54,6 +56,7 @@ _GLOBAL(__setup_cpu_power8) beqlr li r0,0 mtspr SPRN_LPID,r0 + LOAD_REG_IMMEDIATE(r0, PCR_MASK) mtspr SPRN_PCR,r0 mfspr r3,SPRN_LPCR ori r3, r3, LPCR_PECEDH @@ -76,6 +79,7 @@ _GLOBAL(__restore_cpu_power8) beqlr li r0,0 mtspr SPRN_LPID,r0 + LOAD_REG_IMMEDIATE(r0, PCR_MASK) mtspr SPRN_PCR,r0 mfspr r3,SPRN_LPCR ori r3, r3, LPCR_PECEDH @@ -98,6 +102,7 @@ _GLOBAL(__setup_cpu_power9) mtspr SPRN_PSSCR,r0 mtspr SPRN_LPID,r0 mtspr SPRN_PID,r0 + LOAD_REG_IMMEDIATE(r0, PCR_MASK) mtspr SPRN_PCR,r0 mfspr r3,SPRN_LPCR LOAD_REG_IMMEDIATE(r4, LPCR_PECEDH | LPCR_PECE_HVEE | LPCR_HVICE | LPCR_HEIC) @@ -123,6 +128,7 @@ _GLOBAL(__restore_cpu_power9) mtspr SPRN_PSSCR,r0 mtspr SPRN_LPID,r0 mtspr SPRN_PID,r0 + LOAD_REG_IMMEDIATE(r0, PCR_MASK) mtspr SPRN_PCR,r0 mfspr r3,SPRN_LPCR LOAD_REG_IMMEDIATE(r4, LPCR_PECEDH | LPCR_PECE_HVEE | LPCR_HVICE | LPCR_HEIC) diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c index bfe5f4a2886b..e745abc5457a 100644 --- a/arch/powerpc/kernel/cputable.c +++ b/arch/powerpc/kernel/cputable.c @@ -569,7 +569,7 @@ static struct cpu_spec __initdata cpu_specs[] = { #endif /* CONFIG_PPC_BOOK3S_64 */ #ifdef CONFIG_PPC32 -#ifdef CONFIG_PPC_BOOK3S_32 +#ifdef CONFIG_PPC_BOOK3S_601 { /* 601 */ 
.pvr_mask = 0xffff0000, .pvr_value = 0x00010000, @@ -583,6 +583,8 @@ static struct cpu_spec __initdata cpu_specs[] = { .machine_check = machine_check_generic, .platform = "ppc601", }, +#endif /* CONFIG_PPC_BOOK3S_601 */ +#ifdef CONFIG_PPC_BOOK3S_6xx { /* 603 */ .pvr_mask = 0xffff0000, .pvr_value = 0x00030000, @@ -1212,7 +1214,7 @@ static struct cpu_spec __initdata cpu_specs[] = { .machine_check = machine_check_generic, .platform = "ppc603", }, -#endif /* CONFIG_PPC_BOOK3S_32 */ +#endif /* CONFIG_PPC_BOOK3S_6xx */ #ifdef CONFIG_PPC_8xx { /* 8xx */ .pvr_mask = 0xffff0000, diff --git a/arch/powerpc/kernel/crash.c b/arch/powerpc/kernel/crash.c deleted file mode 100644 index d488311efab1..000000000000 --- a/arch/powerpc/kernel/crash.c +++ /dev/null @@ -1,374 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Architecture specific (PPC64) functions for kexec based crash dumps. - * - * Copyright (C) 2005, IBM Corp. - * - * Created by: Haren Myneni - */ - -#include <linux/kernel.h> -#include <linux/smp.h> -#include <linux/reboot.h> -#include <linux/kexec.h> -#include <linux/export.h> -#include <linux/crash_dump.h> -#include <linux/delay.h> -#include <linux/irq.h> -#include <linux/types.h> - -#include <asm/processor.h> -#include <asm/machdep.h> -#include <asm/kexec.h> -#include <asm/prom.h> -#include <asm/smp.h> -#include <asm/setjmp.h> -#include <asm/debug.h> - -/* - * The primary CPU waits a while for all secondary CPUs to enter. This is to - * avoid sending an IPI if the secondary CPUs are entering - * crash_kexec_secondary on their own (eg via a system reset). - * - * The secondary timeout has to be longer than the primary. Both timeouts are - * in milliseconds. - */ -#define PRIMARY_TIMEOUT 500 -#define SECONDARY_TIMEOUT 1000 - -#define IPI_TIMEOUT 10000 -#define REAL_MODE_TIMEOUT 10000 - -static int time_to_dump; -/* - * crash_wake_offline should be set to 1 by platforms that intend to wake - * up offline cpus prior to jumping to a kdump kernel. 
Currently powernv - * sets it to 1, since we want to avoid things from happening when an - * offline CPU wakes up due to something like an HMI (malfunction error), - * which propagates to all threads. - */ -int crash_wake_offline; - -#define CRASH_HANDLER_MAX 3 -/* List of shutdown handles */ -static crash_shutdown_t crash_shutdown_handles[CRASH_HANDLER_MAX]; -static DEFINE_SPINLOCK(crash_handlers_lock); - -static unsigned long crash_shutdown_buf[JMP_BUF_LEN]; -static int crash_shutdown_cpu = -1; - -static int handle_fault(struct pt_regs *regs) -{ - if (crash_shutdown_cpu == smp_processor_id()) - longjmp(crash_shutdown_buf, 1); - return 0; -} - -#ifdef CONFIG_SMP - -static atomic_t cpus_in_crash; -void crash_ipi_callback(struct pt_regs *regs) -{ - static cpumask_t cpus_state_saved = CPU_MASK_NONE; - - int cpu = smp_processor_id(); - - hard_irq_disable(); - if (!cpumask_test_cpu(cpu, &cpus_state_saved)) { - crash_save_cpu(regs, cpu); - cpumask_set_cpu(cpu, &cpus_state_saved); - } - - atomic_inc(&cpus_in_crash); - smp_mb__after_atomic(); - - /* - * Starting the kdump boot. - * This barrier is needed to make sure that all CPUs are stopped. - */ - while (!time_to_dump) - cpu_relax(); - - if (ppc_md.kexec_cpu_down) - ppc_md.kexec_cpu_down(1, 1); - -#ifdef CONFIG_PPC64 - kexec_smp_wait(); -#else - for (;;); /* FIXME */ -#endif - - /* NOTREACHED */ -} - -static void crash_kexec_prepare_cpus(int cpu) -{ - unsigned int msecs; - unsigned int ncpus = num_online_cpus() - 1;/* Excluding the panic cpu */ - int tries = 0; - int (*old_handler)(struct pt_regs *regs); - - printk(KERN_EMERG "Sending IPI to other CPUs\n"); - - if (crash_wake_offline) - ncpus = num_present_cpus() - 1; - - crash_send_ipi(crash_ipi_callback); - smp_wmb(); - -again: - /* - * FIXME: Until we will have the way to stop other CPUs reliably, - * the crash CPU will send an IPI and wait for other CPUs to - * respond. 
- */ - msecs = IPI_TIMEOUT; - while ((atomic_read(&cpus_in_crash) < ncpus) && (--msecs > 0)) - mdelay(1); - - /* Would it be better to replace the trap vector here? */ - - if (atomic_read(&cpus_in_crash) >= ncpus) { - printk(KERN_EMERG "IPI complete\n"); - return; - } - - printk(KERN_EMERG "ERROR: %d cpu(s) not responding\n", - ncpus - atomic_read(&cpus_in_crash)); - - /* - * If we have a panic timeout set then we can't wait indefinitely - * for someone to activate system reset. We also give up on the - * second time through if system reset fail to work. - */ - if ((panic_timeout > 0) || (tries > 0)) - return; - - /* - * A system reset will cause all CPUs to take an 0x100 exception. - * The primary CPU returns here via setjmp, and the secondary - * CPUs reexecute the crash_kexec_secondary path. - */ - old_handler = __debugger; - __debugger = handle_fault; - crash_shutdown_cpu = smp_processor_id(); - - if (setjmp(crash_shutdown_buf) == 0) { - printk(KERN_EMERG "Activate system reset (dumprestart) " - "to stop other cpu(s)\n"); - - /* - * A system reset will force all CPUs to execute the - * crash code again. We need to reset cpus_in_crash so we - * wait for everyone to do this. - */ - atomic_set(&cpus_in_crash, 0); - smp_mb(); - - while (atomic_read(&cpus_in_crash) < ncpus) - cpu_relax(); - } - - crash_shutdown_cpu = -1; - __debugger = old_handler; - - tries++; - goto again; -} - -/* - * This function will be called by secondary cpus. - */ -void crash_kexec_secondary(struct pt_regs *regs) -{ - unsigned long flags; - int msecs = SECONDARY_TIMEOUT; - - local_irq_save(flags); - - /* Wait for the primary crash CPU to signal its progress */ - while (crashing_cpu < 0) { - if (--msecs < 0) { - /* No response, kdump image may not have been loaded */ - local_irq_restore(flags); - return; - } - - mdelay(1); - } - - crash_ipi_callback(regs); -} - -#else /* ! 
CONFIG_SMP */ - -static void crash_kexec_prepare_cpus(int cpu) -{ - /* - * move the secondaries to us so that we can copy - * the new kernel 0-0x100 safely - * - * do this if kexec in setup.c ? - */ -#ifdef CONFIG_PPC64 - smp_release_cpus(); -#else - /* FIXME */ -#endif -} - -void crash_kexec_secondary(struct pt_regs *regs) -{ -} -#endif /* CONFIG_SMP */ - -/* wait for all the CPUs to hit real mode but timeout if they don't come in */ -#if defined(CONFIG_SMP) && defined(CONFIG_PPC64) -static void __maybe_unused crash_kexec_wait_realmode(int cpu) -{ - unsigned int msecs; - int i; - - msecs = REAL_MODE_TIMEOUT; - for (i=0; i < nr_cpu_ids && msecs > 0; i++) { - if (i == cpu) - continue; - - while (paca_ptrs[i]->kexec_state < KEXEC_STATE_REAL_MODE) { - barrier(); - if (!cpu_possible(i) || !cpu_online(i) || (msecs <= 0)) - break; - msecs--; - mdelay(1); - } - } - mb(); -} -#else -static inline void crash_kexec_wait_realmode(int cpu) {} -#endif /* CONFIG_SMP && CONFIG_PPC64 */ - -/* - * Register a function to be called on shutdown. Only use this if you - * can't reset your device in the second kernel. 
- */ -int crash_shutdown_register(crash_shutdown_t handler) -{ - unsigned int i, rc; - - spin_lock(&crash_handlers_lock); - for (i = 0 ; i < CRASH_HANDLER_MAX; i++) - if (!crash_shutdown_handles[i]) { - /* Insert handle at first empty entry */ - crash_shutdown_handles[i] = handler; - rc = 0; - break; - } - - if (i == CRASH_HANDLER_MAX) { - printk(KERN_ERR "Crash shutdown handles full, " - "not registered.\n"); - rc = 1; - } - - spin_unlock(&crash_handlers_lock); - return rc; -} -EXPORT_SYMBOL(crash_shutdown_register); - -int crash_shutdown_unregister(crash_shutdown_t handler) -{ - unsigned int i, rc; - - spin_lock(&crash_handlers_lock); - for (i = 0 ; i < CRASH_HANDLER_MAX; i++) - if (crash_shutdown_handles[i] == handler) - break; - - if (i == CRASH_HANDLER_MAX) { - printk(KERN_ERR "Crash shutdown handle not found\n"); - rc = 1; - } else { - /* Shift handles down */ - for (; i < (CRASH_HANDLER_MAX - 1); i++) - crash_shutdown_handles[i] = - crash_shutdown_handles[i+1]; - /* - * Reset last entry to NULL now that it has been shifted down, - * this will allow new handles to be added here. - */ - crash_shutdown_handles[i] = NULL; - rc = 0; - } - - spin_unlock(&crash_handlers_lock); - return rc; -} -EXPORT_SYMBOL(crash_shutdown_unregister); - -void default_machine_crash_shutdown(struct pt_regs *regs) -{ - unsigned int i; - int (*old_handler)(struct pt_regs *regs); - - /* - * This function is only called after the system - * has panicked or is otherwise in a critical state. - * The minimum amount of code to allow a kexec'd kernel - * to run successfully needs to happen here. - * - * In practice this means stopping other cpus in - * an SMP system. - * The kernel is broken so disable interrupts. - */ - hard_irq_disable(); - - /* - * Make a note of crashing cpu. Will be used in machine_kexec - * such that another IPI will not be sent. - */ - crashing_cpu = smp_processor_id(); - - /* - * If we came in via system reset, wait a while for the secondary - * CPUs to enter. 
- */ - if (TRAP(regs) == 0x100) - mdelay(PRIMARY_TIMEOUT); - - crash_kexec_prepare_cpus(crashing_cpu); - - crash_save_cpu(regs, crashing_cpu); - - time_to_dump = 1; - - crash_kexec_wait_realmode(crashing_cpu); - - machine_kexec_mask_interrupts(); - - /* - * Call registered shutdown routines safely. Swap out - * __debugger_fault_handler, and replace on exit. - */ - old_handler = __debugger_fault_handler; - __debugger_fault_handler = handle_fault; - crash_shutdown_cpu = smp_processor_id(); - for (i = 0; i < CRASH_HANDLER_MAX && crash_shutdown_handles[i]; i++) { - if (setjmp(crash_shutdown_buf) == 0) { - /* - * Insert syncs and delay to ensure - * instructions in the dangerous region don't - * leak away from this protected region. - */ - asm volatile("sync; isync"); - /* dangerous region */ - crash_shutdown_handles[i](); - asm volatile("sync; isync"); - } - } - crash_shutdown_cpu = -1; - __debugger_fault_handler = old_handler; - - if (ppc_md.kexec_cpu_down) - ppc_md.kexec_cpu_down(1, 0); -} diff --git a/arch/powerpc/kernel/dawr.c b/arch/powerpc/kernel/dawr.c index 5f66b95b6858..cc14aa6c4a1b 100644 --- a/arch/powerpc/kernel/dawr.c +++ b/arch/powerpc/kernel/dawr.c @@ -30,10 +30,10 @@ int set_dawr(struct arch_hw_breakpoint *brk) * DAWR length is stored in field MDR bits 48:53. Matches range in * doublewords (64 bits) baised by -1 eg. 0b000000=1DW and * 0b111111=64DW. - * brk->len is in bytes. + * brk->hw_len is in bytes. * This aligns up to double word size, shifts and does the bias. 
*/ - mrd = ((brk->len + 7) >> 3) - 1; + mrd = ((brk->hw_len + 7) >> 3) - 1; dawrx |= (mrd & 0x3f) << (63 - 53); if (ppc_md.set_dawr) @@ -54,7 +54,7 @@ static ssize_t dawr_write_file_bool(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { - struct arch_hw_breakpoint null_brk = {0, 0, 0}; + struct arch_hw_breakpoint null_brk = {0}; size_t rc; /* Send error to user if they hypervisor won't allow us to write DAWR */ diff --git a/arch/powerpc/kernel/dbell.c b/arch/powerpc/kernel/dbell.c index 804b1a6196fa..f17ff1200eaa 100644 --- a/arch/powerpc/kernel/dbell.c +++ b/arch/powerpc/kernel/dbell.c @@ -33,7 +33,7 @@ void doorbell_global_ipi(int cpu) { u32 tag = get_hard_smp_processor_id(cpu); - kvmppc_set_host_ipi(cpu, 1); + kvmppc_set_host_ipi(cpu); /* Order previous accesses vs. msgsnd, which is treated as a store */ ppc_msgsnd_sync(); ppc_msgsnd(PPC_DBELL_MSGTYPE, 0, tag); @@ -48,7 +48,7 @@ void doorbell_core_ipi(int cpu) { u32 tag = cpu_thread_in_core(cpu); - kvmppc_set_host_ipi(cpu, 1); + kvmppc_set_host_ipi(cpu); /* Order previous accesses vs. 
msgsnd, which is treated as a store */ ppc_msgsnd_sync(); ppc_msgsnd(PPC_DBELL_MSGTYPE, 0, tag); @@ -84,7 +84,7 @@ void doorbell_exception(struct pt_regs *regs) may_hard_irq_enable(); - kvmppc_set_host_ipi(smp_processor_id(), 0); + kvmppc_clear_host_ipi(smp_processor_id()); __this_cpu_inc(irq_stat.doorbell_irqs); smp_ipi_demux_relaxed(); /* already performed the barrier */ diff --git a/arch/powerpc/kernel/dma-iommu.c b/arch/powerpc/kernel/dma-iommu.c index a0879674a9c8..e486d1d78de2 100644 --- a/arch/powerpc/kernel/dma-iommu.c +++ b/arch/powerpc/kernel/dma-iommu.c @@ -122,18 +122,17 @@ int dma_iommu_dma_supported(struct device *dev, u64 mask) { struct iommu_table *tbl = get_iommu_table_base(dev); - if (!tbl) { - dev_info(dev, "Warning: IOMMU dma not supported: mask 0x%08llx" - ", table unavailable\n", mask); - return 0; - } - if (dev_is_pci(dev) && dma_iommu_bypass_supported(dev, mask)) { dev->archdata.iommu_bypass = true; dev_dbg(dev, "iommu: 64-bit OK, using fixed ops\n"); return 1; } + if (!tbl) { + dev_err(dev, "Warning: IOMMU dma not supported: mask 0x%08llx, table unavailable\n", mask); + return 0; + } + if (tbl->it_offset > (mask >> tbl->it_page_shift)) { dev_info(dev, "Warning: IOMMU offset too big for device mask\n"); dev_info(dev, "mask: 0x%08llx, table offset: 0x%08lx\n", @@ -208,4 +207,6 @@ const struct dma_map_ops dma_iommu_ops = { .sync_single_for_device = dma_iommu_sync_for_device, .sync_sg_for_cpu = dma_iommu_sync_sg_for_cpu, .sync_sg_for_device = dma_iommu_sync_sg_for_device, + .mmap = dma_common_mmap, + .get_sgtable = dma_common_get_sgtable, }; diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c index bd95318d2202..182b4047c1ef 100644 --- a/arch/powerpc/kernel/dt_cpu_ftrs.c +++ b/arch/powerpc/kernel/dt_cpu_ftrs.c @@ -101,7 +101,7 @@ static void __restore_cpu_cpufeatures(void) if (hv_mode) { mtspr(SPRN_LPID, 0); mtspr(SPRN_HFSCR, system_registers.hfscr); - mtspr(SPRN_PCR, 0); + mtspr(SPRN_PCR, PCR_MASK); } 
mtspr(SPRN_FSCR, system_registers.fscr); @@ -144,6 +144,7 @@ static void __init cpufeatures_setup_cpu(void) mtspr(SPRN_HFSCR, 0); } mtspr(SPRN_FSCR, 0); + mtspr(SPRN_PCR, PCR_MASK); /* * LPCR does not get cleared, to match behaviour with secondaries @@ -691,31 +692,62 @@ static bool __init cpufeatures_process_feature(struct dt_cpu_feature *f) return true; } +/* + * Handle POWER9 broadcast tlbie invalidation issue using + * cpu feature flag. + */ +static __init void update_tlbie_feature_flag(unsigned long pvr) +{ + if (PVR_VER(pvr) == PVR_POWER9) { + /* + * Set the tlbie feature flag for anything below + * Nimbus DD 2.3 and Cumulus DD 1.3 + */ + if ((pvr & 0xe000) == 0) { + /* Nimbus */ + if ((pvr & 0xfff) < 0x203) + cur_cpu_spec->cpu_features |= CPU_FTR_P9_TLBIE_STQ_BUG; + } else if ((pvr & 0xc000) == 0) { + /* Cumulus */ + if ((pvr & 0xfff) < 0x103) + cur_cpu_spec->cpu_features |= CPU_FTR_P9_TLBIE_STQ_BUG; + } else { + WARN_ONCE(1, "Unknown PVR"); + cur_cpu_spec->cpu_features |= CPU_FTR_P9_TLBIE_STQ_BUG; + } + + cur_cpu_spec->cpu_features |= CPU_FTR_P9_TLBIE_ERAT_BUG; + } +} + static __init void cpufeatures_cpu_quirks(void) { - int version = mfspr(SPRN_PVR); + unsigned long version = mfspr(SPRN_PVR); /* * Not all quirks can be derived from the cpufeatures device tree. 
*/ - if ((version & 0xffffefff) == 0x004e0200) - ; /* DD2.0 has no feature flag */ - else if ((version & 0xffffefff) == 0x004e0201) + if ((version & 0xffffefff) == 0x004e0200) { + /* DD2.0 has no feature flag */ + cur_cpu_spec->cpu_features |= CPU_FTR_P9_RADIX_PREFETCH_BUG; + } else if ((version & 0xffffefff) == 0x004e0201) { cur_cpu_spec->cpu_features |= CPU_FTR_POWER9_DD2_1; - else if ((version & 0xffffefff) == 0x004e0202) { + cur_cpu_spec->cpu_features |= CPU_FTR_P9_RADIX_PREFETCH_BUG; + } else if ((version & 0xffffefff) == 0x004e0202) { cur_cpu_spec->cpu_features |= CPU_FTR_P9_TM_HV_ASSIST; cur_cpu_spec->cpu_features |= CPU_FTR_P9_TM_XER_SO_BUG; cur_cpu_spec->cpu_features |= CPU_FTR_POWER9_DD2_1; - } else if ((version & 0xffff0000) == 0x004e0000) + } else if ((version & 0xffff0000) == 0x004e0000) { /* DD2.1 and up have DD2_1 */ cur_cpu_spec->cpu_features |= CPU_FTR_POWER9_DD2_1; + } if ((version & 0xffff0000) == 0x004e0000) { cur_cpu_spec->cpu_features &= ~(CPU_FTR_DAWR); - cur_cpu_spec->cpu_features |= CPU_FTR_P9_TLBIE_BUG; cur_cpu_spec->cpu_features |= CPU_FTR_P9_TIDR; } + update_tlbie_feature_flag(version); /* * PKEY was not in the initial base or feature node * specification, but it should become optional in the next diff --git a/arch/powerpc/kernel/early_32.c b/arch/powerpc/kernel/early_32.c index 3482118ffe76..ef2ad4945904 100644 --- a/arch/powerpc/kernel/early_32.c +++ b/arch/powerpc/kernel/early_32.c @@ -19,10 +19,13 @@ */ notrace unsigned long __init early_init(unsigned long dt_ptr) { - unsigned long offset = reloc_offset(); + unsigned long kva, offset = reloc_offset(); + + kva = *PTRRELOC(&kernstart_virt_addr); /* First zero the BSS */ - memset(PTRRELOC(&__bss_start), 0, __bss_stop - __bss_start); + if (kva == KERNELBASE) + memset(PTRRELOC(&__bss_start), 0, __bss_stop - __bss_start); /* * Identify the CPU type and fix up code sections @@ -32,5 +35,5 @@ notrace unsigned long __init early_init(unsigned long dt_ptr) apply_feature_fixups(); - return 
KERNELBASE + offset; + return kva + offset; } diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index c0e4b73191f3..17cb3e9b5697 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -150,6 +150,16 @@ static int __init eeh_setup(char *str) } __setup("eeh=", eeh_setup); +void eeh_show_enabled(void) +{ + if (eeh_has_flag(EEH_FORCE_DISABLED)) + pr_info("EEH: Recovery disabled by kernel parameter.\n"); + else if (eeh_has_flag(EEH_ENABLED)) + pr_info("EEH: Capable adapter found: recovery enabled.\n"); + else + pr_info("EEH: No capable adapters found: recovery disabled.\n"); +} + /* * This routine captures assorted PCI configuration space data * for the indicated PCI device, and puts them into a buffer @@ -410,11 +420,9 @@ static int eeh_phb_check_failure(struct eeh_pe *pe) eeh_pe_mark_isolated(phb_pe); eeh_serialize_unlock(flags); - pr_err("EEH: PHB#%x failure detected, location: %s\n", + pr_debug("EEH: PHB#%x failure detected, location: %s\n", phb_pe->phb->global_number, eeh_pe_loc_get(phb_pe)); - dump_stack(); eeh_send_failure_event(phb_pe); - return 1; out: eeh_serialize_unlock(flags); @@ -441,7 +449,7 @@ int eeh_dev_check_failure(struct eeh_dev *edev) unsigned long flags; struct device_node *dn; struct pci_dev *dev; - struct eeh_pe *pe, *parent_pe, *phb_pe; + struct eeh_pe *pe, *parent_pe; int rc = 0; const char *location = NULL; @@ -460,8 +468,7 @@ int eeh_dev_check_failure(struct eeh_dev *edev) /* Access to IO BARs might get this far and still not want checking. 
*/ if (!pe) { eeh_stats.ignored_check++; - pr_debug("EEH: Ignored check for %s\n", - eeh_pci_name(dev)); + eeh_edev_dbg(edev, "Ignored check\n"); return 0; } @@ -496,17 +503,16 @@ int eeh_dev_check_failure(struct eeh_dev *edev) rc = 1; if (pe->state & EEH_PE_ISOLATED) { pe->check_count++; - if (pe->check_count % EEH_MAX_FAILS == 0) { + if (pe->check_count == EEH_MAX_FAILS) { dn = pci_device_to_OF_node(dev); if (dn) location = of_get_property(dn, "ibm,loc-code", NULL); - printk(KERN_ERR "EEH: %d reads ignored for recovering device at " - "location=%s driver=%s pci addr=%s\n", + eeh_edev_err(edev, "%d reads ignored for recovering device at location=%s driver=%s\n", pe->check_count, location ? location : "unknown", - eeh_driver_name(dev), eeh_pci_name(dev)); - printk(KERN_ERR "EEH: Might be infinite loop in %s driver\n", + eeh_driver_name(dev)); + eeh_edev_err(edev, "Might be infinite loop in %s driver\n", eeh_driver_name(dev)); dump_stack(); } @@ -573,13 +579,8 @@ int eeh_dev_check_failure(struct eeh_dev *edev) * a stack trace will help the device-driver authors figure * out what happened. So print that out. 
*/ - phb_pe = eeh_phb_pe_get(pe->phb); - pr_err("EEH: Frozen PHB#%x-PE#%x detected\n", - pe->phb->global_number, pe->addr); - pr_err("EEH: PE location: %s, PHB location: %s\n", - eeh_pe_loc_get(pe), eeh_pe_loc_get(phb_pe)); - dump_stack(); - + pr_debug("EEH: %s: Frozen PHB#%x-PE#%x detected\n", + __func__, pe->phb->global_number, pe->addr); eeh_send_failure_event(pe); return 1; @@ -697,7 +698,7 @@ int eeh_pci_enable(struct eeh_pe *pe, int function) return rc; } -static void *eeh_disable_and_save_dev_state(struct eeh_dev *edev, +static void eeh_disable_and_save_dev_state(struct eeh_dev *edev, void *userdata) { struct pci_dev *pdev = eeh_dev_to_pci_dev(edev); @@ -708,7 +709,7 @@ static void *eeh_disable_and_save_dev_state(struct eeh_dev *edev, * state for the specified device */ if (!pdev || pdev == dev) - return NULL; + return; /* Ensure we have D0 power state */ pci_set_power_state(pdev, PCI_D0); @@ -721,18 +722,16 @@ static void *eeh_disable_and_save_dev_state(struct eeh_dev *edev, * interrupt from the device */ pci_write_config_word(pdev, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE); - - return NULL; } -static void *eeh_restore_dev_state(struct eeh_dev *edev, void *userdata) +static void eeh_restore_dev_state(struct eeh_dev *edev, void *userdata) { struct pci_dn *pdn = eeh_dev_to_pdn(edev); struct pci_dev *pdev = eeh_dev_to_pci_dev(edev); struct pci_dev *dev = userdata; if (!pdev) - return NULL; + return; /* Apply customization from firmware */ if (pdn && eeh_ops->restore_config) @@ -741,8 +740,6 @@ static void *eeh_restore_dev_state(struct eeh_dev *edev, void *userdata) /* The caller should restore state for the specified device */ if (pdev != dev) pci_restore_state(pdev); - - return NULL; } int eeh_restore_vf_config(struct pci_dn *pdn) @@ -868,7 +865,7 @@ int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state stat * the indicated device and its children so that the bunch of the * devices could be reset properly. 
*/ -static void *eeh_set_dev_freset(struct eeh_dev *edev, void *flag) +static void eeh_set_dev_freset(struct eeh_dev *edev, void *flag) { struct pci_dev *dev; unsigned int *freset = (unsigned int *)flag; @@ -876,8 +873,6 @@ static void *eeh_set_dev_freset(struct eeh_dev *edev, void *flag) dev = eeh_dev_to_pci_dev(edev); if (dev) *freset |= dev->needs_freset; - - return NULL; } static void eeh_pe_refreeze_passed(struct eeh_pe *root) @@ -1063,23 +1058,6 @@ static struct notifier_block eeh_reboot_nb = { .notifier_call = eeh_reboot_notifier, }; -void eeh_probe_devices(void) -{ - struct pci_controller *hose, *tmp; - struct pci_dn *pdn; - - /* Enable EEH for all adapters */ - list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { - pdn = hose->pci_data; - traverse_pci_dn(pdn, eeh_ops->probe, NULL); - } - if (eeh_enabled()) - pr_info("EEH: PCI Enhanced I/O Error Handling Enabled\n"); - else - pr_info("EEH: No capable adapters found\n"); - -} - /** * eeh_init - EEH initialization * @@ -1120,6 +1098,8 @@ static int eeh_init(void) list_for_each_entry_safe(hose, tmp, &hose_list, list_node) eeh_dev_phb_init_dynamic(hose); + eeh_addr_cache_init(); + /* Initialize EEH event */ return eeh_event_init(); } @@ -1190,15 +1170,14 @@ void eeh_add_device_late(struct pci_dev *dev) struct pci_dn *pdn; struct eeh_dev *edev; - if (!dev || !eeh_enabled()) + if (!dev) return; - pr_debug("EEH: Adding device %s\n", pci_name(dev)); - pdn = pci_get_pdn_by_devfn(dev->bus, dev->devfn); edev = pdn_to_eeh_dev(pdn); + eeh_edev_dbg(edev, "Adding device\n"); if (edev->pdev == dev) { - pr_debug("EEH: Already referenced !\n"); + eeh_edev_dbg(edev, "Device already referenced!\n"); return; } @@ -1212,7 +1191,6 @@ void eeh_add_device_late(struct pci_dev *dev) eeh_rmv_from_parent_pe(edev); eeh_addr_cache_rmv_dev(edev->pdev); eeh_sysfs_remove_device(edev->pdev); - edev->mode &= ~EEH_DEV_SYSFS; /* * We definitely should have the PCI device removed @@ -1246,6 +1224,8 @@ void eeh_add_device_tree_late(struct 
pci_bus *bus) { struct pci_dev *dev; + if (eeh_has_flag(EEH_FORCE_DISABLED)) + return; list_for_each_entry(dev, &bus->devices, bus_list) { eeh_add_device_late(dev); if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) { @@ -1299,10 +1279,10 @@ void eeh_remove_device(struct pci_dev *dev) edev = pci_dev_to_eeh_dev(dev); /* Unregister the device with the EEH/PCI address search system */ - pr_debug("EEH: Removing device %s\n", pci_name(dev)); + dev_dbg(&dev->dev, "EEH: Removing device\n"); if (!edev || !edev->pdev || !edev->pe) { - pr_debug("EEH: Not referenced !\n"); + dev_dbg(&dev->dev, "EEH: Device not referenced!\n"); return; } @@ -1315,17 +1295,11 @@ void eeh_remove_device(struct pci_dev *dev) edev->pdev = NULL; /* - * The flag "in_error" is used to trace EEH devices for VFs - * in error state or not. It's set in eeh_report_error(). If - * it's not set, eeh_report_{reset,resume}() won't be called - * for the VF EEH device. + * eeh_sysfs_remove_device() uses pci_dev_to_eeh_dev() so we need to + * remove the sysfs files before clearing dev.archdata.edev */ - edev->in_error = false; - dev->dev.archdata.edev = NULL; - if (!(edev->pe->state & EEH_PE_KEEP)) - eeh_rmv_from_parent_pe(edev); - else - edev->mode |= EEH_DEV_DISCONNECTED; + if (edev->mode & EEH_DEV_SYSFS) + eeh_sysfs_remove_device(dev); /* * We're removing from the PCI subsystem, that means @@ -1336,8 +1310,19 @@ void eeh_remove_device(struct pci_dev *dev) edev->mode |= EEH_DEV_NO_HANDLER; eeh_addr_cache_rmv_dev(dev); - eeh_sysfs_remove_device(dev); - edev->mode &= ~EEH_DEV_SYSFS; + + /* + * The flag "in_error" is used to trace EEH devices for VFs + * in error state or not. It's set in eeh_report_error(). If + * it's not set, eeh_report_{reset,resume}() won't be called + * for the VF EEH device. 
+ */ + edev->in_error = false; + dev->dev.archdata.edev = NULL; + if (!(edev->pe->state & EEH_PE_KEEP)) + eeh_rmv_from_parent_pe(edev); + else + edev->mode |= EEH_DEV_DISCONNECTED; } int eeh_unfreeze_pe(struct eeh_pe *pe) @@ -1890,6 +1875,198 @@ static const struct file_operations eeh_force_recover_fops = { .llseek = no_llseek, .write = eeh_force_recover_write, }; + +static ssize_t eeh_debugfs_dev_usage(struct file *filp, + char __user *user_buf, + size_t count, loff_t *ppos) +{ + static const char usage[] = "input format: <domain>:<bus>:<dev>.<fn>\n"; + + return simple_read_from_buffer(user_buf, count, ppos, + usage, sizeof(usage) - 1); +} + +static ssize_t eeh_dev_check_write(struct file *filp, + const char __user *user_buf, + size_t count, loff_t *ppos) +{ + uint32_t domain, bus, dev, fn; + struct pci_dev *pdev; + struct eeh_dev *edev; + char buf[20]; + int ret; + + memset(buf, 0, sizeof(buf)); + ret = simple_write_to_buffer(buf, sizeof(buf)-1, ppos, user_buf, count); + if (!ret) + return -EFAULT; + + ret = sscanf(buf, "%x:%x:%x.%x", &domain, &bus, &dev, &fn); + if (ret != 4) { + pr_err("%s: expected 4 args, got %d\n", __func__, ret); + return -EINVAL; + } + + pdev = pci_get_domain_bus_and_slot(domain, bus, (dev << 3) | fn); + if (!pdev) + return -ENODEV; + + edev = pci_dev_to_eeh_dev(pdev); + if (!edev) { + pci_err(pdev, "No eeh_dev for this device!\n"); + pci_dev_put(pdev); + return -ENODEV; + } + + ret = eeh_dev_check_failure(edev); + pci_info(pdev, "eeh_dev_check_failure(%04x:%02x:%02x.%01x) = %d\n", + domain, bus, dev, fn, ret); + + pci_dev_put(pdev); + + return count; +} + +static const struct file_operations eeh_dev_check_fops = { + .open = simple_open, + .llseek = no_llseek, + .write = eeh_dev_check_write, + .read = eeh_debugfs_dev_usage, +}; + +static int eeh_debugfs_break_device(struct pci_dev *pdev) +{ + struct resource *bar = NULL; + void __iomem *mapped; + u16 old, bit; + int i, pos; + + /* Do we have an MMIO BAR to disable? 
*/ + for (i = 0; i <= PCI_STD_RESOURCE_END; i++) { + struct resource *r = &pdev->resource[i]; + + if (!r->flags || !r->start) + continue; + if (r->flags & IORESOURCE_IO) + continue; + if (r->flags & IORESOURCE_UNSET) + continue; + + bar = r; + break; + } + + if (!bar) { + pci_err(pdev, "Unable to find Memory BAR to cause EEH with\n"); + return -ENXIO; + } + + pci_err(pdev, "Going to break: %pR\n", bar); + + if (pdev->is_virtfn) { +#ifndef CONFIG_PCI_IOV + return -ENXIO; +#else + /* + * VFs don't have a per-function COMMAND register, so the best + * we can do is clear the Memory Space Enable bit in the PF's + * SRIOV control reg. + * + * Unfortunately, this requires that we have a PF (i.e doesn't + * work for a passed-through VF) and it has the potential side + * effect of also causing an EEH on every other VF under the + * PF. Oh well. + */ + pdev = pdev->physfn; + if (!pdev) + return -ENXIO; /* passed through VFs have no PF */ + + pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_SRIOV); + pos += PCI_SRIOV_CTRL; + bit = PCI_SRIOV_CTRL_MSE; +#endif /* !CONFIG_PCI_IOV */ + } else { + bit = PCI_COMMAND_MEMORY; + pos = PCI_COMMAND; + } + + /* + * Process here is: + * + * 1. Disable Memory space. + * + * 2. Perform an MMIO to the device. This should result in an error + * (CA / UR) being raised by the device which results in an EEH + * PE freeze. Using the in_8() accessor skips the eeh detection hook + * so the freeze hook so the EEH Detection machinery won't be + * triggered here. This is to match the usual behaviour of EEH + * where the HW will asyncronously freeze a PE and it's up to + * the kernel to notice and deal with it. + * + * 3. Turn Memory space back on. This is more important for VFs + * since recovery will probably fail if we don't. For normal + * the COMMAND register is reset as a part of re-initialising + * the device. 
+ * + * Breaking stuff is the point so who cares if it's racy ;) + */ + pci_read_config_word(pdev, pos, &old); + + mapped = ioremap(bar->start, PAGE_SIZE); + if (!mapped) { + pci_err(pdev, "Unable to map MMIO BAR %pR\n", bar); + return -ENXIO; + } + + pci_write_config_word(pdev, pos, old & ~bit); + in_8(mapped); + pci_write_config_word(pdev, pos, old); + + iounmap(mapped); + + return 0; +} + +static ssize_t eeh_dev_break_write(struct file *filp, + const char __user *user_buf, + size_t count, loff_t *ppos) +{ + uint32_t domain, bus, dev, fn; + struct pci_dev *pdev; + char buf[20]; + int ret; + + memset(buf, 0, sizeof(buf)); + ret = simple_write_to_buffer(buf, sizeof(buf)-1, ppos, user_buf, count); + if (!ret) + return -EFAULT; + + ret = sscanf(buf, "%x:%x:%x.%x", &domain, &bus, &dev, &fn); + if (ret != 4) { + pr_err("%s: expected 4 args, got %d\n", __func__, ret); + return -EINVAL; + } + + pdev = pci_get_domain_bus_and_slot(domain, bus, (dev << 3) | fn); + if (!pdev) + return -ENODEV; + + ret = eeh_debugfs_break_device(pdev); + pci_dev_put(pdev); + + if (ret < 0) + return ret; + + return count; +} + +static const struct file_operations eeh_dev_break_fops = { + .open = simple_open, + .llseek = no_llseek, + .write = eeh_dev_break_write, + .read = eeh_debugfs_dev_usage, +}; + #endif static int __init eeh_init_proc(void) @@ -1905,6 +2082,12 @@ static int __init eeh_init_proc(void) debugfs_create_bool("eeh_disable_recovery", 0600, powerpc_debugfs_root, &eeh_debugfs_no_recover); + debugfs_create_file_unsafe("eeh_dev_check", 0600, + powerpc_debugfs_root, NULL, + &eeh_dev_check_fops); + debugfs_create_file_unsafe("eeh_dev_break", 0600, + powerpc_debugfs_root, NULL, + &eeh_dev_break_fops); debugfs_create_file_unsafe("eeh_force_recover", 0600, powerpc_debugfs_root, NULL, &eeh_force_recover_fops); diff --git a/arch/powerpc/kernel/eeh_cache.c b/arch/powerpc/kernel/eeh_cache.c index 05ffd32b3416..6b50bf15d8c1 100644 --- a/arch/powerpc/kernel/eeh_cache.c +++ 
b/arch/powerpc/kernel/eeh_cache.c @@ -148,8 +148,8 @@ eeh_addr_cache_insert(struct pci_dev *dev, resource_size_t alo, piar->pcidev = dev; piar->flags = flags; - pr_debug("PIAR: insert range=[%pap:%pap] dev=%s\n", - &alo, &ahi, pci_name(dev)); + eeh_edev_dbg(piar->edev, "PIAR: insert range=[%pap:%pap]\n", + &alo, &ahi); rb_link_node(&piar->rb_node, parent, p); rb_insert_color(&piar->rb_node, &pci_io_addr_cache_root.rb_root); @@ -159,18 +159,10 @@ eeh_addr_cache_insert(struct pci_dev *dev, resource_size_t alo, static void __eeh_addr_cache_insert_dev(struct pci_dev *dev) { - struct pci_dn *pdn; struct eeh_dev *edev; int i; - pdn = pci_get_pdn_by_devfn(dev->bus, dev->devfn); - if (!pdn) { - pr_warn("PCI: no pci dn found for dev=%s\n", - pci_name(dev)); - return; - } - - edev = pdn_to_eeh_dev(pdn); + edev = pci_dev_to_eeh_dev(dev); if (!edev) { pr_warn("PCI: no EEH dev found for %s\n", pci_name(dev)); @@ -229,8 +221,8 @@ restart: piar = rb_entry(n, struct pci_io_addr_range, rb_node); if (piar->pcidev == dev) { - pr_debug("PIAR: remove range=[%pap:%pap] dev=%s\n", - &piar->addr_lo, &piar->addr_hi, pci_name(dev)); + eeh_edev_dbg(piar->edev, "PIAR: remove range=[%pap:%pap]\n", + &piar->addr_lo, &piar->addr_hi); rb_erase(n, &pci_io_addr_cache_root.rb_root); kfree(piar); goto restart; @@ -258,37 +250,14 @@ void eeh_addr_cache_rmv_dev(struct pci_dev *dev) } /** - * eeh_addr_cache_build - Build a cache of I/O addresses + * eeh_addr_cache_init - Initialize a cache of I/O addresses * - * Build a cache of pci i/o addresses. This cache will be used to + * Initialize a cache of pci i/o addresses. This cache will be used to * find the pci device that corresponds to a given address. - * This routine scans all pci busses to build the cache. - * Must be run late in boot process, after the pci controllers - * have been scanned for devices (after all device resources are known). 
*/ -void eeh_addr_cache_build(void) +void eeh_addr_cache_init(void) { - struct pci_dn *pdn; - struct eeh_dev *edev; - struct pci_dev *dev = NULL; - spin_lock_init(&pci_io_addr_cache_root.piar_lock); - - for_each_pci_dev(dev) { - pdn = pci_get_pdn_by_devfn(dev->bus, dev->devfn); - if (!pdn) - continue; - - edev = pdn_to_eeh_dev(pdn); - if (!edev) - continue; - - dev->dev.archdata.edev = edev; - edev->pdev = dev; - - eeh_addr_cache_insert_dev(dev); - eeh_sysfs_add_device(dev); - } } static int eeh_addr_cache_show(struct seq_file *s, void *v) diff --git a/arch/powerpc/kernel/eeh_dev.c b/arch/powerpc/kernel/eeh_dev.c index c4317c452d98..7370185c7a05 100644 --- a/arch/powerpc/kernel/eeh_dev.c +++ b/arch/powerpc/kernel/eeh_dev.c @@ -47,6 +47,8 @@ struct eeh_dev *eeh_dev_init(struct pci_dn *pdn) /* Associate EEH device with OF node */ pdn->edev = edev; edev->pdn = pdn; + edev->bdfn = (pdn->busno << 8) | pdn->devfn; + edev->controller = pdn->phb; return edev; } diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index 89623962c727..a1eaffe868de 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -1,25 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * PCI Error Recovery Driver for RPA-compliant PPC64 platform. * Copyright IBM Corp. 2004 2005 * Copyright Linas Vepstas <linas@linas.org> 2004, 2005 * - * All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or (at - * your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * * Send comments and feedback to Linas Vepstas <linas@austin.ibm.com> */ #include <linux/delay.h> @@ -27,6 +11,7 @@ #include <linux/irq.h> #include <linux/module.h> #include <linux/pci.h> +#include <linux/pci_hotplug.h> #include <asm/eeh.h> #include <asm/eeh_event.h> #include <asm/ppc-pci.h> @@ -81,23 +66,6 @@ static const char *pci_ers_result_name(enum pci_ers_result result) } }; -static __printf(2, 3) void eeh_edev_info(const struct eeh_dev *edev, - const char *fmt, ...) -{ - struct va_format vaf; - va_list args; - - va_start(args, fmt); - - vaf.fmt = fmt; - vaf.va = &args; - - printk(KERN_INFO "EEH: PE#%x (PCI %s): %pV\n", edev->pe_config_addr, - edev->pdev ? dev_name(&edev->pdev->dev) : "none", &vaf); - - va_end(args); -} - static enum pci_ers_result pci_ers_merge_result(enum pci_ers_result old, enum pci_ers_result new) { @@ -113,8 +81,16 @@ static bool eeh_dev_removed(struct eeh_dev *edev) static bool eeh_edev_actionable(struct eeh_dev *edev) { - return (edev->pdev && !eeh_dev_removed(edev) && - !eeh_pe_passed(edev->pe)); + if (!edev->pdev) + return false; + if (edev->pdev->error_state == pci_channel_io_perm_failure) + return false; + if (eeh_dev_removed(edev)) + return false; + if (eeh_pe_passed(edev->pe)) + return false; + + return true; } /** @@ -214,12 +190,12 @@ static void eeh_enable_irq(struct eeh_dev *edev) } } -static void *eeh_dev_save_state(struct eeh_dev *edev, void *userdata) +static void eeh_dev_save_state(struct eeh_dev *edev, void *userdata) { struct pci_dev *pdev; if (!edev) - return NULL; + return; /* * We cannot access the config space on some adapters. @@ -229,14 +205,13 @@ static void *eeh_dev_save_state(struct eeh_dev *edev, void *userdata) * device is created. 
*/ if (edev->pe && (edev->pe->state & EEH_PE_CFG_RESTRICTED)) - return NULL; + return; pdev = eeh_dev_to_pci_dev(edev); if (!pdev) - return NULL; + return; pci_save_state(pdev); - return NULL; } static void eeh_set_channel_state(struct eeh_pe *root, enum pci_channel_state s) @@ -274,20 +249,27 @@ static void eeh_set_irq_state(struct eeh_pe *root, bool enable) } typedef enum pci_ers_result (*eeh_report_fn)(struct eeh_dev *, + struct pci_dev *, struct pci_driver *); static void eeh_pe_report_edev(struct eeh_dev *edev, eeh_report_fn fn, enum pci_ers_result *result) { + struct pci_dev *pdev; struct pci_driver *driver; enum pci_ers_result new_result; - if (!edev->pdev) { + pci_lock_rescan_remove(); + pdev = edev->pdev; + if (pdev) + get_device(&pdev->dev); + pci_unlock_rescan_remove(); + if (!pdev) { eeh_edev_info(edev, "no device"); return; } - device_lock(&edev->pdev->dev); + device_lock(&pdev->dev); if (eeh_edev_actionable(edev)) { - driver = eeh_pcid_get(edev->pdev); + driver = eeh_pcid_get(pdev); if (!driver) eeh_edev_info(edev, "no driver"); @@ -296,7 +278,7 @@ static void eeh_pe_report_edev(struct eeh_dev *edev, eeh_report_fn fn, else if (edev->mode & EEH_DEV_NO_HANDLER) eeh_edev_info(edev, "driver bound too late"); else { - new_result = fn(edev, driver); + new_result = fn(edev, pdev, driver); eeh_edev_info(edev, "%s driver reports: '%s'", driver->name, pci_ers_result_name(new_result)); @@ -305,12 +287,15 @@ static void eeh_pe_report_edev(struct eeh_dev *edev, eeh_report_fn fn, new_result); } if (driver) - eeh_pcid_put(edev->pdev); + eeh_pcid_put(pdev); } else { - eeh_edev_info(edev, "not actionable (%d,%d,%d)", !!edev->pdev, + eeh_edev_info(edev, "not actionable (%d,%d,%d)", !!pdev, !eeh_dev_removed(edev), !eeh_pe_passed(edev->pe)); } - device_unlock(&edev->pdev->dev); + device_unlock(&pdev->dev); + if (edev->pdev != pdev) + eeh_edev_warn(edev, "Device changed during processing!\n"); + put_device(&pdev->dev); } static void eeh_pe_report(const char *name, struct 
eeh_pe *root, @@ -337,20 +322,20 @@ static void eeh_pe_report(const char *name, struct eeh_pe *root, * Report an EEH error to each device driver. */ static enum pci_ers_result eeh_report_error(struct eeh_dev *edev, + struct pci_dev *pdev, struct pci_driver *driver) { enum pci_ers_result rc; - struct pci_dev *dev = edev->pdev; if (!driver->err_handler->error_detected) return PCI_ERS_RESULT_NONE; eeh_edev_info(edev, "Invoking %s->error_detected(IO frozen)", driver->name); - rc = driver->err_handler->error_detected(dev, pci_channel_io_frozen); + rc = driver->err_handler->error_detected(pdev, pci_channel_io_frozen); edev->in_error = true; - pci_uevent_ers(dev, PCI_ERS_RESULT_NONE); + pci_uevent_ers(pdev, PCI_ERS_RESULT_NONE); return rc; } @@ -363,12 +348,13 @@ static enum pci_ers_result eeh_report_error(struct eeh_dev *edev, * are now enabled. */ static enum pci_ers_result eeh_report_mmio_enabled(struct eeh_dev *edev, + struct pci_dev *pdev, struct pci_driver *driver) { if (!driver->err_handler->mmio_enabled) return PCI_ERS_RESULT_NONE; eeh_edev_info(edev, "Invoking %s->mmio_enabled()", driver->name); - return driver->err_handler->mmio_enabled(edev->pdev); + return driver->err_handler->mmio_enabled(pdev); } /** @@ -382,20 +368,21 @@ static enum pci_ers_result eeh_report_mmio_enabled(struct eeh_dev *edev, * driver can work again while the device is recovered. 
*/ static enum pci_ers_result eeh_report_reset(struct eeh_dev *edev, + struct pci_dev *pdev, struct pci_driver *driver) { if (!driver->err_handler->slot_reset || !edev->in_error) return PCI_ERS_RESULT_NONE; eeh_edev_info(edev, "Invoking %s->slot_reset()", driver->name); - return driver->err_handler->slot_reset(edev->pdev); + return driver->err_handler->slot_reset(pdev); } -static void *eeh_dev_restore_state(struct eeh_dev *edev, void *userdata) +static void eeh_dev_restore_state(struct eeh_dev *edev, void *userdata) { struct pci_dev *pdev; if (!edev) - return NULL; + return; /* * The content in the config space isn't saved because @@ -407,15 +394,14 @@ static void *eeh_dev_restore_state(struct eeh_dev *edev, void *userdata) if (list_is_last(&edev->entry, &edev->pe->edevs)) eeh_pe_restore_bars(edev->pe); - return NULL; + return; } pdev = eeh_dev_to_pci_dev(edev); if (!pdev) - return NULL; + return; pci_restore_state(pdev); - return NULL; } /** @@ -428,13 +414,14 @@ static void *eeh_dev_restore_state(struct eeh_dev *edev, void *userdata) * to make the recovered device work again. */ static enum pci_ers_result eeh_report_resume(struct eeh_dev *edev, + struct pci_dev *pdev, struct pci_driver *driver) { if (!driver->err_handler->resume || !edev->in_error) return PCI_ERS_RESULT_NONE; eeh_edev_info(edev, "Invoking %s->resume()", driver->name); - driver->err_handler->resume(edev->pdev); + driver->err_handler->resume(pdev); pci_uevent_ers(edev->pdev, PCI_ERS_RESULT_RECOVERED); #ifdef CONFIG_PCI_IOV @@ -453,6 +440,7 @@ static enum pci_ers_result eeh_report_resume(struct eeh_dev *edev, * dead, and that no further recovery attempts will be made on it. 
*/ static enum pci_ers_result eeh_report_failure(struct eeh_dev *edev, + struct pci_dev *pdev, struct pci_driver *driver) { enum pci_ers_result rc; @@ -462,10 +450,10 @@ static enum pci_ers_result eeh_report_failure(struct eeh_dev *edev, eeh_edev_info(edev, "Invoking %s->error_detected(permanent failure)", driver->name); - rc = driver->err_handler->error_detected(edev->pdev, + rc = driver->err_handler->error_detected(pdev, pci_channel_io_perm_failure); - pci_uevent_ers(edev->pdev, PCI_ERS_RESULT_DISCONNECT); + pci_uevent_ers(pdev, PCI_ERS_RESULT_DISCONNECT); return rc; } @@ -473,12 +461,9 @@ static void *eeh_add_virt_device(struct eeh_dev *edev) { struct pci_driver *driver; struct pci_dev *dev = eeh_dev_to_pci_dev(edev); - struct pci_dn *pdn = eeh_dev_to_pdn(edev); if (!(edev->physfn)) { - pr_warn("%s: EEH dev %04x:%02x:%02x.%01x not for VF\n", - __func__, pdn->phb->global_number, pdn->busno, - PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn)); + eeh_edev_warn(edev, "Not for VF\n"); return NULL; } @@ -492,12 +477,12 @@ static void *eeh_add_virt_device(struct eeh_dev *edev) } #ifdef CONFIG_PCI_IOV - pci_iov_add_virtfn(edev->physfn, pdn->vf_index); + pci_iov_add_virtfn(edev->physfn, eeh_dev_to_pdn(edev)->vf_index); #endif return NULL; } -static void *eeh_rmv_device(struct eeh_dev *edev, void *userdata) +static void eeh_rmv_device(struct eeh_dev *edev, void *userdata) { struct pci_driver *driver; struct pci_dev *dev = eeh_dev_to_pci_dev(edev); @@ -512,7 +497,7 @@ static void *eeh_rmv_device(struct eeh_dev *edev, void *userdata) */ if (!eeh_edev_actionable(edev) || (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)) - return NULL; + return; if (rmv_data) { driver = eeh_pcid_get(dev); @@ -521,7 +506,7 @@ static void *eeh_rmv_device(struct eeh_dev *edev, void *userdata) driver->err_handler->error_detected && driver->err_handler->slot_reset) { eeh_pcid_put(dev); - return NULL; + return; } eeh_pcid_put(dev); } @@ -540,12 +525,6 @@ static void *eeh_rmv_device(struct eeh_dev *edev, void 
*userdata) pci_iov_remove_virtfn(edev->physfn, pdn->vf_index); edev->pdev = NULL; - - /* - * We have to set the VF PE number to invalid one, which is - * required to plug the VF successfully. - */ - pdn->pe_number = IODA_INVALID_PE; #endif if (rmv_data) list_add(&edev->rmv_entry, &rmv_data->removed_vf_list); @@ -554,8 +533,6 @@ static void *eeh_rmv_device(struct eeh_dev *edev, void *userdata) pci_stop_and_remove_bus_device(dev); pci_unlock_rescan_remove(); } - - return NULL; } static void *eeh_pe_detach_dev(struct eeh_pe *pe, void *userdata) @@ -744,6 +721,99 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus, */ #define MAX_WAIT_FOR_RECOVERY 300 + +/* Walks the PE tree after processing an event to remove any stale PEs. + * + * NB: This needs to be recursive to ensure the leaf PEs get removed + * before their parents do. Although this is possible to do recursively + * we don't since this is easier to read and we need to garantee + * the leaf nodes will be handled first. + */ +static void eeh_pe_cleanup(struct eeh_pe *pe) +{ + struct eeh_pe *child_pe, *tmp; + + list_for_each_entry_safe(child_pe, tmp, &pe->child_list, child) + eeh_pe_cleanup(child_pe); + + if (pe->state & EEH_PE_KEEP) + return; + + if (!(pe->state & EEH_PE_INVALID)) + return; + + if (list_empty(&pe->edevs) && list_empty(&pe->child_list)) { + list_del(&pe->child); + kfree(pe); + } +} + +/** + * eeh_check_slot_presence - Check if a device is still present in a slot + * @pdev: pci_dev to check + * + * This function may return a false positive if we can't determine the slot's + * presence state. This might happen for for PCIe slots if the PE containing + * the upstream bridge is also frozen, or the bridge is part of the same PE + * as the device. + * + * This shouldn't happen often, but you might see it if you hotplug a PCIe + * switch. 
+ */ +static bool eeh_slot_presence_check(struct pci_dev *pdev) +{ + const struct hotplug_slot_ops *ops; + struct pci_slot *slot; + u8 state; + int rc; + + if (!pdev) + return false; + + if (pdev->error_state == pci_channel_io_perm_failure) + return false; + + slot = pdev->slot; + if (!slot || !slot->hotplug) + return true; + + ops = slot->hotplug->ops; + if (!ops || !ops->get_adapter_status) + return true; + + /* set the attention indicator while we've got the slot ops */ + if (ops->set_attention_status) + ops->set_attention_status(slot->hotplug, 1); + + rc = ops->get_adapter_status(slot->hotplug, &state); + if (rc) + return true; + + return !!state; +} + +static void eeh_clear_slot_attention(struct pci_dev *pdev) +{ + const struct hotplug_slot_ops *ops; + struct pci_slot *slot; + + if (!pdev) + return; + + if (pdev->error_state == pci_channel_io_perm_failure) + return; + + slot = pdev->slot; + if (!slot || !slot->hotplug) + return; + + ops = slot->hotplug->ops; + if (!ops || !ops->set_attention_status) + return; + + ops->set_attention_status(slot->hotplug, 0); +} + /** * eeh_handle_normal_event - Handle EEH events on a specific PE * @pe: EEH PE - which should not be used after we return, as it may @@ -774,6 +844,7 @@ void eeh_handle_normal_event(struct eeh_pe *pe) enum pci_ers_result result = PCI_ERS_RESULT_NONE; struct eeh_rmv_data rmv_data = {LIST_HEAD_INIT(rmv_data.removed_vf_list), 0}; + int devices = 0; bus = eeh_pe_bus_get(pe); if (!bus) { @@ -782,7 +853,59 @@ void eeh_handle_normal_event(struct eeh_pe *pe) return; } - eeh_pe_state_mark(pe, EEH_PE_RECOVERING); + /* + * When devices are hot-removed we might get an EEH due to + * a driver attempting to touch the MMIO space of a removed + * device. In this case we don't have a device to recover + * so suppress the event if we can't find any present devices. + * + * The hotplug driver should take care of tearing down the + * device itself. 
+ */ + eeh_for_each_pe(pe, tmp_pe) + eeh_pe_for_each_dev(tmp_pe, edev, tmp) + if (eeh_slot_presence_check(edev->pdev)) + devices++; + + if (!devices) { + pr_debug("EEH: Frozen PHB#%x-PE#%x is empty!\n", + pe->phb->global_number, pe->addr); + goto out; /* nothing to recover */ + } + + /* Log the event */ + if (pe->type & EEH_PE_PHB) { + pr_err("EEH: Recovering PHB#%x, location: %s\n", + pe->phb->global_number, eeh_pe_loc_get(pe)); + } else { + struct eeh_pe *phb_pe = eeh_phb_pe_get(pe->phb); + + pr_err("EEH: Recovering PHB#%x-PE#%x\n", + pe->phb->global_number, pe->addr); + pr_err("EEH: PE location: %s, PHB location: %s\n", + eeh_pe_loc_get(pe), eeh_pe_loc_get(phb_pe)); + } + +#ifdef CONFIG_STACKTRACE + /* + * Print the saved stack trace now that we've verified there's + * something to recover. + */ + if (pe->trace_entries) { + void **ptrs = (void **) pe->stack_trace; + int i; + + pr_err("EEH: Frozen PHB#%x-PE#%x detected\n", + pe->phb->global_number, pe->addr); + + /* FIXME: Use the same format as dump_stack() */ + pr_err("EEH: Call Trace:\n"); + for (i = 0; i < pe->trace_entries; i++) + pr_err("EEH: [%pK] %pS\n", ptrs[i], ptrs[i]); + + pe->trace_entries = 0; + } +#endif /* CONFIG_STACKTRACE */ eeh_pe_update_time_stamp(pe); pe->freeze_count++; @@ -793,6 +916,10 @@ void eeh_handle_normal_event(struct eeh_pe *pe) result = PCI_ERS_RESULT_DISCONNECT; } + eeh_for_each_pe(pe, tmp_pe) + eeh_pe_for_each_dev(tmp_pe, edev, tmp) + edev->mode &= ~EEH_DEV_NO_HANDLER; + /* Walk the various device drivers attached to this slot through * a reset sequence, giving each an opportunity to do what it needs * to accomplish the reset. Each child gets a report of the @@ -969,6 +1096,19 @@ void eeh_handle_normal_event(struct eeh_pe *pe) return; } } + +out: + /* + * Clean up any PEs without devices. While marked as EEH_PE_RECOVERYING + * we don't want to modify the PE tree structure so we do it here. 
+ */ + eeh_pe_cleanup(pe); + + /* clear the slot attention LED for all recovered devices */ + eeh_for_each_pe(pe, tmp_pe) + eeh_pe_for_each_dev(tmp_pe, edev, tmp) + eeh_clear_slot_attention(edev->pdev); + eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true); } @@ -981,7 +1121,8 @@ void eeh_handle_normal_event(struct eeh_pe *pe) */ void eeh_handle_special_event(void) { - struct eeh_pe *pe, *phb_pe; + struct eeh_pe *pe, *phb_pe, *tmp_pe; + struct eeh_dev *edev, *tmp_edev; struct pci_bus *bus; struct pci_controller *hose; unsigned long flags; @@ -1040,6 +1181,7 @@ void eeh_handle_special_event(void) */ if (rc == EEH_NEXT_ERR_FROZEN_PE || rc == EEH_NEXT_ERR_FENCED_PHB) { + eeh_pe_state_mark(pe, EEH_PE_RECOVERING); eeh_handle_normal_event(pe); } else { pci_lock_rescan_remove(); @@ -1050,6 +1192,10 @@ void eeh_handle_special_event(void) (phb_pe->state & EEH_PE_RECOVERING)) continue; + eeh_for_each_pe(pe, tmp_pe) + eeh_pe_for_each_dev(tmp_pe, edev, tmp_edev) + edev->mode &= ~EEH_DEV_NO_HANDLER; + /* Notify all devices to be down */ eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true); eeh_set_channel_state(pe, pci_channel_io_perm_failure); diff --git a/arch/powerpc/kernel/eeh_event.c b/arch/powerpc/kernel/eeh_event.c index 64cfbe41174b..a7a8dc182efb 100644 --- a/arch/powerpc/kernel/eeh_event.c +++ b/arch/powerpc/kernel/eeh_event.c @@ -40,7 +40,6 @@ static int eeh_event_handler(void * dummy) { unsigned long flags; struct eeh_event *event; - struct eeh_pe *pe; while (!kthread_should_stop()) { if (wait_for_completion_interruptible(&eeh_eventlist_event)) @@ -59,19 +58,10 @@ static int eeh_event_handler(void * dummy) continue; /* We might have event without binding PE */ - pe = event->pe; - if (pe) { - if (pe->type & EEH_PE_PHB) - pr_info("EEH: Detected error on PHB#%x\n", - pe->phb->global_number); - else - pr_info("EEH: Detected PCI bus error on " - "PHB#%x-PE#%x\n", - pe->phb->global_number, pe->addr); - eeh_handle_normal_event(pe); - } else { + if (event->pe) + 
eeh_handle_normal_event(event->pe); + else eeh_handle_special_event(); - } kfree(event); } @@ -121,6 +111,24 @@ int __eeh_send_failure_event(struct eeh_pe *pe) } event->pe = pe; + /* + * Mark the PE as recovering before inserting it in the queue. + * This prevents the PE from being free()ed by a hotplug driver + * while the PE is sitting in the event queue. + */ + if (pe) { +#ifdef CONFIG_STACKTRACE + /* + * Save the current stack trace so we can dump it from the + * event handler thread. + */ + pe->trace_entries = stack_trace_save(pe->stack_trace, + ARRAY_SIZE(pe->stack_trace), 0); +#endif /* CONFIG_STACKTRACE */ + + eeh_pe_state_mark(pe, EEH_PE_RECOVERING); + } + /* We may or may not be called in an interrupt context */ spin_lock_irqsave(&eeh_eventlist_lock, flags); list_add(&event->list, &eeh_eventlist); diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c index 854cef7b18f4..177852e39a25 100644 --- a/arch/powerpc/kernel/eeh_pe.c +++ b/arch/powerpc/kernel/eeh_pe.c @@ -231,29 +231,22 @@ void *eeh_pe_traverse(struct eeh_pe *root, * The function is used to traverse the devices of the specified * PE and its child PEs. 
*/ -void *eeh_pe_dev_traverse(struct eeh_pe *root, +void eeh_pe_dev_traverse(struct eeh_pe *root, eeh_edev_traverse_func fn, void *flag) { struct eeh_pe *pe; struct eeh_dev *edev, *tmp; - void *ret; if (!root) { pr_warn("%s: Invalid PE %p\n", __func__, root); - return NULL; + return; } /* Traverse root PE */ - eeh_for_each_pe(root, pe) { - eeh_pe_for_each_dev(pe, edev, tmp) { - ret = fn(edev, flag); - if (ret) - return ret; - } - } - - return NULL; + eeh_for_each_pe(root, pe) + eeh_pe_for_each_dev(pe, edev, tmp) + fn(edev, flag); } /** @@ -379,8 +372,7 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev) /* Check if the PE number is valid */ if (!eeh_has_flag(EEH_VALID_PE_ZERO) && !edev->pe_config_addr) { - pr_err("%s: Invalid PE#0 for edev 0x%x on PHB#%x\n", - __func__, config_addr, pdn->phb->global_number); + eeh_edev_err(edev, "PE#0 is invalid for this PHB!\n"); return -EINVAL; } @@ -391,42 +383,34 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev) * components. */ pe = eeh_pe_get(pdn->phb, edev->pe_config_addr, config_addr); - if (pe && !(pe->type & EEH_PE_INVALID)) { - /* Mark the PE as type of PCI bus */ - pe->type = EEH_PE_BUS; - edev->pe = pe; - - /* Put the edev to PE */ - list_add_tail(&edev->entry, &pe->edevs); - pr_debug("EEH: Add %04x:%02x:%02x.%01x to Bus PE#%x\n", - pdn->phb->global_number, - pdn->busno, - PCI_SLOT(pdn->devfn), - PCI_FUNC(pdn->devfn), - pe->addr); - return 0; - } else if (pe && (pe->type & EEH_PE_INVALID)) { - list_add_tail(&edev->entry, &pe->edevs); - edev->pe = pe; - /* - * We're running to here because of PCI hotplug caused by - * EEH recovery. We need clear EEH_PE_INVALID until the top. - */ - parent = pe; - while (parent) { - if (!(parent->type & EEH_PE_INVALID)) - break; - parent->type &= ~EEH_PE_INVALID; - parent = parent->parent; - } + if (pe) { + if (pe->type & EEH_PE_INVALID) { + list_add_tail(&edev->entry, &pe->edevs); + edev->pe = pe; + /* + * We're running to here because of PCI hotplug caused by + * EEH recovery. 
We need clear EEH_PE_INVALID until the top. + */ + parent = pe; + while (parent) { + if (!(parent->type & EEH_PE_INVALID)) + break; + parent->type &= ~EEH_PE_INVALID; + parent = parent->parent; + } + + eeh_edev_dbg(edev, + "Added to device PE (parent: PE#%x)\n", + pe->parent->addr); + } else { + /* Mark the PE as type of PCI bus */ + pe->type = EEH_PE_BUS; + edev->pe = pe; - pr_debug("EEH: Add %04x:%02x:%02x.%01x to Device " - "PE#%x, Parent PE#%x\n", - pdn->phb->global_number, - pdn->busno, - PCI_SLOT(pdn->devfn), - PCI_FUNC(pdn->devfn), - pe->addr, pe->parent->addr); + /* Put the edev to PE */ + list_add_tail(&edev->entry, &pe->edevs); + eeh_edev_dbg(edev, "Added to bus PE\n"); + } return 0; } @@ -468,13 +452,8 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev) list_add_tail(&pe->child, &parent->child_list); list_add_tail(&edev->entry, &pe->edevs); edev->pe = pe; - pr_debug("EEH: Add %04x:%02x:%02x.%01x to " - "Device PE#%x, Parent PE#%x\n", - pdn->phb->global_number, - pdn->busno, - PCI_SLOT(pdn->devfn), - PCI_FUNC(pdn->devfn), - pe->addr, pe->parent->addr); + eeh_edev_dbg(edev, "Added to device PE (parent: PE#%x)\n", + pe->parent->addr); return 0; } @@ -491,16 +470,12 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev) int eeh_rmv_from_parent_pe(struct eeh_dev *edev) { struct eeh_pe *pe, *parent, *child; + bool keep, recover; int cnt; - struct pci_dn *pdn = eeh_dev_to_pdn(edev); pe = eeh_dev_to_pe(edev); if (!pe) { - pr_debug("%s: No PE found for device %04x:%02x:%02x.%01x\n", - __func__, pdn->phb->global_number, - pdn->busno, - PCI_SLOT(pdn->devfn), - PCI_FUNC(pdn->devfn)); + eeh_edev_dbg(edev, "No PE found for device.\n"); return -EEXIST; } @@ -516,10 +491,21 @@ int eeh_rmv_from_parent_pe(struct eeh_dev *edev) */ while (1) { parent = pe->parent; + + /* PHB PEs should never be removed */ if (pe->type & EEH_PE_PHB) break; - if (!(pe->state & EEH_PE_KEEP)) { + /* + * XXX: KEEP is set while resetting a PE. 
I don't think it's + * ever set without RECOVERING also being set. I could + * be wrong though so catch that with a WARN. + */ + keep = !!(pe->state & EEH_PE_KEEP); + recover = !!(pe->state & EEH_PE_RECOVERING); + WARN_ON(keep && !recover); + + if (!keep && !recover) { if (list_empty(&pe->edevs) && list_empty(&pe->child_list)) { list_del(&pe->child); @@ -528,6 +514,15 @@ int eeh_rmv_from_parent_pe(struct eeh_dev *edev) break; } } else { + /* + * Mark the PE as invalid. At the end of the recovery + * process any invalid PEs will be garbage collected. + * + * We need to delay the free()ing of them since we can + * remove edev's while traversing the PE tree which + * might trigger the removal of a PE and we can't + * deal with that (yet). + */ if (list_empty(&pe->edevs)) { cnt = 0; list_for_each_entry(child, &pe->child_list, child) { @@ -623,13 +618,11 @@ void eeh_pe_mark_isolated(struct eeh_pe *root) } EXPORT_SYMBOL_GPL(eeh_pe_mark_isolated); -static void *__eeh_pe_dev_mode_mark(struct eeh_dev *edev, void *flag) +static void __eeh_pe_dev_mode_mark(struct eeh_dev *edev, void *flag) { int mode = *((int *)flag); edev->mode |= mode; - - return NULL; } /** @@ -717,17 +710,13 @@ static void eeh_bridge_check_link(struct eeh_dev *edev) if (!(edev->mode & (EEH_DEV_ROOT_PORT | EEH_DEV_DS_PORT))) return; - pr_debug("%s: Check PCIe link for %04x:%02x:%02x.%01x ...\n", - __func__, pdn->phb->global_number, - pdn->busno, - PCI_SLOT(pdn->devfn), - PCI_FUNC(pdn->devfn)); + eeh_edev_dbg(edev, "Checking PCIe link...\n"); /* Check slot status */ cap = edev->pcie_cap; eeh_ops->read_config(pdn, cap + PCI_EXP_SLTSTA, 2, &val); if (!(val & PCI_EXP_SLTSTA_PDS)) { - pr_debug(" No card in the slot (0x%04x) !\n", val); + eeh_edev_dbg(edev, "No card in the slot (0x%04x) !\n", val); return; } @@ -736,7 +725,7 @@ static void eeh_bridge_check_link(struct eeh_dev *edev) if (val & PCI_EXP_SLTCAP_PCP) { eeh_ops->read_config(pdn, cap + PCI_EXP_SLTCTL, 2, &val); if (val & PCI_EXP_SLTCTL_PCC) { - 
pr_debug(" In power-off state, power it on ...\n"); + eeh_edev_dbg(edev, "In power-off state, power it on ...\n"); val &= ~(PCI_EXP_SLTCTL_PCC | PCI_EXP_SLTCTL_PIC); val |= (0x0100 & PCI_EXP_SLTCTL_PIC); eeh_ops->write_config(pdn, cap + PCI_EXP_SLTCTL, 2, val); @@ -752,7 +741,7 @@ static void eeh_bridge_check_link(struct eeh_dev *edev) /* Check link */ eeh_ops->read_config(pdn, cap + PCI_EXP_LNKCAP, 4, &val); if (!(val & PCI_EXP_LNKCAP_DLLLARC)) { - pr_debug(" No link reporting capability (0x%08x) \n", val); + eeh_edev_dbg(edev, "No link reporting capability (0x%08x) \n", val); msleep(1000); return; } @@ -769,10 +758,10 @@ static void eeh_bridge_check_link(struct eeh_dev *edev) } if (val & PCI_EXP_LNKSTA_DLLLA) - pr_debug(" Link up (%s)\n", + eeh_edev_dbg(edev, "Link up (%s)\n", (val & PCI_EXP_LNKSTA_CLS_2_5GB) ? "2.5GB" : "5GB"); else - pr_debug(" Link not ready (0x%04x)\n", val); + eeh_edev_dbg(edev, "Link not ready (0x%04x)\n", val); } #define BYTE_SWAP(OFF) (8*((OFF)/4)+3-(OFF)) @@ -852,7 +841,7 @@ static void eeh_restore_device_bars(struct eeh_dev *edev) * the expansion ROM base address, the latency timer, and etc. * from the saved values in the device node. */ -static void *eeh_restore_one_device_bars(struct eeh_dev *edev, void *flag) +static void eeh_restore_one_device_bars(struct eeh_dev *edev, void *flag) { struct pci_dn *pdn = eeh_dev_to_pdn(edev); @@ -864,8 +853,6 @@ static void *eeh_restore_one_device_bars(struct eeh_dev *edev, void *flag) if (eeh_ops->restore_config && pdn) eeh_ops->restore_config(pdn); - - return NULL; } /** diff --git a/arch/powerpc/kernel/eeh_sysfs.c b/arch/powerpc/kernel/eeh_sysfs.c index 3fa04dda1737..4fb0f1e1017a 100644 --- a/arch/powerpc/kernel/eeh_sysfs.c +++ b/arch/powerpc/kernel/eeh_sysfs.c @@ -1,25 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Sysfs entries for PCI Error Recovery for PAPR-compliant platform. 
* Copyright IBM Corporation 2007 * Copyright Linas Vepstas <linas@austin.ibm.com> 2007 * - * All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or (at - * your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * * Send comments and feedback to Linas Vepstas <linas@austin.ibm.com> */ #include <linux/pci.h> @@ -30,7 +14,7 @@ /** * EEH_SHOW_ATTR -- Create sysfs entry for eeh statistic * @_name: name of file in sysfs directory - * @_memb: name of member in struct pci_dn to access + * @_memb: name of member in struct eeh_dev to access * @_format: printf format for display * * All of the attributes look very similar, so just @@ -91,7 +75,7 @@ static ssize_t eeh_pe_state_store(struct device *dev, static DEVICE_ATTR_RW(eeh_pe_state); -#ifdef CONFIG_PCI_IOV +#if defined(CONFIG_PCI_IOV) && defined(CONFIG_PPC_PSERIES) static ssize_t eeh_notify_resume_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -102,7 +86,6 @@ static ssize_t eeh_notify_resume_show(struct device *dev, if (!edev || !edev->pe) return -ENODEV; - pdn = pci_get_pdn(pdev); return sprintf(buf, "%d\n", pdn->last_allow_rc); } @@ -148,7 +131,7 @@ static void eeh_notify_resume_remove(struct pci_dev *pdev) #else static inline int eeh_notify_resume_add(struct pci_dev *pdev) { return 0; } static inline void eeh_notify_resume_remove(struct pci_dev *pdev) { 
} -#endif /* CONFIG_PCI_IOV */ +#endif /* CONFIG_PCI_IOV && CONFIG_PPC_PSERIES */ void eeh_sysfs_add_device(struct pci_dev *pdev) { @@ -176,22 +159,23 @@ void eeh_sysfs_remove_device(struct pci_dev *pdev) { struct eeh_dev *edev = pci_dev_to_eeh_dev(pdev); + if (!edev) { + WARN_ON(eeh_enabled()); + return; + } + + edev->mode &= ~EEH_DEV_SYSFS; + /* * The parent directory might have been removed. We needn't * continue for that case. */ - if (!pdev->dev.kobj.sd) { - if (edev) - edev->mode &= ~EEH_DEV_SYSFS; + if (!pdev->dev.kobj.sd) return; - } device_remove_file(&pdev->dev, &dev_attr_eeh_mode); device_remove_file(&pdev->dev, &dev_attr_eeh_pe_config_addr); device_remove_file(&pdev->dev, &dev_attr_eeh_pe_state); eeh_notify_resume_remove(pdev); - - if (edev) - edev->mode &= ~EEH_DEV_SYSFS; } diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 54fab22c9a43..0713daa651d9 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -140,6 +140,7 @@ transfer_to_handler: stw r12,_CTR(r11) stw r2,_XER(r11) mfspr r12,SPRN_SPRG_THREAD + tovirt_vmstack r12, r12 beq 2f /* if from user, fix up THREAD.regs */ addi r2, r12, -THREAD addi r11,r1,STACK_FRAME_OVERHEAD @@ -179,11 +180,13 @@ transfer_to_handler: 2: /* if from kernel, check interrupted DOZE/NAP mode and * check for stack overflow */ - kuap_save_and_lock r11, r12, r9, r2, r0 + kuap_save_and_lock r11, r12, r9, r2, r6 addi r2, r12, -THREAD +#ifndef CONFIG_VMAP_STACK lwz r9,KSP_LIMIT(r12) cmplw r1,r9 /* if r1 <= ksp_limit */ ble- stack_ovf /* then the kernel stack overflowed */ +#endif 5: #if defined(CONFIG_PPC_BOOK3S_32) || defined(CONFIG_E500) lwz r12,TI_LOCAL_FLAGS(r2) @@ -195,7 +198,8 @@ transfer_to_handler: transfer_to_handler_cont: 3: mflr r9 - tovirt(r2, r2) /* set r2 to current */ + tovirt_novmstack r2, r2 /* set r2 to current */ + tovirt_vmstack r9, r9 lwz r11,0(r9) /* virtual address of handler */ lwz r9,4(r9) /* where to go when done */ #if defined(CONFIG_PPC_8xx)
&& defined(CONFIG_PERF_EVENTS) @@ -210,7 +214,7 @@ transfer_to_handler_cont: * To speed up the syscall path where interrupts stay on, let's check * first if we are changing the MSR value at all. */ - tophys(r12, r1) + tophys_novmstack r12, r1 lwz r12,_MSR(r12) andi. r12,r12,MSR_EE bne 1f @@ -230,7 +234,7 @@ transfer_to_handler_cont: */ lis r12,reenable_mmu@h ori r12,r12,reenable_mmu@l - LOAD_MSR_KERNEL(r0, MSR_KERNEL) + LOAD_REG_IMMEDIATE(r0, MSR_KERNEL) mtspr SPRN_SRR0,r12 mtspr SPRN_SRR1,r0 SYNC @@ -284,9 +288,11 @@ reenable_mmu: rlwinm r9,r9,0,~MSR_EE lwz r12,_LINK(r11) /* and return to address in LR */ kuap_restore r11, r2, r3, r4, r5 + lwz r2, GPR2(r11) b fast_exception_return #endif +#ifndef CONFIG_VMAP_STACK /* * On kernel stack overflow, load up an initial stack pointer * and call StackOverflow(regs), which should not return. @@ -304,7 +310,7 @@ stack_ovf: addi r1,r1,THREAD_SIZE-STACK_FRAME_OVERHEAD lis r9,StackOverflow@ha addi r9,r9,StackOverflow@l - LOAD_MSR_KERNEL(r10,MSR_KERNEL) + LOAD_REG_IMMEDIATE(r10,MSR_KERNEL) #if defined(CONFIG_PPC_8xx) && defined(CONFIG_PERF_EVENTS) mtspr SPRN_NRI, r0 #endif @@ -312,6 +318,7 @@ stack_ovf: mtspr SPRN_SRR1,r10 SYNC RFI +#endif #ifdef CONFIG_TRACE_IRQFLAGS trace_syscall_entry_irq_off: @@ -324,7 +331,7 @@ trace_syscall_entry_irq_off: bl trace_hardirqs_on /* Now enable for real */ - LOAD_MSR_KERNEL(r10, MSR_KERNEL | MSR_EE) + LOAD_REG_IMMEDIATE(r10, MSR_KERNEL | MSR_EE) mtmsr r10 REST_GPR(0, r1) @@ -394,10 +401,10 @@ ret_from_syscall: #endif mr r6,r3 /* disable interrupts so current_thread_info()->flags can't change */ - LOAD_MSR_KERNEL(r10,MSR_KERNEL) /* doesn't include MSR_EE */ + LOAD_REG_IMMEDIATE(r10,MSR_KERNEL) /* doesn't include MSR_EE */ /* Note: We don't bother telling lockdep about it */ SYNC - MTMSRD(r10) + mtmsr r10 lwz r9,TI_FLAGS(r2) li r8,-MAX_ERRNO andi. 
r0,r9,(_TIF_SYSCALL_DOTRACE|_TIF_SINGLESTEP|_TIF_USER_WORK_MASK|_TIF_PERSYSCALL_MASK) @@ -554,7 +561,7 @@ syscall_exit_work: */ ori r10,r10,MSR_EE SYNC - MTMSRD(r10) + mtmsr r10 /* Save NVGPRS if they're not saved already */ lwz r4,_TRAP(r1) @@ -621,7 +628,6 @@ ppc_swapcontext: */ .globl handle_page_fault handle_page_fault: - stw r4,_DAR(r1) addi r3,r1,STACK_FRAME_OVERHEAD #ifdef CONFIG_PPC_BOOK3S_32 andis. r0,r5,DSISR_DABRMATCH@h @@ -697,7 +703,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_SPE) and. r0,r0,r11 /* FP or altivec or SPE enabled? */ beq+ 1f andc r11,r11,r0 - MTMSRD(r11) + mtmsr r11 isync 1: stw r11,_MSR(r1) mfcr r10 @@ -777,11 +783,19 @@ fast_exception_return: 1: lis r3,exc_exit_restart_end@ha addi r3,r3,exc_exit_restart_end@l cmplw r12,r3 +#if CONFIG_PPC_BOOK3S_601 + bge 2b +#else bge 3f +#endif lis r4,exc_exit_restart@ha addi r4,r4,exc_exit_restart@l cmplw r12,r4 +#if CONFIG_PPC_BOOK3S_601 + blt 2b +#else blt 3f +#endif lis r3,fee_restarts@ha tophys(r3,r3) lwz r5,fee_restarts@l(r3) @@ -800,9 +814,6 @@ fee_restarts: /* aargh, we don't know which trap this is */ /* but the 601 doesn't implement the RI bit, so assume it's OK */ 3: -BEGIN_FTR_SECTION - b 2b -END_FTR_SECTION_IFSET(CPU_FTR_601) li r10,-1 stw r10,_TRAP(r11) addi r3,r1,STACK_FRAME_OVERHEAD @@ -824,9 +835,9 @@ ret_from_except: * can't change between when we test it and when we return * from the interrupt. */ /* Note: We don't bother telling lockdep about it */ - LOAD_MSR_KERNEL(r10,MSR_KERNEL) + LOAD_REG_IMMEDIATE(r10,MSR_KERNEL) SYNC /* Some chip revs have problems here... */ - MTMSRD(r10) /* disable interrupts */ + mtmsr r10 /* disable interrupts */ lwz r3,_MSR(r1) /* Returning to user mode? */ andi. 
r0,r3,MSR_PR @@ -892,7 +903,7 @@ resume_kernel: bne- 0b 1: -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION /* check current_thread_info->preempt_count */ lwz r0,TI_PREEMPT(r2) cmpwi 0,r0,0 /* if non-zero, just restore regs and return */ @@ -916,7 +927,7 @@ resume_kernel: */ bl trace_hardirqs_on #endif -#endif /* CONFIG_PREEMPT */ +#endif /* CONFIG_PREEMPTION */ restore_kuap: kuap_restore r1, r2, r9, r10, r0 @@ -991,9 +1002,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_NEED_PAIRED_STWCX) * can restart the exception exit path at the label * exc_exit_restart below. -- paulus */ - LOAD_MSR_KERNEL(r10,MSR_KERNEL & ~MSR_RI) + LOAD_REG_IMMEDIATE(r10,MSR_KERNEL & ~MSR_RI) SYNC - MTMSRD(r10) /* clear the RI bit */ + mtmsr r10 /* clear the RI bit */ .globl exc_exit_restart exc_exit_restart: lwz r12,_NIP(r1) @@ -1066,7 +1077,7 @@ exc_exit_restart_end: REST_NVGPRS(r1); \ lwz r3,_MSR(r1); \ andi. r3,r3,MSR_PR; \ - LOAD_MSR_KERNEL(r10,MSR_KERNEL); \ + LOAD_REG_IMMEDIATE(r10,MSR_KERNEL); \ bne user_exc_return; \ lwz r0,GPR0(r1); \ lwz r2,GPR2(r1); \ @@ -1229,16 +1240,16 @@ do_resched: /* r10 contains MSR_KERNEL here */ #endif ori r10,r10,MSR_EE SYNC - MTMSRD(r10) /* hard-enable interrupts */ + mtmsr r10 /* hard-enable interrupts */ bl schedule recheck: /* Note: And we don't tell it we are disabling them again * neither. Those disable/enable cycles used to peek at * TI_FLAGS aren't advertised. */ - LOAD_MSR_KERNEL(r10,MSR_KERNEL) + LOAD_REG_IMMEDIATE(r10,MSR_KERNEL) SYNC - MTMSRD(r10) /* disable interrupts */ + mtmsr r10 /* disable interrupts */ lwz r9,TI_FLAGS(r2) andi. r0,r9,_TIF_NEED_RESCHED bne- do_resched @@ -1247,7 +1258,7 @@ recheck: do_user_signal: /* r10 contains MSR_KERNEL here */ ori r10,r10,MSR_EE SYNC - MTMSRD(r10) /* hard-enable interrupts */ + mtmsr r10 /* hard-enable interrupts */ /* save r13-r31 in the exception frame, if not already done */ lwz r3,_TRAP(r1) andi. 
r0,r3,1 @@ -1270,11 +1281,19 @@ nonrecoverable: lis r10,exc_exit_restart_end@ha addi r10,r10,exc_exit_restart_end@l cmplw r12,r10 +#ifdef CONFIG_PPC_BOOK3S_601 + bgelr +#else bge 3f +#endif lis r11,exc_exit_restart@ha addi r11,r11,exc_exit_restart@l cmplw r12,r11 +#ifdef CONFIG_PPC_BOOK3S_601 + bltlr +#else blt 3f +#endif lis r10,ee_restarts@ha lwz r12,ee_restarts@l(r10) addi r12,r12,1 @@ -1283,9 +1302,6 @@ nonrecoverable: blr 3: /* OK, we can't recover, kill this process */ /* but the 601 doesn't implement the RI bit, so assume it's OK */ -BEGIN_FTR_SECTION - blr -END_FTR_SECTION_IFSET(CPU_FTR_601) lwz r3,_TRAP(r1) andi. r0,r3,1 beq 5f @@ -1324,14 +1340,14 @@ _GLOBAL(enter_rtas) lis r6,1f@ha /* physical return address for rtas */ addi r6,r6,1f@l tophys(r6,r6) - tophys(r7,r1) + tophys_novmstack r7, r1 lwz r8,RTASENTRY(r4) lwz r4,RTASBASE(r4) mfmsr r9 stw r9,8(r1) - LOAD_MSR_KERNEL(r0,MSR_KERNEL) + LOAD_REG_IMMEDIATE(r0,MSR_KERNEL) SYNC /* disable interrupts so SRR0/1 */ - MTMSRD(r0) /* don't get trashed */ + mtmsr r0 /* don't get trashed */ li r9,MSR_KERNEL & ~(MSR_IR|MSR_DR) mtlr r6 stw r7, THREAD + RTAS_SP(r2) diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 0a0b5310f54a..6ba675b0cf7d 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -69,24 +69,20 @@ BEGIN_FTR_SECTION bne .Ltabort_syscall END_FTR_SECTION_IFSET(CPU_FTR_TM) #endif - andi. 
r10,r12,MSR_PR mr r10,r1 - addi r1,r1,-INT_FRAME_SIZE - beq- 1f ld r1,PACAKSAVE(r13) -1: std r10,0(r1) + std r10,0(r1) std r11,_NIP(r1) std r12,_MSR(r1) std r0,GPR0(r1) std r10,GPR1(r1) - beq 2f /* if from kernel mode */ #ifdef CONFIG_PPC_FSL_BOOK3E START_BTB_FLUSH_SECTION BTB_FLUSH(r10) END_BTB_FLUSH_SECTION #endif ACCOUNT_CPU_USER_ENTRY(r13, r10, r11) -2: std r2,GPR2(r1) + std r2,GPR2(r1) std r3,GPR3(r1) mfcr r2 std r4,GPR4(r1) @@ -122,14 +118,13 @@ END_BTB_FLUSH_SECTION #if defined(CONFIG_VIRT_CPU_ACCOUNTING_NATIVE) && defined(CONFIG_PPC_SPLPAR) BEGIN_FW_FTR_SECTION - beq 33f - /* if from user, see if there are any DTL entries to process */ + /* see if there are any DTL entries to process */ ld r10,PACALPPACAPTR(r13) /* get ptr to VPA */ ld r11,PACA_DTL_RIDX(r13) /* get log read index */ addi r10,r10,LPPACA_DTLIDX LDX_BE r10,0,r10 /* get log write index */ - cmpd cr1,r11,r10 - beq+ cr1,33f + cmpd r11,r10 + beq+ 33f bl accumulate_stolen_time REST_GPR(0,r1) REST_4GPRS(3,r1) @@ -203,6 +198,7 @@ system_call: /* label this so stack traces look sane */ mtctr r12 bctrl /* Call handler */ + /* syscall_exit can exit to kernel mode, via ret_from_kernel_thread */ .Lsyscall_exit: std r3,RESULT(r1) @@ -216,11 +212,6 @@ system_call: /* label this so stack traces look sane */ ld r12, PACA_THREAD_INFO(r13) ld r8,_MSR(r1) -#ifdef CONFIG_PPC_BOOK3S - /* No MSR:RI on BookE */ - andi. 
r10,r8,MSR_RI - beq- .Lunrecov_restore -#endif /* * This is a few instructions into the actual syscall exit path (which actually @@ -546,6 +537,7 @@ flush_count_cache: /* Save LR into r9 */ mflr r9 + // Flush the link stack .rept 64 bl .+4 .endr @@ -555,6 +547,11 @@ flush_count_cache: .balign 32 /* Restore LR */ 1: mtlr r9 + + // If we're just flushing the link stack, return here +3: nop + patch_site 3b patch__flush_link_stack_return + li r9,0x7fff mtctr r9 @@ -600,8 +597,7 @@ _GLOBAL(_switch) std r0,16(r1) stdu r1,-SWITCH_FRAME_SIZE(r1) /* r3-r13 are caller saved -- Cort */ - SAVE_8GPRS(14, r1) - SAVE_10GPRS(22, r1) + SAVE_NVGPRS(r1) std r0,_NIP(r1) /* Return to switch caller */ mfcr r23 std r23,_CCR(r1) @@ -725,8 +721,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) mtcrf 0xFF,r6 /* r3-r13 are destroyed -- Cort */ - REST_8GPRS(14, r1) - REST_10GPRS(22, r1) + REST_NVGPRS(r1) /* convert old thread to its task_struct for return value */ addi r3,r3,-THREAD @@ -849,7 +844,7 @@ resume_kernel: bne- 0b 1: -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION /* Check if we need to preempt */ andi. 
r0,r4,_TIF_NEED_RESCHED beq+ restore @@ -880,7 +875,7 @@ resume_kernel: li r10,MSR_RI mtmsrd r10,1 /* Update machine state */ #endif /* CONFIG_PPC_BOOK3E */ -#endif /* CONFIG_PREEMPT */ +#endif /* CONFIG_PREEMPTION */ .globl fast_exc_return_irq fast_exc_return_irq: @@ -1158,8 +1153,7 @@ _GLOBAL(enter_rtas) */ SAVE_GPR(2, r1) /* Save the TOC */ SAVE_GPR(13, r1) /* Save paca */ - SAVE_8GPRS(14, r1) /* Save the non-volatiles */ - SAVE_10GPRS(22, r1) /* ditto */ + SAVE_NVGPRS(r1) /* Save the non-volatiles */ mfcr r4 std r4,_CCR(r1) @@ -1266,8 +1260,7 @@ rtas_restore_regs: /* relocation is on at this point */ REST_GPR(2, r1) /* Restore the TOC */ REST_GPR(13, r1) /* Restore paca */ - REST_8GPRS(14, r1) /* Restore the non-volatiles */ - REST_10GPRS(22, r1) /* ditto */ + REST_NVGPRS(r1) /* Restore the non-volatiles */ GET_PACA(r13) @@ -1301,8 +1294,7 @@ _GLOBAL(enter_prom) */ SAVE_GPR(2, r1) SAVE_GPR(13, r1) - SAVE_8GPRS(14, r1) - SAVE_10GPRS(22, r1) + SAVE_NVGPRS(r1) mfcr r10 mfmsr r11 std r10,_CCR(r1) @@ -1346,8 +1338,7 @@ _GLOBAL(enter_prom) /* Restore other registers */ REST_GPR(2, r1) REST_GPR(13, r1) - REST_8GPRS(14, r1) - REST_10GPRS(22, r1) + REST_NVGPRS(r1) ld r4,_CCR(r1) mtcr r4 diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S index 1cfb3da4a84a..e4076e3c072d 100644 --- a/arch/powerpc/kernel/exceptions-64e.S +++ b/arch/powerpc/kernel/exceptions-64e.S @@ -750,12 +750,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) ld r15,PACATOC(r13) ld r14,interrupt_base_book3e@got(r15) ld r15,__end_interrupts@got(r15) -#else - LOAD_REG_IMMEDIATE(r14,interrupt_base_book3e) - LOAD_REG_IMMEDIATE(r15,__end_interrupts) -#endif cmpld cr0,r10,r14 cmpld cr1,r10,r15 +#else + LOAD_REG_IMMEDIATE_SYM(r14, r15, interrupt_base_book3e) + cmpld cr0, r10, r14 + LOAD_REG_IMMEDIATE_SYM(r14, r15, __end_interrupts) + cmpld cr1, r10, r14 +#endif blt+ cr0,1f bge+ cr1,1f @@ -820,12 +822,14 @@ kernel_dbg_exc: ld r15,PACATOC(r13) ld r14,interrupt_base_book3e@got(r15) 
ld r15,__end_interrupts@got(r15) -#else - LOAD_REG_IMMEDIATE(r14,interrupt_base_book3e) - LOAD_REG_IMMEDIATE(r15,__end_interrupts) -#endif cmpld cr0,r10,r14 cmpld cr1,r10,r15 +#else + LOAD_REG_IMMEDIATE_SYM(r14, r15, interrupt_base_book3e) + cmpld cr0, r10, r14 + LOAD_REG_IMMEDIATE_SYM(r14, r15,__end_interrupts) + cmpld cr1, r10, r14 +#endif blt+ cr0,1f bge+ cr1,1f @@ -1342,16 +1346,6 @@ skpinv: addi r6,r6,1 /* Increment */ sync isync -/* - * The mapping only needs to be cache-coherent on SMP, except on - * Freescale e500mc derivatives where it's also needed for coherent DMA. - */ -#if defined(CONFIG_SMP) || defined(CONFIG_PPC_E500MC) -#define M_IF_NEEDED MAS2_M -#else -#define M_IF_NEEDED 0 -#endif - /* 6. Setup KERNELBASE mapping in TLB[0] * * r3 = MAS0 w/TLBSEL & ESEL for the entry we started in @@ -1364,7 +1358,7 @@ skpinv: addi r6,r6,1 /* Increment */ ori r6,r6,(MAS1_TSIZE(BOOK3E_PAGESZ_1GB))@l mtspr SPRN_MAS1,r6 - LOAD_REG_IMMEDIATE(r6, PAGE_OFFSET | M_IF_NEEDED) + LOAD_REG_IMMEDIATE(r6, PAGE_OFFSET | MAS2_M_IF_NEEDED) mtspr SPRN_MAS2,r6 rlwinm r5,r5,0,0,25 @@ -1449,7 +1443,7 @@ a2_tlbinit_code_start: a2_tlbinit_after_linear_map: /* Now we branch the new virtual address mapped by this entry */ - LOAD_REG_IMMEDIATE(r3,1f) + LOAD_REG_IMMEDIATE_SYM(r3, r5, 1f) mtctr r3 bctr diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 6ba3cc2ef8ab..ffc15f4f079d 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -44,6 +44,58 @@ #endif /* + * Following are fixed section helper macros. + * + * EXC_REAL_BEGIN/END - real, unrelocated exception vectors + * EXC_VIRT_BEGIN/END - virt (AIL), unrelocated exception vectors + * TRAMP_REAL_BEGIN - real, unrelocated helpers (virt may call these) + * TRAMP_VIRT_BEGIN - virt, unreloc helpers (in practice, real can use) + * TRAMP_KVM_BEGIN - KVM handlers, these are put into real, unrelocated + * EXC_COMMON - After switching to virtual, relocated mode. 
+ */ + +#define EXC_REAL_BEGIN(name, start, size) \ + FIXED_SECTION_ENTRY_BEGIN_LOCATION(real_vectors, exc_real_##start##_##name, start, size) + +#define EXC_REAL_END(name, start, size) \ + FIXED_SECTION_ENTRY_END_LOCATION(real_vectors, exc_real_##start##_##name, start, size) + +#define EXC_VIRT_BEGIN(name, start, size) \ + FIXED_SECTION_ENTRY_BEGIN_LOCATION(virt_vectors, exc_virt_##start##_##name, start, size) + +#define EXC_VIRT_END(name, start, size) \ + FIXED_SECTION_ENTRY_END_LOCATION(virt_vectors, exc_virt_##start##_##name, start, size) + +#define EXC_COMMON_BEGIN(name) \ + USE_TEXT_SECTION(); \ + .balign IFETCH_ALIGN_BYTES; \ + .global name; \ + _ASM_NOKPROBE_SYMBOL(name); \ + DEFINE_FIXED_SYMBOL(name); \ +name: + +#define TRAMP_REAL_BEGIN(name) \ + FIXED_SECTION_ENTRY_BEGIN(real_trampolines, name) + +#define TRAMP_VIRT_BEGIN(name) \ + FIXED_SECTION_ENTRY_BEGIN(virt_trampolines, name) + +#ifdef CONFIG_KVM_BOOK3S_64_HANDLER +#define TRAMP_KVM_BEGIN(name) \ + TRAMP_VIRT_BEGIN(name) +#else +#define TRAMP_KVM_BEGIN(name) +#endif + +#define EXC_REAL_NONE(start, size) \ + FIXED_SECTION_ENTRY_BEGIN_LOCATION(real_vectors, exc_real_##start##_##unused, start, size); \ + FIXED_SECTION_ENTRY_END_LOCATION(real_vectors, exc_real_##start##_##unused, start, size) + +#define EXC_VIRT_NONE(start, size) \ + FIXED_SECTION_ENTRY_BEGIN_LOCATION(virt_vectors, exc_virt_##start##_##unused, start, size); \ + FIXED_SECTION_ENTRY_END_LOCATION(virt_vectors, exc_virt_##start##_##unused, start, size) + +/* * We're short on space and time in the exception prolog, so we can't * use the normal LOAD_REG_IMMEDIATE macro to load the address of label. 
* Instead we get the base of the kernel from paca->kernelbase and or in the low @@ -68,6 +120,7 @@ addis reg,reg,(ABS_ADDR(label))@h /* Exception register prefixes */ +#define EXC_HV_OR_STD 2 /* depends on HVMODE */ #define EXC_HV 1 #define EXC_STD 0 @@ -127,126 +180,6 @@ BEGIN_FTR_SECTION_NESTED(943) \ std ra,offset(r13); \ END_FTR_SECTION_NESTED(ftr,ftr,943) -.macro EXCEPTION_PROLOG_0 area - SET_SCRATCH0(r13) /* save r13 */ - GET_PACA(r13) - std r9,\area\()+EX_R9(r13) /* save r9 */ - OPT_GET_SPR(r9, SPRN_PPR, CPU_FTR_HAS_PPR) - HMT_MEDIUM - std r10,\area\()+EX_R10(r13) /* save r10 - r12 */ - OPT_GET_SPR(r10, SPRN_CFAR, CPU_FTR_CFAR) -.endm - -.macro EXCEPTION_PROLOG_1 hsrr, area, kvm, vec, dar, dsisr, bitmask - OPT_SAVE_REG_TO_PACA(\area\()+EX_PPR, r9, CPU_FTR_HAS_PPR) - OPT_SAVE_REG_TO_PACA(\area\()+EX_CFAR, r10, CPU_FTR_CFAR) - INTERRUPT_TO_KERNEL - SAVE_CTR(r10, \area\()) - mfcr r9 - .if \kvm - KVMTEST \hsrr \vec - .endif - .if \bitmask - lbz r10,PACAIRQSOFTMASK(r13) - andi. r10,r10,\bitmask - /* Associate vector numbers with bits in paca->irq_happened */ - .if \vec == 0x500 || \vec == 0xea0 - li r10,PACA_IRQ_EE - .elseif \vec == 0x900 - li r10,PACA_IRQ_DEC - .elseif \vec == 0xa00 || \vec == 0xe80 - li r10,PACA_IRQ_DBELL - .elseif \vec == 0xe60 - li r10,PACA_IRQ_HMI - .elseif \vec == 0xf00 - li r10,PACA_IRQ_PMI - .else - .abort "Bad maskable vector" - .endif - - .if \hsrr - bne masked_Hinterrupt - .else - bne masked_interrupt - .endif - .endif - - std r11,\area\()+EX_R11(r13) - std r12,\area\()+EX_R12(r13) - - /* - * DAR/DSISR, SCRATCH0 must be read before setting MSR[RI], - * because a d-side MCE will clobber those registers so is - * not recoverable if they are live. 
- */ - GET_SCRATCH0(r10) - std r10,\area\()+EX_R13(r13) - .if \dar - mfspr r10,SPRN_DAR - std r10,\area\()+EX_DAR(r13) - .endif - .if \dsisr - mfspr r10,SPRN_DSISR - stw r10,\area\()+EX_DSISR(r13) - .endif -.endm - -.macro EXCEPTION_PROLOG_2_REAL label, hsrr, set_ri - ld r10,PACAKMSR(r13) /* get MSR value for kernel */ - .if ! \set_ri - xori r10,r10,MSR_RI /* Clear MSR_RI */ - .endif - .if \hsrr - mfspr r11,SPRN_HSRR0 /* save HSRR0 */ - mfspr r12,SPRN_HSRR1 /* and HSRR1 */ - mtspr SPRN_HSRR1,r10 - .else - mfspr r11,SPRN_SRR0 /* save SRR0 */ - mfspr r12,SPRN_SRR1 /* and SRR1 */ - mtspr SPRN_SRR1,r10 - .endif - LOAD_HANDLER(r10, \label\()) - .if \hsrr - mtspr SPRN_HSRR0,r10 - HRFI_TO_KERNEL - .else - mtspr SPRN_SRR0,r10 - RFI_TO_KERNEL - .endif - b . /* prevent speculative execution */ -.endm - -.macro EXCEPTION_PROLOG_2_VIRT label, hsrr -#ifdef CONFIG_RELOCATABLE - .if \hsrr - mfspr r11,SPRN_HSRR0 /* save HSRR0 */ - .else - mfspr r11,SPRN_SRR0 /* save SRR0 */ - .endif - LOAD_HANDLER(r12, \label\()) - mtctr r12 - .if \hsrr - mfspr r12,SPRN_HSRR1 /* and HSRR1 */ - .else - mfspr r12,SPRN_SRR1 /* and HSRR1 */ - .endif - li r10,MSR_RI - mtmsrd r10,1 /* Set RI (EE=0) */ - bctr -#else - .if \hsrr - mfspr r11,SPRN_HSRR0 /* save HSRR0 */ - mfspr r12,SPRN_HSRR1 /* and HSRR1 */ - .else - mfspr r11,SPRN_SRR0 /* save SRR0 */ - mfspr r12,SPRN_SRR1 /* and SRR1 */ - .endif - li r10,MSR_RI - mtmsrd r10,1 /* Set RI (EE=0) */ - b \label -#endif -.endm - /* * Branch to label using its 0xC000 address. 
This results in instruction * address suitable for MSR[IR]=0 or 1, which allows relocation to be turned @@ -260,6 +193,11 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) mtctr reg; \ bctr +.macro INT_KVM_HANDLER name, vec, hsrr, area, skip + TRAMP_KVM_BEGIN(\name\()_kvm) + KVM_HANDLER \vec, \hsrr, \area, \skip +.endm + #ifdef CONFIG_KVM_BOOK3S_64_HANDLER #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE /* @@ -272,17 +210,13 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) #define kvmppc_interrupt kvmppc_interrupt_pr #endif -.macro KVMTEST hsrr, n +.macro KVMTEST name, hsrr, n lbz r10,HSTATE_IN_GUEST(r13) cmpwi r10,0 - .if \hsrr - bne do_kvm_H\n - .else - bne do_kvm_\n - .endif + bne \name\()_kvm .endm -.macro KVM_HANDLER area, hsrr, n, skip +.macro KVM_HANDLER vec, hsrr, area, skip .if \skip cmpwi r10,KVM_GUEST_MODE_SKIP beq 89f @@ -301,10 +235,16 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948) std r12,HSTATE_SCRATCH0(r13) sldi r12,r9,32 /* HSRR variants have the 0x2 bit added to their trap number */ - .if \hsrr - ori r12,r12,(\n + 0x2) + .if \hsrr == EXC_HV_OR_STD + BEGIN_FTR_SECTION + ori r12,r12,(\vec + 0x2) + FTR_SECTION_ELSE + ori r12,r12,(\vec) + ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) + .elseif \hsrr + ori r12,r12,(\vec + 0x2) .else - ori r12,r12,(\n) + ori r12,r12,(\vec) .endif #ifdef CONFIG_RELOCATABLE @@ -329,7 +269,13 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948) 89: mtocrf 0x80,r9 ld r9,\area+EX_R9(r13) ld r10,\area+EX_R10(r13) - .if \hsrr + .if \hsrr == EXC_HV_OR_STD + BEGIN_FTR_SECTION + b kvmppc_skip_Hinterrupt + FTR_SECTION_ELSE + b kvmppc_skip_interrupt + ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) + .elseif \hsrr b kvmppc_skip_Hinterrupt .else b kvmppc_skip_interrupt @@ -338,88 +284,328 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948) .endm #else -.macro KVMTEST hsrr, n +.macro KVMTEST name, hsrr, n .endm -.macro KVM_HANDLER area, hsrr, n, skip +.macro KVM_HANDLER name, vec, hsrr, area, skip 
.endm #endif -#define EXCEPTION_PROLOG_COMMON_1() \ - std r9,_CCR(r1); /* save CR in stackframe */ \ - std r11,_NIP(r1); /* save SRR0 in stackframe */ \ - std r12,_MSR(r1); /* save SRR1 in stackframe */ \ - std r10,0(r1); /* make stack chain pointer */ \ - std r0,GPR0(r1); /* save r0 in stackframe */ \ - std r10,GPR1(r1); /* save r1 in stackframe */ \ - -/* Save original regs values from save area to stack frame. */ -#define EXCEPTION_PROLOG_COMMON_2(area) \ - ld r9,area+EX_R9(r13); /* move r9, r10 to stackframe */ \ - ld r10,area+EX_R10(r13); \ - std r9,GPR9(r1); \ - std r10,GPR10(r1); \ - ld r9,area+EX_R11(r13); /* move r11 - r13 to stackframe */ \ - ld r10,area+EX_R12(r13); \ - ld r11,area+EX_R13(r13); \ - std r9,GPR11(r1); \ - std r10,GPR12(r1); \ - std r11,GPR13(r1); \ -BEGIN_FTR_SECTION_NESTED(66); \ - ld r10,area+EX_CFAR(r13); \ - std r10,ORIG_GPR3(r1); \ -END_FTR_SECTION_NESTED(CPU_FTR_CFAR, CPU_FTR_CFAR, 66); \ - GET_CTR(r10, area); \ - std r10,_CTR(r1); - -#define EXCEPTION_PROLOG_COMMON_3(trap) \ - std r2,GPR2(r1); /* save r2 in stackframe */ \ - SAVE_4GPRS(3, r1); /* save r3 - r6 in stackframe */ \ - SAVE_2GPRS(7, r1); /* save r7, r8 in stackframe */ \ - mflr r9; /* Get LR, later save to stack */ \ - ld r2,PACATOC(r13); /* get kernel TOC into r2 */ \ - std r9,_LINK(r1); \ - lbz r10,PACAIRQSOFTMASK(r13); \ - mfspr r11,SPRN_XER; /* save XER in stackframe */ \ - std r10,SOFTE(r1); \ - std r11,_XER(r1); \ - li r9,(trap)+1; \ - std r9,_TRAP(r1); /* set trap number */ \ - li r10,0; \ - ld r11,exception_marker@toc(r2); \ - std r10,RESULT(r1); /* clear regs->result */ \ - std r11,STACK_FRAME_OVERHEAD-16(r1); /* mark the frame */ +.macro INT_SAVE_SRR_AND_JUMP label, hsrr, set_ri + ld r10,PACAKMSR(r13) /* get MSR value for kernel */ + .if ! 
\set_ri + xori r10,r10,MSR_RI /* Clear MSR_RI */ + .endif + .if \hsrr == EXC_HV_OR_STD + BEGIN_FTR_SECTION + mfspr r11,SPRN_HSRR0 /* save HSRR0 */ + mfspr r12,SPRN_HSRR1 /* and HSRR1 */ + mtspr SPRN_HSRR1,r10 + FTR_SECTION_ELSE + mfspr r11,SPRN_SRR0 /* save SRR0 */ + mfspr r12,SPRN_SRR1 /* and SRR1 */ + mtspr SPRN_SRR1,r10 + ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) + .elseif \hsrr + mfspr r11,SPRN_HSRR0 /* save HSRR0 */ + mfspr r12,SPRN_HSRR1 /* and HSRR1 */ + mtspr SPRN_HSRR1,r10 + .else + mfspr r11,SPRN_SRR0 /* save SRR0 */ + mfspr r12,SPRN_SRR1 /* and SRR1 */ + mtspr SPRN_SRR1,r10 + .endif + LOAD_HANDLER(r10, \label\()) + .if \hsrr == EXC_HV_OR_STD + BEGIN_FTR_SECTION + mtspr SPRN_HSRR0,r10 + HRFI_TO_KERNEL + FTR_SECTION_ELSE + mtspr SPRN_SRR0,r10 + RFI_TO_KERNEL + ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) + .elseif \hsrr + mtspr SPRN_HSRR0,r10 + HRFI_TO_KERNEL + .else + mtspr SPRN_SRR0,r10 + RFI_TO_KERNEL + .endif + b . /* prevent speculative execution */ +.endm + +/* INT_SAVE_SRR_AND_JUMP works for real or virt, this is faster but virt only */ +.macro INT_VIRT_SAVE_SRR_AND_JUMP label, hsrr +#ifdef CONFIG_RELOCATABLE + .if \hsrr == EXC_HV_OR_STD + BEGIN_FTR_SECTION + mfspr r11,SPRN_HSRR0 /* save HSRR0 */ + FTR_SECTION_ELSE + mfspr r11,SPRN_SRR0 /* save SRR0 */ + ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) + .elseif \hsrr + mfspr r11,SPRN_HSRR0 /* save HSRR0 */ + .else + mfspr r11,SPRN_SRR0 /* save SRR0 */ + .endif + LOAD_HANDLER(r12, \label\()) + mtctr r12 + .if \hsrr == EXC_HV_OR_STD + BEGIN_FTR_SECTION + mfspr r12,SPRN_HSRR1 /* and HSRR1 */ + FTR_SECTION_ELSE + mfspr r12,SPRN_SRR1 /* and SRR1 */ + ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) + .elseif \hsrr + mfspr r12,SPRN_HSRR1 /* and HSRR1 */ + .else + mfspr r12,SPRN_SRR1 /* and SRR1 */ + .endif + li r10,MSR_RI + mtmsrd r10,1 /* Set RI (EE=0) */ + bctr +#else + .if \hsrr == EXC_HV_OR_STD + BEGIN_FTR_SECTION + mfspr r11,SPRN_HSRR0 /*
save HSRR0 */ + mfspr r12,SPRN_HSRR1 /* and HSRR1 */ + FTR_SECTION_ELSE + mfspr r11,SPRN_SRR0 /* save SRR0 */ + mfspr r12,SPRN_SRR1 /* and SRR1 */ + ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) + .elseif \hsrr + mfspr r11,SPRN_HSRR0 /* save HSRR0 */ + mfspr r12,SPRN_HSRR1 /* and HSRR1 */ + .else + mfspr r11,SPRN_SRR0 /* save SRR0 */ + mfspr r12,SPRN_SRR1 /* and SRR1 */ + .endif + li r10,MSR_RI + mtmsrd r10,1 /* Set RI (EE=0) */ + b \label +#endif +.endm + +/* + * This is the BOOK3S interrupt entry code macro. + * + * This can result in one of several things happening: + * - Branch to the _common handler, relocated, in virtual mode. + * These are normal interrupts (synchronous and asynchronous) handled by + * the kernel. + * - Branch to KVM, relocated but real mode interrupts remain in real mode. + * These occur when HSTATE_IN_GUEST is set. The interrupt may be caused by + * / intended for host or guest kernel, but KVM must always be involved + * because the machine state is set for guest execution. + * - Branch to the masked handler, unrelocated. + * These occur when maskable asynchronous interrupts are taken with the + * irq_soft_mask set. + * - Branch to an "early" handler in real mode but relocated. + * This is done if early=1. MCE and HMI use these to handle errors in real + * mode. + * - Fall through and continue executing in real, unrelocated mode. + * This is done if early=2. 
+ */ +.macro INT_HANDLER name, vec, ool=0, early=0, virt=0, hsrr=0, area=PACA_EXGEN, ri=1, dar=0, dsisr=0, bitmask=0, kvm=0 + SET_SCRATCH0(r13) /* save r13 */ + GET_PACA(r13) + std r9,\area\()+EX_R9(r13) /* save r9 */ + OPT_GET_SPR(r9, SPRN_PPR, CPU_FTR_HAS_PPR) + HMT_MEDIUM + std r10,\area\()+EX_R10(r13) /* save r10 - r12 */ + OPT_GET_SPR(r10, SPRN_CFAR, CPU_FTR_CFAR) + .if \ool + .if !\virt + b tramp_real_\name + .pushsection .text + TRAMP_REAL_BEGIN(tramp_real_\name) + .else + b tramp_virt_\name + .pushsection .text + TRAMP_VIRT_BEGIN(tramp_virt_\name) + .endif + .endif + + OPT_SAVE_REG_TO_PACA(\area\()+EX_PPR, r9, CPU_FTR_HAS_PPR) + OPT_SAVE_REG_TO_PACA(\area\()+EX_CFAR, r10, CPU_FTR_CFAR) + INTERRUPT_TO_KERNEL + SAVE_CTR(r10, \area\()) + mfcr r9 + .if \kvm + KVMTEST \name \hsrr \vec + .endif + .if \bitmask + lbz r10,PACAIRQSOFTMASK(r13) + andi. r10,r10,\bitmask + /* Associate vector numbers with bits in paca->irq_happened */ + .if \vec == 0x500 || \vec == 0xea0 + li r10,PACA_IRQ_EE + .elseif \vec == 0x900 + li r10,PACA_IRQ_DEC + .elseif \vec == 0xa00 || \vec == 0xe80 + li r10,PACA_IRQ_DBELL + .elseif \vec == 0xe60 + li r10,PACA_IRQ_HMI + .elseif \vec == 0xf00 + li r10,PACA_IRQ_PMI + .else + .abort "Bad maskable vector" + .endif + + .if \hsrr == EXC_HV_OR_STD + BEGIN_FTR_SECTION + bne masked_Hinterrupt + FTR_SECTION_ELSE + bne masked_interrupt + ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) + .elseif \hsrr + bne masked_Hinterrupt + .else + bne masked_interrupt + .endif + .endif + + std r11,\area\()+EX_R11(r13) + std r12,\area\()+EX_R12(r13) + + /* + * DAR/DSISR, SCRATCH0 must be read before setting MSR[RI], + * because a d-side MCE will clobber those registers so is + * not recoverable if they are live. 
+ */ + GET_SCRATCH0(r10) + std r10,\area\()+EX_R13(r13) + .if \dar + .if \hsrr + mfspr r10,SPRN_HDAR + .else + mfspr r10,SPRN_DAR + .endif + std r10,\area\()+EX_DAR(r13) + .endif + .if \dsisr + .if \hsrr + mfspr r10,SPRN_HDSISR + .else + mfspr r10,SPRN_DSISR + .endif + stw r10,\area\()+EX_DSISR(r13) + .endif + + .if \early == 2 + /* nothing more */ + .elseif \early + mfctr r10 /* save ctr, even for !RELOCATABLE */ + BRANCH_TO_C000(r11, \name\()_early_common) + .elseif !\virt + INT_SAVE_SRR_AND_JUMP \name\()_common, \hsrr, \ri + .else + INT_VIRT_SAVE_SRR_AND_JUMP \name\()_common, \hsrr + .endif + .if \ool + .popsection + .endif +.endm /* * On entry r13 points to the paca, r9-r13 are saved in the paca, * r9 contains the saved CR, r11 and r12 contain the saved SRR0 and * SRR1, and relocation is on. + * + * If stack=0, then the stack is already set in r1, and r1 is saved in r10. + * PPR save and CPU accounting is not done for the !stack case (XXX why not?) */ -#define EXCEPTION_COMMON(area, trap) \ - andi. r10,r12,MSR_PR; /* See if coming from user */ \ - mr r10,r1; /* Save r1 */ \ - subi r1,r1,INT_FRAME_SIZE; /* alloc frame on kernel stack */ \ - beq- 1f; \ - ld r1,PACAKSAVE(r13); /* kernel stack to use */ \ -1: tdgei r1,-INT_FRAME_SIZE; /* trap if r1 is in userspace */ \ - EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,0; \ -3: EXCEPTION_PROLOG_COMMON_1(); \ - kuap_save_amr_and_lock r9, r10, cr1, cr0; \ - beq 4f; /* if from kernel mode */ \ - ACCOUNT_CPU_USER_ENTRY(r13, r9, r10); \ - SAVE_PPR(area, r9); \ -4: EXCEPTION_PROLOG_COMMON_2(area); \ - EXCEPTION_PROLOG_COMMON_3(trap); \ +.macro INT_COMMON vec, area, stack, kuap, reconcile, dar, dsisr + .if \stack + andi. 
r10,r12,MSR_PR /* See if coming from user */ + mr r10,r1 /* Save r1 */ + subi r1,r1,INT_FRAME_SIZE /* alloc frame on kernel stack */ + beq- 100f + ld r1,PACAKSAVE(r13) /* kernel stack to use */ +100: tdgei r1,-INT_FRAME_SIZE /* trap if r1 is in userspace */ + EMIT_BUG_ENTRY 100b,__FILE__,__LINE__,0 + .endif + + std r9,_CCR(r1) /* save CR in stackframe */ + std r11,_NIP(r1) /* save SRR0 in stackframe */ + std r12,_MSR(r1) /* save SRR1 in stackframe */ + std r10,0(r1) /* make stack chain pointer */ + std r0,GPR0(r1) /* save r0 in stackframe */ + std r10,GPR1(r1) /* save r1 in stackframe */ + + .if \stack + .if \kuap + kuap_save_amr_and_lock r9, r10, cr1, cr0 + .endif + beq 101f /* if from kernel mode */ + ACCOUNT_CPU_USER_ENTRY(r13, r9, r10) + SAVE_PPR(\area, r9) +101: + .else + .if \kuap + kuap_save_amr_and_lock r9, r10, cr1 + .endif + .endif + + /* Save original regs values from save area to stack frame. */ + ld r9,\area+EX_R9(r13) /* move r9, r10 to stackframe */ + ld r10,\area+EX_R10(r13) + std r9,GPR9(r1) + std r10,GPR10(r1) + ld r9,\area+EX_R11(r13) /* move r11 - r13 to stackframe */ + ld r10,\area+EX_R12(r13) + ld r11,\area+EX_R13(r13) + std r9,GPR11(r1) + std r10,GPR12(r1) + std r11,GPR13(r1) + .if \dar + .if \dar == 2 + ld r10,_NIP(r1) + .else + ld r10,\area+EX_DAR(r13) + .endif + std r10,_DAR(r1) + .endif + .if \dsisr + .if \dsisr == 2 + ld r10,_MSR(r1) + lis r11,DSISR_SRR1_MATCH_64S@h + and r10,r10,r11 + .else + lwz r10,\area+EX_DSISR(r13) + .endif + std r10,_DSISR(r1) + .endif +BEGIN_FTR_SECTION_NESTED(66) + ld r10,\area+EX_CFAR(r13) + std r10,ORIG_GPR3(r1) +END_FTR_SECTION_NESTED(CPU_FTR_CFAR, CPU_FTR_CFAR, 66) + GET_CTR(r10, \area) + std r10,_CTR(r1) + std r2,GPR2(r1) /* save r2 in stackframe */ + SAVE_4GPRS(3, r1) /* save r3 - r6 in stackframe */ + SAVE_2GPRS(7, r1) /* save r7, r8 in stackframe */ + mflr r9 /* Get LR, later save to stack */ + ld r2,PACATOC(r13) /* get kernel TOC into r2 */ + std r9,_LINK(r1) + lbz r10,PACAIRQSOFTMASK(r13) + mfspr 
r11,SPRN_XER /* save XER in stackframe */ + std r10,SOFTE(r1) + std r11,_XER(r1) + li r9,(\vec)+1 + std r9,_TRAP(r1) /* set trap number */ + li r10,0 + ld r11,exception_marker@toc(r2) + std r10,RESULT(r1) /* clear regs->result */ + std r11,STACK_FRAME_OVERHEAD-16(r1) /* mark the frame */ + + .if \stack ACCOUNT_STOLEN_TIME + .endif -/* - * Exception where stack is already set in r1, r1 is saved in r10. - * PPR save and CPU accounting is not done (for some reason). - */ -#define EXCEPTION_COMMON_STACK(area, trap) \ - EXCEPTION_PROLOG_COMMON_1(); \ - kuap_save_amr_and_lock r9, r10, cr1; \ - EXCEPTION_PROLOG_COMMON_2(area); \ - EXCEPTION_PROLOG_COMMON_3(trap) + .if \reconcile + RECONCILE_IRQ_STATE(r10, r11) + .endif +.endm /* * Restore all registers including H/SRR0/1 saved in a stack frame of a @@ -428,6 +614,9 @@ END_FTR_SECTION_NESTED(CPU_FTR_CFAR, CPU_FTR_CFAR, 66); \ .macro EXCEPTION_RESTORE_REGS hsrr /* Move original SRR0 and SRR1 into the respective regs */ ld r9,_MSR(r1) + .if \hsrr == EXC_HV_OR_STD + .error "EXC_HV_OR_STD Not implemented for EXCEPTION_RESTORE_REGS" + .endif .if \hsrr mtspr SPRN_HSRR1,r9 .else @@ -481,219 +670,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP) #define FINISH_NAP #endif -/* - * Following are the BOOK3S exception handler helper macros. - * Handlers come in a number of types, and each type has a number of varieties. - * - * EXC_REAL_* - real, unrelocated exception vectors - * EXC_VIRT_* - virt (AIL), unrelocated exception vectors - * TRAMP_REAL_* - real, unrelocated helpers (virt can call these) - * TRAMP_VIRT_* - virt, unreloc helpers (in practice, real can use) - * TRAMP_KVM - KVM handlers that get put into real, unrelocated - * EXC_COMMON - virt, relocated common handlers - * - * The EXC handlers are given a name, and branch to name_common, or the - * appropriate KVM or masking function. 
Vector handler verieties are as - * follows: - * - * EXC_{REAL|VIRT}_BEGIN/END - used to open-code the exception - * - * EXC_{REAL|VIRT} - standard exception - * - * EXC_{REAL|VIRT}_suffix - * where _suffix is: - * - _MASKABLE - maskable exception - * - _OOL - out of line with trampoline to common handler - * - _HV - HV exception - * - * There can be combinations, e.g., EXC_VIRT_OOL_MASKABLE_HV - * - * KVM handlers come in the following verieties: - * TRAMP_KVM - * TRAMP_KVM_SKIP - * TRAMP_KVM_HV - * TRAMP_KVM_HV_SKIP - * - * COMMON handlers come in the following verieties: - * EXC_COMMON_BEGIN/END - used to open-code the handler - * EXC_COMMON - * EXC_COMMON_ASYNC - * - * TRAMP_REAL and TRAMP_VIRT can be used with BEGIN/END. KVM - * and OOL handlers are implemented as types of TRAMP and TRAMP_VIRT handlers. - */ - -#define __EXC_REAL(name, start, size, area) \ - EXC_REAL_BEGIN(name, start, size); \ - EXCEPTION_PROLOG_0 area ; \ - EXCEPTION_PROLOG_1 EXC_STD, area, 1, start, 0, 0, 0 ; \ - EXCEPTION_PROLOG_2_REAL name##_common, EXC_STD, 1 ; \ - EXC_REAL_END(name, start, size) - -#define EXC_REAL(name, start, size) \ - __EXC_REAL(name, start, size, PACA_EXGEN) - -#define __EXC_VIRT(name, start, size, realvec, area) \ - EXC_VIRT_BEGIN(name, start, size); \ - EXCEPTION_PROLOG_0 area ; \ - EXCEPTION_PROLOG_1 EXC_STD, area, 0, realvec, 0, 0, 0; \ - EXCEPTION_PROLOG_2_VIRT name##_common, EXC_STD ; \ - EXC_VIRT_END(name, start, size) - -#define EXC_VIRT(name, start, size, realvec) \ - __EXC_VIRT(name, start, size, realvec, PACA_EXGEN) - -#define EXC_REAL_MASKABLE(name, start, size, bitmask) \ - EXC_REAL_BEGIN(name, start, size); \ - EXCEPTION_PROLOG_0 PACA_EXGEN ; \ - EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 1, start, 0, 0, bitmask ; \ - EXCEPTION_PROLOG_2_REAL name##_common, EXC_STD, 1 ; \ - EXC_REAL_END(name, start, size) - -#define EXC_VIRT_MASKABLE(name, start, size, realvec, bitmask) \ - EXC_VIRT_BEGIN(name, start, size); \ - EXCEPTION_PROLOG_0 PACA_EXGEN ; \ - 
EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 0, realvec, 0, 0, bitmask ; \ - EXCEPTION_PROLOG_2_VIRT name##_common, EXC_STD ; \ - EXC_VIRT_END(name, start, size) - -#define EXC_REAL_HV(name, start, size) \ - EXC_REAL_BEGIN(name, start, size); \ - EXCEPTION_PROLOG_0 PACA_EXGEN; \ - EXCEPTION_PROLOG_1 EXC_HV, PACA_EXGEN, 1, start, 0, 0, 0 ; \ - EXCEPTION_PROLOG_2_REAL name##_common, EXC_HV, 1 ; \ - EXC_REAL_END(name, start, size) - -#define EXC_VIRT_HV(name, start, size, realvec) \ - EXC_VIRT_BEGIN(name, start, size); \ - EXCEPTION_PROLOG_0 PACA_EXGEN; \ - EXCEPTION_PROLOG_1 EXC_HV, PACA_EXGEN, 1, realvec, 0, 0, 0 ; \ - EXCEPTION_PROLOG_2_VIRT name##_common, EXC_HV ; \ - EXC_VIRT_END(name, start, size) - -#define __EXC_REAL_OOL(name, start, size) \ - EXC_REAL_BEGIN(name, start, size); \ - EXCEPTION_PROLOG_0 PACA_EXGEN ; \ - b tramp_real_##name ; \ - EXC_REAL_END(name, start, size) - -#define __TRAMP_REAL_OOL(name, vec) \ - TRAMP_REAL_BEGIN(tramp_real_##name); \ - EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 1, vec, 0, 0, 0 ; \ - EXCEPTION_PROLOG_2_REAL name##_common, EXC_STD, 1 - -#define EXC_REAL_OOL(name, start, size) \ - __EXC_REAL_OOL(name, start, size); \ - __TRAMP_REAL_OOL(name, start) - -#define __EXC_REAL_OOL_MASKABLE(name, start, size) \ - __EXC_REAL_OOL(name, start, size) - -#define __TRAMP_REAL_OOL_MASKABLE(name, vec, bitmask) \ - TRAMP_REAL_BEGIN(tramp_real_##name); \ - EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 1, vec, 0, 0, bitmask ; \ - EXCEPTION_PROLOG_2_REAL name##_common, EXC_STD, 1 - -#define EXC_REAL_OOL_MASKABLE(name, start, size, bitmask) \ - __EXC_REAL_OOL_MASKABLE(name, start, size); \ - __TRAMP_REAL_OOL_MASKABLE(name, start, bitmask) - -#define __EXC_REAL_OOL_HV(name, start, size) \ - __EXC_REAL_OOL(name, start, size) - -#define __TRAMP_REAL_OOL_HV(name, vec) \ - TRAMP_REAL_BEGIN(tramp_real_##name); \ - EXCEPTION_PROLOG_1 EXC_HV, PACA_EXGEN, 1, vec, 0, 0, 0 ; \ - EXCEPTION_PROLOG_2_REAL name##_common, EXC_HV, 1 - -#define EXC_REAL_OOL_HV(name, start, 
size) \ - __EXC_REAL_OOL_HV(name, start, size); \ - __TRAMP_REAL_OOL_HV(name, start) - -#define __EXC_REAL_OOL_MASKABLE_HV(name, start, size) \ - __EXC_REAL_OOL(name, start, size) - -#define __TRAMP_REAL_OOL_MASKABLE_HV(name, vec, bitmask) \ - TRAMP_REAL_BEGIN(tramp_real_##name); \ - EXCEPTION_PROLOG_1 EXC_HV, PACA_EXGEN, 1, vec, 0, 0, bitmask ; \ - EXCEPTION_PROLOG_2_REAL name##_common, EXC_HV, 1 - -#define EXC_REAL_OOL_MASKABLE_HV(name, start, size, bitmask) \ - __EXC_REAL_OOL_MASKABLE_HV(name, start, size); \ - __TRAMP_REAL_OOL_MASKABLE_HV(name, start, bitmask) - -#define __EXC_VIRT_OOL(name, start, size) \ - EXC_VIRT_BEGIN(name, start, size); \ - EXCEPTION_PROLOG_0 PACA_EXGEN ; \ - b tramp_virt_##name; \ - EXC_VIRT_END(name, start, size) - -#define __TRAMP_VIRT_OOL(name, realvec) \ - TRAMP_VIRT_BEGIN(tramp_virt_##name); \ - EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 0, vec, 0, 0, 0 ; \ - EXCEPTION_PROLOG_2_VIRT name##_common, EXC_STD - -#define EXC_VIRT_OOL(name, start, size, realvec) \ - __EXC_VIRT_OOL(name, start, size); \ - __TRAMP_VIRT_OOL(name, realvec) - -#define __EXC_VIRT_OOL_MASKABLE(name, start, size) \ - __EXC_VIRT_OOL(name, start, size) - -#define __TRAMP_VIRT_OOL_MASKABLE(name, realvec, bitmask) \ - TRAMP_VIRT_BEGIN(tramp_virt_##name); \ - EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 0, realvec, 0, 0, bitmask ; \ - EXCEPTION_PROLOG_2_REAL name##_common, EXC_STD, 1 - -#define EXC_VIRT_OOL_MASKABLE(name, start, size, realvec, bitmask) \ - __EXC_VIRT_OOL_MASKABLE(name, start, size); \ - __TRAMP_VIRT_OOL_MASKABLE(name, realvec, bitmask) - -#define __EXC_VIRT_OOL_HV(name, start, size) \ - __EXC_VIRT_OOL(name, start, size) - -#define __TRAMP_VIRT_OOL_HV(name, realvec) \ - TRAMP_VIRT_BEGIN(tramp_virt_##name); \ - EXCEPTION_PROLOG_1 EXC_HV, PACA_EXGEN, 1, realvec, 0, 0, 0 ; \ - EXCEPTION_PROLOG_2_VIRT name##_common, EXC_HV - -#define EXC_VIRT_OOL_HV(name, start, size, realvec) \ - __EXC_VIRT_OOL_HV(name, start, size); \ - __TRAMP_VIRT_OOL_HV(name, realvec) - 
-#define __EXC_VIRT_OOL_MASKABLE_HV(name, start, size) \ - __EXC_VIRT_OOL(name, start, size) - -#define __TRAMP_VIRT_OOL_MASKABLE_HV(name, realvec, bitmask) \ - TRAMP_VIRT_BEGIN(tramp_virt_##name); \ - EXCEPTION_PROLOG_1 EXC_HV, PACA_EXGEN, 1, realvec, 0, 0, bitmask ; \ - EXCEPTION_PROLOG_2_VIRT name##_common, EXC_HV - -#define EXC_VIRT_OOL_MASKABLE_HV(name, start, size, realvec, bitmask) \ - __EXC_VIRT_OOL_MASKABLE_HV(name, start, size); \ - __TRAMP_VIRT_OOL_MASKABLE_HV(name, realvec, bitmask) - -#define TRAMP_KVM(area, n) \ - TRAMP_KVM_BEGIN(do_kvm_##n); \ - KVM_HANDLER area, EXC_STD, n, 0 - -#define TRAMP_KVM_SKIP(area, n) \ - TRAMP_KVM_BEGIN(do_kvm_##n); \ - KVM_HANDLER area, EXC_STD, n, 1 - -#define TRAMP_KVM_HV(area, n) \ - TRAMP_KVM_BEGIN(do_kvm_H##n); \ - KVM_HANDLER area, EXC_HV, n, 0 - -#define TRAMP_KVM_HV_SKIP(area, n) \ - TRAMP_KVM_BEGIN(do_kvm_H##n); \ - KVM_HANDLER area, EXC_HV, n, 1 - #define EXC_COMMON(name, realvec, hdlr) \ EXC_COMMON_BEGIN(name); \ - EXCEPTION_COMMON(PACA_EXGEN, realvec); \ + INT_COMMON realvec, PACA_EXGEN, 1, 1, 1, 0, 0 ; \ bl save_nvgprs; \ - RECONCILE_IRQ_STATE(r10, r11); \ addi r3,r1,STACK_FRAME_OVERHEAD; \ bl hdlr; \ b ret_from_except @@ -704,9 +684,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP) */ #define EXC_COMMON_ASYNC(name, realvec, hdlr) \ EXC_COMMON_BEGIN(name); \ - EXCEPTION_COMMON(PACA_EXGEN, realvec); \ + INT_COMMON realvec, PACA_EXGEN, 1, 1, 1, 0, 0 ; \ FINISH_NAP; \ - RECONCILE_IRQ_STATE(r10, r11); \ RUNLATCH_ON; \ addi r3,r1,STACK_FRAME_OVERHEAD; \ bl hdlr; \ @@ -836,9 +815,7 @@ BEGIN_FTR_SECTION END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) #endif - EXCEPTION_PROLOG_0 PACA_EXNMI - EXCEPTION_PROLOG_1 EXC_STD, PACA_EXNMI, 1, 0x100, 0, 0, 0 - EXCEPTION_PROLOG_2_REAL system_reset_common, EXC_STD, 0 + INT_HANDLER system_reset, 0x100, area=PACA_EXNMI, ri=0, kvm=1 /* * MSR_RI is not enabled, because PACA_EXNMI and nmi stack is * being used, so a nested NMI exception would corrupt it. 
@@ -850,9 +827,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) * be dangerous anyway. */ EXC_REAL_END(system_reset, 0x100, 0x100) - EXC_VIRT_NONE(0x4100, 0x100) -TRAMP_KVM(PACA_EXNMI, 0x100) +INT_KVM_HANDLER system_reset 0x100, EXC_STD, PACA_EXNMI, 0 #ifdef CONFIG_PPC_P7_NAP TRAMP_REAL_BEGIN(system_reset_idle_wake) @@ -868,9 +844,7 @@ TRAMP_REAL_BEGIN(system_reset_idle_wake) */ TRAMP_REAL_BEGIN(system_reset_fwnmi) /* See comment at system_reset exception, don't turn on RI */ - EXCEPTION_PROLOG_0 PACA_EXNMI - EXCEPTION_PROLOG_1 EXC_STD, PACA_EXNMI, 0, 0x100, 0, 0, 0 - EXCEPTION_PROLOG_2_REAL system_reset_common, EXC_STD, 0 + INT_HANDLER system_reset, 0x100, area=PACA_EXNMI, ri=0 #endif /* CONFIG_PPC_PSERIES */ @@ -890,7 +864,7 @@ EXC_COMMON_BEGIN(system_reset_common) mr r10,r1 ld r1,PACA_NMI_EMERG_SP(r13) subi r1,r1,INT_FRAME_SIZE - EXCEPTION_COMMON_STACK(PACA_EXNMI, 0x100) + INT_COMMON 0x100, PACA_EXNMI, 0, 1, 0, 0, 0 bl save_nvgprs /* * Set IRQS_ALL_DISABLED unconditionally so arch_irqs_disabled does @@ -933,26 +907,39 @@ EXC_COMMON_BEGIN(system_reset_common) EXC_REAL_BEGIN(machine_check, 0x200, 0x100) - /* This is moved out of line as it can be patched by FW, but - * some code path might still want to branch into the original - * vector + INT_HANDLER machine_check, 0x200, early=1, area=PACA_EXMC, dar=1, dsisr=1 + /* + * MSR_RI is not enabled, because PACA_EXMC is being used, so a + * nested machine check corrupts it. machine_check_common enables + * MSR_RI. 
*/ - EXCEPTION_PROLOG_0 PACA_EXMC -BEGIN_FTR_SECTION - b machine_check_common_early -FTR_SECTION_ELSE - b machine_check_pSeries_0 -ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE) EXC_REAL_END(machine_check, 0x200, 0x100) EXC_VIRT_NONE(0x4200, 0x100) -TRAMP_REAL_BEGIN(machine_check_common_early) - EXCEPTION_PROLOG_1 EXC_STD, PACA_EXMC, 0, 0x200, 0, 0, 0 + +#ifdef CONFIG_PPC_PSERIES +TRAMP_REAL_BEGIN(machine_check_fwnmi) + /* See comment at machine_check exception, don't turn on RI */ + INT_HANDLER machine_check, 0x200, early=1, area=PACA_EXMC, dar=1, dsisr=1 +#endif + +INT_KVM_HANDLER machine_check 0x200, EXC_STD, PACA_EXMC, 1 + +#define MACHINE_CHECK_HANDLER_WINDUP \ + /* Clear MSR_RI before setting SRR0 and SRR1. */\ + li r9,0; \ + mtmsrd r9,1; /* Clear MSR_RI */ \ + /* Decrement paca->in_mce now RI is clear. */ \ + lhz r12,PACA_IN_MCE(r13); \ + subi r12,r12,1; \ + sth r12,PACA_IN_MCE(r13); \ + EXCEPTION_RESTORE_REGS EXC_STD + +EXC_COMMON_BEGIN(machine_check_early_common) + mtctr r10 /* Restore ctr */ + mfspr r11,SPRN_SRR0 + mfspr r12,SPRN_SRR1 + /* - * Register contents: - * R13 = PACA - * R9 = CR - * Original R9 to R13 is saved on PACA_EXMC - * * Switch to mc_emergency stack and handle re-entrancy (we limit * the nested MCE upto level 4 to avoid stack overflow). * Save MCE registers srr1, srr0, dar and dsisr and then set ME=1 @@ -973,103 +960,127 @@ TRAMP_REAL_BEGIN(machine_check_common_early) * the machine check is handled then the idle wakeup code is called * to restore state. */ - mr r11,r1 /* Save r1 */ lhz r10,PACA_IN_MCE(r13) cmpwi r10,0 /* Are we in nested machine check */ - bne 0f /* Yes, we are. 
*/ - /* First machine check entry */ - ld r1,PACAMCEMERGSP(r13) /* Use MC emergency stack */ -0: subi r1,r1,INT_FRAME_SIZE /* alloc stack frame */ + cmpwi cr1,r10,MAX_MCE_DEPTH /* Are we at maximum nesting */ addi r10,r10,1 /* increment paca->in_mce */ sth r10,PACA_IN_MCE(r13) - /* Limit nested MCE to level 4 to avoid stack overflow */ - cmpwi r10,MAX_MCE_DEPTH - bgt 2f /* Check if we hit limit of 4 */ - std r11,GPR1(r1) /* Save r1 on the stack. */ - std r11,0(r1) /* make stack chain pointer */ - mfspr r11,SPRN_SRR0 /* Save SRR0 */ - std r11,_NIP(r1) - mfspr r11,SPRN_SRR1 /* Save SRR1 */ - std r11,_MSR(r1) - mfspr r11,SPRN_DAR /* Save DAR */ - std r11,_DAR(r1) - mfspr r11,SPRN_DSISR /* Save DSISR */ - std r11,_DSISR(r1) - std r9,_CCR(r1) /* Save CR in stackframe */ + + mr r10,r1 /* Save r1 */ + bne 1f + /* First machine check entry */ + ld r1,PACAMCEMERGSP(r13) /* Use MC emergency stack */ +1: /* Limit nested MCE to level 4 to avoid stack overflow */ + bgt cr1,unrecoverable_mce /* Check if we hit limit of 4 */ + subi r1,r1,INT_FRAME_SIZE /* alloc stack frame */ + /* We don't touch AMR here, we never go to virtual mode */ - /* Save r9 through r13 from EXMC save area to stack frame. */ - EXCEPTION_PROLOG_COMMON_2(PACA_EXMC) - mfmsr r11 /* get MSR value */ + INT_COMMON 0x200, PACA_EXMC, 0, 0, 0, 1, 1 + BEGIN_FTR_SECTION - ori r11,r11,MSR_ME /* turn on ME bit */ + bl enable_machine_check END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) - ori r11,r11,MSR_RI /* turn on RI bit */ - LOAD_HANDLER(r12, machine_check_handle_early) -1: mtspr SPRN_SRR0,r12 - mtspr SPRN_SRR1,r11 - RFI_TO_KERNEL - b . /* prevent speculative execution */ -2: - /* Stack overflow. Stay on emergency stack and panic. - * Keep the ME bit off while panic-ing, so that if we hit - * another machine check we checkstop. - */ - addi r1,r1,INT_FRAME_SIZE /* go back to previous stack frame */ - ld r11,PACAKMSR(r13) - LOAD_HANDLER(r12, unrecover_mce) - li r10,MSR_ME - andc r11,r11,r10 /* Turn off MSR_ME */ - b 1b - b . 
/* prevent speculative execution */ + li r10,MSR_RI + mtmsrd r10,1 -TRAMP_REAL_BEGIN(machine_check_pSeries) - .globl machine_check_fwnmi -machine_check_fwnmi: - EXCEPTION_PROLOG_0 PACA_EXMC + bl save_nvgprs + addi r3,r1,STACK_FRAME_OVERHEAD + bl machine_check_early + std r3,RESULT(r1) /* Save result */ + ld r12,_MSR(r1) + +#ifdef CONFIG_PPC_P7_NAP + /* + * Check if thread was in power saving mode. We come here when any + * of the following is true: + * a. thread wasn't in power saving mode + * b. thread was in power saving mode with no state loss, + * supervisor state loss or hypervisor state loss. + * + * Go back to nap/sleep/winkle mode again if (b) is true. + */ BEGIN_FTR_SECTION - b machine_check_common_early -END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE) -machine_check_pSeries_0: - EXCEPTION_PROLOG_1 EXC_STD, PACA_EXMC, 1, 0x200, 1, 1, 0 + rlwinm. r11,r12,47-31,30,31 + bne machine_check_idle_common +END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) +#endif + +#ifdef CONFIG_KVM_BOOK3S_64_HANDLER /* - * MSR_RI is not enabled, because PACA_EXMC is being used, so a - * nested machine check corrupts it. machine_check_common enables - * MSR_RI. + * Check if we are coming from guest. If yes, then run the normal + * exception handler which will take the + * machine_check_kvm->kvmppc_interrupt branch to deliver the MC event + * to guest. + */ + lbz r11,HSTATE_IN_GUEST(r13) + cmpwi r11,0 /* Check if coming from guest */ + bne mce_deliver /* continue if we are. */ +#endif + + /* + * Check if we are coming from userspace. If yes, then run the normal + * exception handler which will deliver the MC event to this kernel. + */ + andi. r11,r12,MSR_PR /* See if coming from user. */ + bne mce_deliver /* continue in V mode if we are. */ + + /* + * At this point we are coming from kernel context. + * Queue up the MCE event and return from the interrupt. + * But before that, check if this is an un-recoverable exception. + * If yes, then stay on emergency stack and panic. 
*/ - EXCEPTION_PROLOG_2_REAL machine_check_common, EXC_STD, 0 + andi. r11,r12,MSR_RI + beq unrecoverable_mce -TRAMP_KVM_SKIP(PACA_EXMC, 0x200) + /* + * Check if we have successfully handled/recovered from error, if not + * then stay on emergency stack and panic. + */ + ld r3,RESULT(r1) /* Load result */ + cmpdi r3,0 /* see if we handled MCE successfully */ + beq unrecoverable_mce /* if !handled then panic */ + + /* + * Return from MC interrupt. + * Queue up the MCE event so that we can log it later, while + * returning from kernel or opal call. + */ + bl machine_check_queue_event + MACHINE_CHECK_HANDLER_WINDUP + RFI_TO_KERNEL + +mce_deliver: + /* + * This is a host user or guest MCE. Restore all registers, then + * run the "late" handler. For host user, this will run the + * machine_check_exception handler in virtual mode like a normal + * interrupt handler. For guest, this will trigger the KVM test + * and branch to the KVM interrupt similarly to other interrupts. + */ +BEGIN_FTR_SECTION + ld r10,ORIG_GPR3(r1) + mtspr SPRN_CFAR,r10 +END_FTR_SECTION_IFSET(CPU_FTR_CFAR) + MACHINE_CHECK_HANDLER_WINDUP + /* See comment at machine_check exception, don't turn on RI */ + INT_HANDLER machine_check, 0x200, area=PACA_EXMC, ri=0, dar=1, dsisr=1, kvm=1 EXC_COMMON_BEGIN(machine_check_common) /* * Machine check is different because we use a different * save area: PACA_EXMC instead of PACA_EXGEN. */ - EXCEPTION_COMMON(PACA_EXMC, 0x200) + INT_COMMON 0x200, PACA_EXMC, 1, 1, 1, 1, 1 FINISH_NAP - RECONCILE_IRQ_STATE(r10, r11) - ld r3,PACA_EXMC+EX_DAR(r13) - lwz r4,PACA_EXMC+EX_DSISR(r13) /* Enable MSR_RI when finished with PACA_EXMC */ li r10,MSR_RI mtmsrd r10,1 - std r3,_DAR(r1) - std r4,_DSISR(r1) bl save_nvgprs addi r3,r1,STACK_FRAME_OVERHEAD bl machine_check_exception b ret_from_except -#define MACHINE_CHECK_HANDLER_WINDUP \ - /* Clear MSR_RI before setting SRR0 and SRR1. */\ - li r9,0; \ - mtmsrd r9,1; /* Clear MSR_RI */ \ - /* Decrement paca->in_mce now RI is clear. 
*/ \ - lhz r12,PACA_IN_MCE(r13); \ - subi r12,r12,1; \ - sth r12,PACA_IN_MCE(r13); \ - EXCEPTION_RESTORE_REGS EXC_STD - #ifdef CONFIG_PPC_P7_NAP /* * This is an idle wakeup. Low level machine check has already been @@ -1101,72 +1112,8 @@ EXC_COMMON_BEGIN(machine_check_idle_common) bltlr cr1 /* no state loss, return to idle caller */ b idle_return_gpr_loss #endif - /* - * Handle machine check early in real mode. We come here with - * ME=1, MMU (IR=0 and DR=0) off and using MC emergency stack. - */ -EXC_COMMON_BEGIN(machine_check_handle_early) - std r0,GPR0(r1) /* Save r0 */ - EXCEPTION_PROLOG_COMMON_3(0x200) - bl save_nvgprs - addi r3,r1,STACK_FRAME_OVERHEAD - bl machine_check_early - std r3,RESULT(r1) /* Save result */ - ld r12,_MSR(r1) -BEGIN_FTR_SECTION - b 4f -END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE) - -#ifdef CONFIG_PPC_P7_NAP - /* - * Check if thread was in power saving mode. We come here when any - * of the following is true: - * a. thread wasn't in power saving mode - * b. thread was in power saving mode with no state loss, - * supervisor state loss or hypervisor state loss. - * - * Go back to nap/sleep/winkle mode again if (b) is true. - */ -BEGIN_FTR_SECTION - rlwinm. r11,r12,47-31,30,31 - bne machine_check_idle_common -END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) -#endif - /* - * Check if we are coming from hypervisor userspace. If yes then we - * continue in host kernel in V mode to deliver the MC event. - */ - rldicl. r11,r12,4,63 /* See if MC hit while in HV mode. */ - beq 5f -4: andi. r11,r12,MSR_PR /* See if coming from user. */ - bne 9f /* continue in V mode if we are. */ - -5: -#ifdef CONFIG_KVM_BOOK3S_64_HANDLER -BEGIN_FTR_SECTION - /* - * We are coming from kernel context. Check if we are coming from - * guest. if yes, then we can continue. We will fall through - * do_kvm_200->kvmppc_interrupt to deliver the MC event to guest. 
- */ - lbz r11,HSTATE_IN_GUEST(r13) - cmpwi r11,0 /* Check if coming from guest */ - bne 9f /* continue if we are. */ -END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) -#endif - /* - * At this point we are not sure about what context we come from. - * Queue up the MCE event and return from the interrupt. - * But before that, check if this is an un-recoverable exception. - * If yes, then stay on emergency stack and panic. - */ - andi. r11,r12,MSR_RI - bne 2f -1: mfspr r11,SPRN_SRR0 - LOAD_HANDLER(r10,unrecover_mce) - mtspr SPRN_SRR0,r10 - ld r10,PACAKMSR(r13) +EXC_COMMON_BEGIN(unrecoverable_mce) /* * We are going down. But there are chances that we might get hit by * another MCE during panic path and we may run into unstable state @@ -1174,84 +1121,36 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) * when another MCE is hit during panic path, system will checkstop * and hypervisor will get restarted cleanly by SP. */ - li r3,MSR_ME - andc r10,r10,r3 /* Turn off MSR_ME */ - mtspr SPRN_SRR1,r10 - RFI_TO_KERNEL - b . -2: - /* - * Check if we have successfully handled/recovered from error, if not - * then stay on emergency stack and panic. - */ - ld r3,RESULT(r1) /* Load result */ - cmpdi r3,0 /* see if we handled MCE successfully */ - - beq 1b /* if !handled then panic */ BEGIN_FTR_SECTION - /* - * Return from MC interrupt. - * Queue up the MCE event so that we can log it later, while - * returning from kernel or opal call. - */ - bl machine_check_queue_event - MACHINE_CHECK_HANDLER_WINDUP - RFI_TO_USER_OR_KERNEL -FTR_SECTION_ELSE - /* - * pSeries: Return from MC interrupt. Before that stay on emergency - * stack and call machine_check_exception to log the MCE event. - */ - LOAD_HANDLER(r10,mce_return) - mtspr SPRN_SRR0,r10 + li r10,0 /* clear MSR_RI */ + mtmsrd r10,1 + bl disable_machine_check +END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) ld r10,PACAKMSR(r13) - mtspr SPRN_SRR1,r10 - RFI_TO_KERNEL - b . 
-ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE) -9: - /* Deliver the machine check to host kernel in V mode. */ - MACHINE_CHECK_HANDLER_WINDUP - EXCEPTION_PROLOG_0 PACA_EXMC - b machine_check_pSeries_0 + li r3,MSR_ME + andc r10,r10,r3 + mtmsrd r10 -EXC_COMMON_BEGIN(unrecover_mce) /* Invoke machine_check_exception to print MCE event and panic. */ addi r3,r1,STACK_FRAME_OVERHEAD bl machine_check_exception + /* - * We will not reach here. Even if we did, there is no way out. Call - * unrecoverable_exception and die. + * We will not reach here. Even if we did, there is no way out. + * Call unrecoverable_exception and die. */ -1: addi r3,r1,STACK_FRAME_OVERHEAD - bl unrecoverable_exception - b 1b - -EXC_COMMON_BEGIN(mce_return) - /* Invoke machine_check_exception to print MCE event and return. */ addi r3,r1,STACK_FRAME_OVERHEAD - bl machine_check_exception - MACHINE_CHECK_HANDLER_WINDUP - RFI_TO_KERNEL + bl unrecoverable_exception b . + EXC_REAL_BEGIN(data_access, 0x300, 0x80) - EXCEPTION_PROLOG_0 PACA_EXGEN - b tramp_real_data_access + INT_HANDLER data_access, 0x300, ool=1, dar=1, dsisr=1, kvm=1 EXC_REAL_END(data_access, 0x300, 0x80) - -TRAMP_REAL_BEGIN(tramp_real_data_access) - EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 1, 0x300, 1, 1, 0 - EXCEPTION_PROLOG_2_REAL data_access_common, EXC_STD, 1 - EXC_VIRT_BEGIN(data_access, 0x4300, 0x80) - EXCEPTION_PROLOG_0 PACA_EXGEN - EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 0, 0x300, 1, 1, 0 -EXCEPTION_PROLOG_2_VIRT data_access_common, EXC_STD + INT_HANDLER data_access, 0x300, virt=1, dar=1, dsisr=1 EXC_VIRT_END(data_access, 0x4300, 0x80) - -TRAMP_KVM_SKIP(PACA_EXGEN, 0x300) - +INT_KVM_HANDLER data_access, 0x300, EXC_STD, PACA_EXGEN, 1 EXC_COMMON_BEGIN(data_access_common) /* * Here r13 points to the paca, r9 contains the saved CR, @@ -1259,15 +1158,12 @@ EXC_COMMON_BEGIN(data_access_common) * r9 - r13 are saved in paca->exgen. 
* EX_DAR and EX_DSISR have saved DAR/DSISR */ - EXCEPTION_COMMON(PACA_EXGEN, 0x300) - RECONCILE_IRQ_STATE(r10, r11) - ld r12,_MSR(r1) - ld r3,PACA_EXGEN+EX_DAR(r13) - lwz r4,PACA_EXGEN+EX_DSISR(r13) - li r5,0x300 - std r3,_DAR(r1) - std r4,_DSISR(r1) + INT_COMMON 0x300, PACA_EXGEN, 1, 1, 1, 1, 1 + ld r4,_DAR(r1) + ld r5,_DSISR(r1) BEGIN_MMU_FTR_SECTION + ld r6,_MSR(r1) + li r3,0x300 b do_hash_page /* Try to handle as hpte fault */ MMU_FTR_SECTION_ELSE b handle_page_fault @@ -1275,26 +1171,15 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX) EXC_REAL_BEGIN(data_access_slb, 0x380, 0x80) - EXCEPTION_PROLOG_0 PACA_EXSLB - b tramp_real_data_access_slb + INT_HANDLER data_access_slb, 0x380, ool=1, area=PACA_EXSLB, dar=1, kvm=1 EXC_REAL_END(data_access_slb, 0x380, 0x80) - -TRAMP_REAL_BEGIN(tramp_real_data_access_slb) - EXCEPTION_PROLOG_1 EXC_STD, PACA_EXSLB, 1, 0x380, 1, 0, 0 - EXCEPTION_PROLOG_2_REAL data_access_slb_common, EXC_STD, 1 - EXC_VIRT_BEGIN(data_access_slb, 0x4380, 0x80) - EXCEPTION_PROLOG_0 PACA_EXSLB - EXCEPTION_PROLOG_1 EXC_STD, PACA_EXSLB, 0, 0x380, 1, 0, 0 - EXCEPTION_PROLOG_2_VIRT data_access_slb_common, EXC_STD + INT_HANDLER data_access_slb, 0x380, virt=1, area=PACA_EXSLB, dar=1 EXC_VIRT_END(data_access_slb, 0x4380, 0x80) - -TRAMP_KVM_SKIP(PACA_EXSLB, 0x380) - +INT_KVM_HANDLER data_access_slb, 0x380, EXC_STD, PACA_EXSLB, 1 EXC_COMMON_BEGIN(data_access_slb_common) - EXCEPTION_COMMON(PACA_EXSLB, 0x380) - ld r4,PACA_EXSLB+EX_DAR(r13) - std r4,_DAR(r1) + INT_COMMON 0x380, PACA_EXSLB, 1, 1, 0, 1, 0 + ld r4,_DAR(r1) addi r3,r1,STACK_FRAME_OVERHEAD BEGIN_MMU_FTR_SECTION /* HPT case, do SLB fault */ @@ -1317,33 +1202,36 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX) b ret_from_except -EXC_REAL(instruction_access, 0x400, 0x80) -EXC_VIRT(instruction_access, 0x4400, 0x80, 0x400) -TRAMP_KVM(PACA_EXGEN, 0x400) - +EXC_REAL_BEGIN(instruction_access, 0x400, 0x80) + INT_HANDLER instruction_access, 0x400, kvm=1 +EXC_REAL_END(instruction_access, 0x400, 0x80) 
+EXC_VIRT_BEGIN(instruction_access, 0x4400, 0x80) + INT_HANDLER instruction_access, 0x400, virt=1 +EXC_VIRT_END(instruction_access, 0x4400, 0x80) +INT_KVM_HANDLER instruction_access, 0x400, EXC_STD, PACA_EXGEN, 0 EXC_COMMON_BEGIN(instruction_access_common) - EXCEPTION_COMMON(PACA_EXGEN, 0x400) - RECONCILE_IRQ_STATE(r10, r11) - ld r12,_MSR(r1) - ld r3,_NIP(r1) - andis. r4,r12,DSISR_SRR1_MATCH_64S@h - li r5,0x400 - std r3,_DAR(r1) - std r4,_DSISR(r1) + INT_COMMON 0x400, PACA_EXGEN, 1, 1, 1, 2, 2 + ld r4,_DAR(r1) + ld r5,_DSISR(r1) BEGIN_MMU_FTR_SECTION + ld r6,_MSR(r1) + li r3,0x400 b do_hash_page /* Try to handle as hpte fault */ MMU_FTR_SECTION_ELSE b handle_page_fault ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX) -__EXC_REAL(instruction_access_slb, 0x480, 0x80, PACA_EXSLB) -__EXC_VIRT(instruction_access_slb, 0x4480, 0x80, 0x480, PACA_EXSLB) -TRAMP_KVM(PACA_EXSLB, 0x480) - +EXC_REAL_BEGIN(instruction_access_slb, 0x480, 0x80) + INT_HANDLER instruction_access_slb, 0x480, area=PACA_EXSLB, kvm=1 +EXC_REAL_END(instruction_access_slb, 0x480, 0x80) +EXC_VIRT_BEGIN(instruction_access_slb, 0x4480, 0x80) + INT_HANDLER instruction_access_slb, 0x480, virt=1, area=PACA_EXSLB +EXC_VIRT_END(instruction_access_slb, 0x4480, 0x80) +INT_KVM_HANDLER instruction_access_slb, 0x480, EXC_STD, PACA_EXSLB, 0 EXC_COMMON_BEGIN(instruction_access_slb_common) - EXCEPTION_COMMON(PACA_EXSLB, 0x480) - ld r4,_NIP(r1) + INT_COMMON 0x480, PACA_EXSLB, 1, 1, 0, 2, 0 + ld r4,_DAR(r1) addi r3,r1,STACK_FRAME_OVERHEAD BEGIN_MMU_FTR_SECTION /* HPT case, do SLB fault */ @@ -1359,69 +1247,44 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX) std r3,RESULT(r1) bl save_nvgprs RECONCILE_IRQ_STATE(r10, r11) - ld r4,_NIP(r1) + ld r4,_DAR(r1) ld r5,RESULT(r1) addi r3,r1,STACK_FRAME_OVERHEAD bl do_bad_slb_fault b ret_from_except - EXC_REAL_BEGIN(hardware_interrupt, 0x500, 0x100) - EXCEPTION_PROLOG_0 PACA_EXGEN -BEGIN_FTR_SECTION - EXCEPTION_PROLOG_1 EXC_HV, PACA_EXGEN, 1, 0x500, 0, 0, IRQS_DISABLED - 
EXCEPTION_PROLOG_2_REAL hardware_interrupt_common, EXC_HV, 1 -FTR_SECTION_ELSE - EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 1, 0x500, 0, 0, IRQS_DISABLED - EXCEPTION_PROLOG_2_REAL hardware_interrupt_common, EXC_STD, 1 -ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) + INT_HANDLER hardware_interrupt, 0x500, hsrr=EXC_HV_OR_STD, bitmask=IRQS_DISABLED, kvm=1 EXC_REAL_END(hardware_interrupt, 0x500, 0x100) - EXC_VIRT_BEGIN(hardware_interrupt, 0x4500, 0x100) - EXCEPTION_PROLOG_0 PACA_EXGEN -BEGIN_FTR_SECTION - EXCEPTION_PROLOG_1 EXC_HV, PACA_EXGEN, 1, 0x500, 0, 0, IRQS_DISABLED - EXCEPTION_PROLOG_2_VIRT hardware_interrupt_common, EXC_HV -FTR_SECTION_ELSE - EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 1, 0x500, 0, 0, IRQS_DISABLED - EXCEPTION_PROLOG_2_VIRT hardware_interrupt_common, EXC_STD -ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE) + INT_HANDLER hardware_interrupt, 0x500, virt=1, hsrr=EXC_HV_OR_STD, bitmask=IRQS_DISABLED, kvm=1 EXC_VIRT_END(hardware_interrupt, 0x4500, 0x100) - -TRAMP_KVM(PACA_EXGEN, 0x500) -TRAMP_KVM_HV(PACA_EXGEN, 0x500) +INT_KVM_HANDLER hardware_interrupt, 0x500, EXC_HV_OR_STD, PACA_EXGEN, 0 EXC_COMMON_ASYNC(hardware_interrupt_common, 0x500, do_IRQ) EXC_REAL_BEGIN(alignment, 0x600, 0x100) - EXCEPTION_PROLOG_0 PACA_EXGEN - EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 1, 0x600, 1, 1, 0 - EXCEPTION_PROLOG_2_REAL alignment_common, EXC_STD, 1 + INT_HANDLER alignment, 0x600, dar=1, dsisr=1, kvm=1 EXC_REAL_END(alignment, 0x600, 0x100) - EXC_VIRT_BEGIN(alignment, 0x4600, 0x100) - EXCEPTION_PROLOG_0 PACA_EXGEN - EXCEPTION_PROLOG_1 EXC_STD, PACA_EXGEN, 0, 0x600, 1, 1, 0 - EXCEPTION_PROLOG_2_VIRT alignment_common, EXC_STD + INT_HANDLER alignment, 0x600, virt=1, dar=1, dsisr=1 EXC_VIRT_END(alignment, 0x4600, 0x100) - -TRAMP_KVM(PACA_EXGEN, 0x600) +INT_KVM_HANDLER alignment, 0x600, EXC_STD, PACA_EXGEN, 0 EXC_COMMON_BEGIN(alignment_common) - EXCEPTION_COMMON(PACA_EXGEN, 0x600) - ld r3,PACA_EXGEN+EX_DAR(r13) - lwz r4,PACA_EXGEN+EX_DSISR(r13) - std r3,_DAR(r1) - std 
r4,_DSISR(r1) + INT_COMMON 0x600, PACA_EXGEN, 1, 1, 1, 1, 1 bl save_nvgprs - RECONCILE_IRQ_STATE(r10, r11) addi r3,r1,STACK_FRAME_OVERHEAD bl alignment_exception b ret_from_except -EXC_REAL(program_check, 0x700, 0x100) -EXC_VIRT(program_check, 0x4700, 0x100, 0x700) -TRAMP_KVM(PACA_EXGEN, 0x700) +EXC_REAL_BEGIN(program_check, 0x700, 0x100) + INT_HANDLER program_check, 0x700, kvm=1 +EXC_REAL_END(program_check, 0x700, 0x100) +EXC_VIRT_BEGIN(program_check, 0x4700, 0x100) + INT_HANDLER program_check, 0x700, virt=1 +EXC_VIRT_END(program_check, 0x4700, 0x100) +INT_KVM_HANDLER program_check, 0x700, EXC_STD, PACA_EXGEN, 0 EXC_COMMON_BEGIN(program_check_common) /* * It's possible to receive a TM Bad Thing type program check with @@ -1447,27 +1310,33 @@ EXC_COMMON_BEGIN(program_check_common) mr r10,r1 /* Save r1 */ ld r1,PACAEMERGSP(r13) /* Use emergency stack */ subi r1,r1,INT_FRAME_SIZE /* alloc stack frame */ - b 3f /* Jump into the macro !! */ + INT_COMMON 0x700, PACA_EXGEN, 0, 1, 1, 0, 0 + b 3f 2: - EXCEPTION_COMMON(PACA_EXGEN, 0x700) + INT_COMMON 0x700, PACA_EXGEN, 1, 1, 1, 0, 0 +3: bl save_nvgprs - RECONCILE_IRQ_STATE(r10, r11) addi r3,r1,STACK_FRAME_OVERHEAD bl program_check_exception b ret_from_except -EXC_REAL(fp_unavailable, 0x800, 0x100) -EXC_VIRT(fp_unavailable, 0x4800, 0x100, 0x800) -TRAMP_KVM(PACA_EXGEN, 0x800) +EXC_REAL_BEGIN(fp_unavailable, 0x800, 0x100) + INT_HANDLER fp_unavailable, 0x800, kvm=1 +EXC_REAL_END(fp_unavailable, 0x800, 0x100) +EXC_VIRT_BEGIN(fp_unavailable, 0x4800, 0x100) + INT_HANDLER fp_unavailable, 0x800, virt=1 +EXC_VIRT_END(fp_unavailable, 0x4800, 0x100) +INT_KVM_HANDLER fp_unavailable, 0x800, EXC_STD, PACA_EXGEN, 0 EXC_COMMON_BEGIN(fp_unavailable_common) - EXCEPTION_COMMON(PACA_EXGEN, 0x800) + INT_COMMON 0x800, PACA_EXGEN, 1, 1, 0, 0, 0 bne 1f /* if from user, just load it up */ bl save_nvgprs RECONCILE_IRQ_STATE(r10, r11) addi r3,r1,STACK_FRAME_OVERHEAD bl kernel_fp_unavailable_exception - BUG_OPCODE +0: trap + EMIT_BUG_ENTRY 0b, 
__FILE__, __LINE__, 0 1: #ifdef CONFIG_PPC_TRANSACTIONAL_MEM BEGIN_FTR_SECTION @@ -1490,21 +1359,33 @@ END_FTR_SECTION_IFSET(CPU_FTR_TM) #endif -EXC_REAL_OOL_MASKABLE(decrementer, 0x900, 0x80, IRQS_DISABLED) -EXC_VIRT_MASKABLE(decrementer, 0x4900, 0x80, 0x900, IRQS_DISABLED) -TRAMP_KVM(PACA_EXGEN, 0x900) +EXC_REAL_BEGIN(decrementer, 0x900, 0x80) + INT_HANDLER decrementer, 0x900, ool=1, bitmask=IRQS_DISABLED, kvm=1 +EXC_REAL_END(decrementer, 0x900, 0x80) +EXC_VIRT_BEGIN(decrementer, 0x4900, 0x80) + INT_HANDLER decrementer, 0x900, virt=1, bitmask=IRQS_DISABLED +EXC_VIRT_END(decrementer, 0x4900, 0x80) +INT_KVM_HANDLER decrementer, 0x900, EXC_STD, PACA_EXGEN, 0 EXC_COMMON_ASYNC(decrementer_common, 0x900, timer_interrupt) -EXC_REAL_HV(hdecrementer, 0x980, 0x80) -EXC_VIRT_HV(hdecrementer, 0x4980, 0x80, 0x980) -TRAMP_KVM_HV(PACA_EXGEN, 0x980) +EXC_REAL_BEGIN(hdecrementer, 0x980, 0x80) + INT_HANDLER hdecrementer, 0x980, hsrr=EXC_HV, kvm=1 +EXC_REAL_END(hdecrementer, 0x980, 0x80) +EXC_VIRT_BEGIN(hdecrementer, 0x4980, 0x80) + INT_HANDLER hdecrementer, 0x980, virt=1, hsrr=EXC_HV, kvm=1 +EXC_VIRT_END(hdecrementer, 0x4980, 0x80) +INT_KVM_HANDLER hdecrementer, 0x980, EXC_HV, PACA_EXGEN, 0 EXC_COMMON(hdecrementer_common, 0x980, hdec_interrupt) -EXC_REAL_MASKABLE(doorbell_super, 0xa00, 0x100, IRQS_DISABLED) -EXC_VIRT_MASKABLE(doorbell_super, 0x4a00, 0x100, 0xa00, IRQS_DISABLED) -TRAMP_KVM(PACA_EXGEN, 0xa00) +EXC_REAL_BEGIN(doorbell_super, 0xa00, 0x100) + INT_HANDLER doorbell_super, 0xa00, bitmask=IRQS_DISABLED, kvm=1 +EXC_REAL_END(doorbell_super, 0xa00, 0x100) +EXC_VIRT_BEGIN(doorbell_super, 0x4a00, 0x100) + INT_HANDLER doorbell_super, 0xa00, virt=1, bitmask=IRQS_DISABLED +EXC_VIRT_END(doorbell_super, 0x4a00, 0x100) +INT_KVM_HANDLER doorbell_super, 0xa00, EXC_STD, PACA_EXGEN, 0 #ifdef CONFIG_PPC_DOORBELL EXC_COMMON_ASYNC(doorbell_super_common, 0xa00, doorbell_exception) #else @@ -1512,17 +1393,13 @@ EXC_COMMON_ASYNC(doorbell_super_common, 0xa00, unknown_exception) #endif 
-EXC_REAL(trap_0b, 0xb00, 0x100) -EXC_VIRT(trap_0b, 0x4b00, 0x100, 0xb00) -TRAMP_KVM(PACA_EXGEN, 0xb00) -EXC_COMMON(trap_0b_common, 0xb00, unknown_exception) +EXC_REAL_NONE(0xb00, 0x100) +EXC_VIRT_NONE(0x4b00, 0x100) /* * system call / hypercall (0xc00, 0x4c00) * * The system call exception is invoked with "sc 0" and does not alter HV bit. - * There is support for kernel code to invoke system calls but there are no - * in-tree users. * * The hypercall is invoked with "sc 1" and sets HV=1. * @@ -1531,22 +1408,9 @@ EXC_COMMON(trap_0b_common, 0xb00, unknown_exception) * * Call convention: * - * syscall register convention is in Documentation/powerpc/syscall64-abi.rst - * - * For hypercalls, the register convention is as follows: - * r0 volatile - * r1-2 nonvolatile - * r3 volatile parameter and return value for status - * r4-r10 volatile input and output value - * r11 volatile hypercall number and output value - * r12 volatile input and output value - * r13-r31 nonvolatile - * LR nonvolatile - * CTR volatile - * XER volatile - * CR0-1 CR5-7 volatile - * CR2-4 nonvolatile - * Other registers nonvolatile + * syscall and hypercalls register conventions are documented in + * Documentation/powerpc/syscall64-abi.rst and + * Documentation/powerpc/papr_hcalls.rst respectively. * * The intersection of volatile registers that don't contain possible * inputs is: cr0, xer, ctr. 
We may use these as scratch regs upon entry @@ -1567,7 +1431,7 @@ EXC_COMMON(trap_0b_common, 0xb00, unknown_exception) GET_PACA(r13) std r10,PACA_EXGEN+EX_R10(r13) INTERRUPT_TO_KERNEL - KVMTEST EXC_STD 0xc00 /* uses r10, branch to do_kvm_0xc00_system_call */ + KVMTEST system_call EXC_STD 0xc00 /* uses r10, branch to system_call_kvm */ mfctr r9 #else mr r9,r13 @@ -1621,7 +1485,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) EXC_REAL_BEGIN(system_call, 0xc00, 0x100) SYSTEM_CALL 0 EXC_REAL_END(system_call, 0xc00, 0x100) - EXC_VIRT_BEGIN(system_call, 0x4c00, 0x100) SYSTEM_CALL 1 EXC_VIRT_END(system_call, 0x4c00, 0x100) @@ -1634,7 +1497,7 @@ EXC_VIRT_END(system_call, 0x4c00, 0x100) * ctr = orig r13 * orig r10 saved in PACA */ -TRAMP_KVM_BEGIN(do_kvm_0xc00) +TRAMP_KVM_BEGIN(system_call_kvm) /* * Save the PPR (on systems that support it) before changing to * HMT_MEDIUM. That allows the KVM code to save that value into the @@ -1647,32 +1510,33 @@ TRAMP_KVM_BEGIN(do_kvm_0xc00) SET_SCRATCH0(r10) std r9,PACA_EXGEN+EX_R9(r13) mfcr r9 - KVM_HANDLER PACA_EXGEN, EXC_STD, 0xc00, 0 + KVM_HANDLER 0xc00, EXC_STD, PACA_EXGEN, 0 #endif -EXC_REAL(single_step, 0xd00, 0x100) -EXC_VIRT(single_step, 0x4d00, 0x100, 0xd00) -TRAMP_KVM(PACA_EXGEN, 0xd00) +EXC_REAL_BEGIN(single_step, 0xd00, 0x100) + INT_HANDLER single_step, 0xd00, kvm=1 +EXC_REAL_END(single_step, 0xd00, 0x100) +EXC_VIRT_BEGIN(single_step, 0x4d00, 0x100) + INT_HANDLER single_step, 0xd00, virt=1 +EXC_VIRT_END(single_step, 0x4d00, 0x100) +INT_KVM_HANDLER single_step, 0xd00, EXC_STD, PACA_EXGEN, 0 EXC_COMMON(single_step_common, 0xd00, single_step_exception) -EXC_REAL_OOL_HV(h_data_storage, 0xe00, 0x20) -EXC_VIRT_OOL_HV(h_data_storage, 0x4e00, 0x20, 0xe00) -TRAMP_KVM_HV_SKIP(PACA_EXGEN, 0xe00) + +EXC_REAL_BEGIN(h_data_storage, 0xe00, 0x20) + INT_HANDLER h_data_storage, 0xe00, ool=1, hsrr=EXC_HV, dar=1, dsisr=1, kvm=1 +EXC_REAL_END(h_data_storage, 0xe00, 0x20) +EXC_VIRT_BEGIN(h_data_storage, 0x4e00, 0x20) + INT_HANDLER h_data_storage, 
0xe00, ool=1, virt=1, hsrr=EXC_HV, dar=1, dsisr=1, kvm=1 +EXC_VIRT_END(h_data_storage, 0x4e00, 0x20) +INT_KVM_HANDLER h_data_storage, 0xe00, EXC_HV, PACA_EXGEN, 1 EXC_COMMON_BEGIN(h_data_storage_common) - mfspr r10,SPRN_HDAR - std r10,PACA_EXGEN+EX_DAR(r13) - mfspr r10,SPRN_HDSISR - stw r10,PACA_EXGEN+EX_DSISR(r13) - EXCEPTION_COMMON(PACA_EXGEN, 0xe00) + INT_COMMON 0xe00, PACA_EXGEN, 1, 1, 1, 1, 1 bl save_nvgprs - RECONCILE_IRQ_STATE(r10, r11) addi r3,r1,STACK_FRAME_OVERHEAD BEGIN_MMU_FTR_SECTION - ld r4,PACA_EXGEN+EX_DAR(r13) - lwz r5,PACA_EXGEN+EX_DSISR(r13) - std r4,_DAR(r1) - std r5,_DSISR(r1) + ld r4,_DAR(r1) li r5,SIGSEGV bl bad_page_fault MMU_FTR_SECTION_ELSE @@ -1681,15 +1545,23 @@ ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_TYPE_RADIX) b ret_from_except -EXC_REAL_OOL_HV(h_instr_storage, 0xe20, 0x20) -EXC_VIRT_OOL_HV(h_instr_storage, 0x4e20, 0x20, 0xe20) -TRAMP_KVM_HV(PACA_EXGEN, 0xe20) +EXC_REAL_BEGIN(h_instr_storage, 0xe20, 0x20) + INT_HANDLER h_instr_storage, 0xe20, ool=1, hsrr=EXC_HV, kvm=1 +EXC_REAL_END(h_instr_storage, 0xe20, 0x20) +EXC_VIRT_BEGIN(h_instr_storage, 0x4e20, 0x20) + INT_HANDLER h_instr_storage, 0xe20, ool=1, virt=1, hsrr=EXC_HV, kvm=1 +EXC_VIRT_END(h_instr_storage, 0x4e20, 0x20) +INT_KVM_HANDLER h_instr_storage, 0xe20, EXC_HV, PACA_EXGEN, 0 EXC_COMMON(h_instr_storage_common, 0xe20, unknown_exception) -EXC_REAL_OOL_HV(emulation_assist, 0xe40, 0x20) -EXC_VIRT_OOL_HV(emulation_assist, 0x4e40, 0x20, 0xe40) -TRAMP_KVM_HV(PACA_EXGEN, 0xe40) +EXC_REAL_BEGIN(emulation_assist, 0xe40, 0x20) + INT_HANDLER emulation_assist, 0xe40, ool=1, hsrr=EXC_HV, kvm=1 +EXC_REAL_END(emulation_assist, 0xe40, 0x20) +EXC_VIRT_BEGIN(emulation_assist, 0x4e40, 0x20) + INT_HANDLER emulation_assist, 0xe40, ool=1, virt=1, hsrr=EXC_HV, kvm=1 +EXC_VIRT_END(emulation_assist, 0x4e40, 0x20) +INT_KVM_HANDLER emulation_assist, 0xe40, EXC_HV, PACA_EXGEN, 0 EXC_COMMON(emulation_assist_common, 0xe40, emulation_assist_interrupt) @@ -1699,16 +1571,10 @@ EXC_COMMON(emulation_assist_common, 
0xe40, emulation_assist_interrupt) * mode. */ EXC_REAL_BEGIN(hmi_exception, 0xe60, 0x20) - EXCEPTION_PROLOG_0 PACA_EXGEN - b hmi_exception_early + INT_HANDLER hmi_exception, 0xe60, ool=1, early=1, hsrr=EXC_HV, ri=0, kvm=1 EXC_REAL_END(hmi_exception, 0xe60, 0x20) EXC_VIRT_NONE(0x4e60, 0x20) -TRAMP_KVM_HV(PACA_EXGEN, 0xe60) -TRAMP_REAL_BEGIN(hmi_exception_early) - EXCEPTION_PROLOG_1 EXC_HV, PACA_EXGEN, 1, 0xe60, 0, 0, 0 - mfctr r10 /* save ctr, even for !RELOCATABLE */ - BRANCH_TO_C000(r11, hmi_exception_early_common) - +INT_KVM_HANDLER hmi_exception, 0xe60, EXC_HV, PACA_EXGEN, 0 EXC_COMMON_BEGIN(hmi_exception_early_common) mtctr r10 /* Restore ctr */ mfspr r11,SPRN_HSRR0 /* Save HSRR0 */ @@ -1716,10 +1582,10 @@ EXC_COMMON_BEGIN(hmi_exception_early_common) mr r10,r1 /* Save r1 */ ld r1,PACAEMERGSP(r13) /* Use emergency stack for realmode */ subi r1,r1,INT_FRAME_SIZE /* alloc stack frame */ - EXCEPTION_PROLOG_COMMON_1() + /* We don't touch AMR here, we never go to virtual mode */ - EXCEPTION_PROLOG_COMMON_2(PACA_EXGEN) - EXCEPTION_PROLOG_COMMON_3(0xe60) + INT_COMMON 0xe60, PACA_EXGEN, 0, 0, 0, 0, 0 + addi r3,r1,STACK_FRAME_OVERHEAD bl hmi_exception_realmode cmpdi cr0,r3,0 @@ -1734,23 +1600,25 @@ EXC_COMMON_BEGIN(hmi_exception_early_common) * firmware. 
*/ EXCEPTION_RESTORE_REGS EXC_HV - EXCEPTION_PROLOG_0 PACA_EXGEN - EXCEPTION_PROLOG_1 EXC_HV, PACA_EXGEN, 1, 0xe60, 0, 0, IRQS_DISABLED - EXCEPTION_PROLOG_2_REAL hmi_exception_common, EXC_HV, 1 + INT_HANDLER hmi_exception, 0xe60, hsrr=EXC_HV, bitmask=IRQS_DISABLED, kvm=1 EXC_COMMON_BEGIN(hmi_exception_common) - EXCEPTION_COMMON(PACA_EXGEN, 0xe60) + INT_COMMON 0xe60, PACA_EXGEN, 1, 1, 1, 0, 0 FINISH_NAP - bl save_nvgprs - RECONCILE_IRQ_STATE(r10, r11) RUNLATCH_ON + bl save_nvgprs addi r3,r1,STACK_FRAME_OVERHEAD bl handle_hmi_exception b ret_from_except -EXC_REAL_OOL_MASKABLE_HV(h_doorbell, 0xe80, 0x20, IRQS_DISABLED) -EXC_VIRT_OOL_MASKABLE_HV(h_doorbell, 0x4e80, 0x20, 0xe80, IRQS_DISABLED) -TRAMP_KVM_HV(PACA_EXGEN, 0xe80) + +EXC_REAL_BEGIN(h_doorbell, 0xe80, 0x20) + INT_HANDLER h_doorbell, 0xe80, ool=1, hsrr=EXC_HV, bitmask=IRQS_DISABLED, kvm=1 +EXC_REAL_END(h_doorbell, 0xe80, 0x20) +EXC_VIRT_BEGIN(h_doorbell, 0x4e80, 0x20) + INT_HANDLER h_doorbell, 0xe80, ool=1, virt=1, hsrr=EXC_HV, bitmask=IRQS_DISABLED, kvm=1 +EXC_VIRT_END(h_doorbell, 0x4e80, 0x20) +INT_KVM_HANDLER h_doorbell, 0xe80, EXC_HV, PACA_EXGEN, 0 #ifdef CONFIG_PPC_DOORBELL EXC_COMMON_ASYNC(h_doorbell_common, 0xe80, doorbell_exception) #else @@ -1758,9 +1626,13 @@ EXC_COMMON_ASYNC(h_doorbell_common, 0xe80, unknown_exception) #endif -EXC_REAL_OOL_MASKABLE_HV(h_virt_irq, 0xea0, 0x20, IRQS_DISABLED) -EXC_VIRT_OOL_MASKABLE_HV(h_virt_irq, 0x4ea0, 0x20, 0xea0, IRQS_DISABLED) -TRAMP_KVM_HV(PACA_EXGEN, 0xea0) +EXC_REAL_BEGIN(h_virt_irq, 0xea0, 0x20) + INT_HANDLER h_virt_irq, 0xea0, ool=1, hsrr=EXC_HV, bitmask=IRQS_DISABLED, kvm=1 +EXC_REAL_END(h_virt_irq, 0xea0, 0x20) +EXC_VIRT_BEGIN(h_virt_irq, 0x4ea0, 0x20) + INT_HANDLER h_virt_irq, 0xea0, ool=1, virt=1, hsrr=EXC_HV, bitmask=IRQS_DISABLED, kvm=1 +EXC_VIRT_END(h_virt_irq, 0x4ea0, 0x20) +INT_KVM_HANDLER h_virt_irq, 0xea0, EXC_HV, PACA_EXGEN, 0 EXC_COMMON_ASYNC(h_virt_irq_common, 0xea0, do_IRQ) @@ -1770,17 +1642,25 @@ EXC_REAL_NONE(0xee0, 0x20) 
EXC_VIRT_NONE(0x4ee0, 0x20) -EXC_REAL_OOL_MASKABLE(performance_monitor, 0xf00, 0x20, IRQS_PMI_DISABLED) -EXC_VIRT_OOL_MASKABLE(performance_monitor, 0x4f00, 0x20, 0xf00, IRQS_PMI_DISABLED) -TRAMP_KVM(PACA_EXGEN, 0xf00) +EXC_REAL_BEGIN(performance_monitor, 0xf00, 0x20) + INT_HANDLER performance_monitor, 0xf00, ool=1, bitmask=IRQS_PMI_DISABLED, kvm=1 +EXC_REAL_END(performance_monitor, 0xf00, 0x20) +EXC_VIRT_BEGIN(performance_monitor, 0x4f00, 0x20) + INT_HANDLER performance_monitor, 0xf00, ool=1, virt=1, bitmask=IRQS_PMI_DISABLED +EXC_VIRT_END(performance_monitor, 0x4f00, 0x20) +INT_KVM_HANDLER performance_monitor, 0xf00, EXC_STD, PACA_EXGEN, 0 EXC_COMMON_ASYNC(performance_monitor_common, 0xf00, performance_monitor_exception) -EXC_REAL_OOL(altivec_unavailable, 0xf20, 0x20) -EXC_VIRT_OOL(altivec_unavailable, 0x4f20, 0x20, 0xf20) -TRAMP_KVM(PACA_EXGEN, 0xf20) +EXC_REAL_BEGIN(altivec_unavailable, 0xf20, 0x20) + INT_HANDLER altivec_unavailable, 0xf20, ool=1, kvm=1 +EXC_REAL_END(altivec_unavailable, 0xf20, 0x20) +EXC_VIRT_BEGIN(altivec_unavailable, 0x4f20, 0x20) + INT_HANDLER altivec_unavailable, 0xf20, ool=1, virt=1 +EXC_VIRT_END(altivec_unavailable, 0x4f20, 0x20) +INT_KVM_HANDLER altivec_unavailable, 0xf20, EXC_STD, PACA_EXGEN, 0 EXC_COMMON_BEGIN(altivec_unavailable_common) - EXCEPTION_COMMON(PACA_EXGEN, 0xf20) + INT_COMMON 0xf20, PACA_EXGEN, 1, 1, 0, 0, 0 #ifdef CONFIG_ALTIVEC BEGIN_FTR_SECTION beq 1f @@ -1813,11 +1693,15 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) b ret_from_except -EXC_REAL_OOL(vsx_unavailable, 0xf40, 0x20) -EXC_VIRT_OOL(vsx_unavailable, 0x4f40, 0x20, 0xf40) -TRAMP_KVM(PACA_EXGEN, 0xf40) +EXC_REAL_BEGIN(vsx_unavailable, 0xf40, 0x20) + INT_HANDLER vsx_unavailable, 0xf40, ool=1, kvm=1 +EXC_REAL_END(vsx_unavailable, 0xf40, 0x20) +EXC_VIRT_BEGIN(vsx_unavailable, 0x4f40, 0x20) + INT_HANDLER vsx_unavailable, 0xf40, ool=1, virt=1 +EXC_VIRT_END(vsx_unavailable, 0x4f40, 0x20) +INT_KVM_HANDLER vsx_unavailable, 0xf40, EXC_STD, PACA_EXGEN, 0 
EXC_COMMON_BEGIN(vsx_unavailable_common) - EXCEPTION_COMMON(PACA_EXGEN, 0xf40) + INT_COMMON 0xf40, PACA_EXGEN, 1, 1, 0, 0, 0 #ifdef CONFIG_VSX BEGIN_FTR_SECTION beq 1f @@ -1849,15 +1733,23 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX) b ret_from_except -EXC_REAL_OOL(facility_unavailable, 0xf60, 0x20) -EXC_VIRT_OOL(facility_unavailable, 0x4f60, 0x20, 0xf60) -TRAMP_KVM(PACA_EXGEN, 0xf60) +EXC_REAL_BEGIN(facility_unavailable, 0xf60, 0x20) + INT_HANDLER facility_unavailable, 0xf60, ool=1, kvm=1 +EXC_REAL_END(facility_unavailable, 0xf60, 0x20) +EXC_VIRT_BEGIN(facility_unavailable, 0x4f60, 0x20) + INT_HANDLER facility_unavailable, 0xf60, ool=1, virt=1 +EXC_VIRT_END(facility_unavailable, 0x4f60, 0x20) +INT_KVM_HANDLER facility_unavailable, 0xf60, EXC_STD, PACA_EXGEN, 0 EXC_COMMON(facility_unavailable_common, 0xf60, facility_unavailable_exception) -EXC_REAL_OOL_HV(h_facility_unavailable, 0xf80, 0x20) -EXC_VIRT_OOL_HV(h_facility_unavailable, 0x4f80, 0x20, 0xf80) -TRAMP_KVM_HV(PACA_EXGEN, 0xf80) +EXC_REAL_BEGIN(h_facility_unavailable, 0xf80, 0x20) + INT_HANDLER h_facility_unavailable, 0xf80, ool=1, hsrr=EXC_HV, kvm=1 +EXC_REAL_END(h_facility_unavailable, 0xf80, 0x20) +EXC_VIRT_BEGIN(h_facility_unavailable, 0x4f80, 0x20) + INT_HANDLER h_facility_unavailable, 0xf80, ool=1, virt=1, hsrr=EXC_HV, kvm=1 +EXC_VIRT_END(h_facility_unavailable, 0x4f80, 0x20) +INT_KVM_HANDLER h_facility_unavailable, 0xf80, EXC_HV, PACA_EXGEN, 0 EXC_COMMON(h_facility_unavailable_common, 0xf80, facility_unavailable_exception) @@ -1874,9 +1766,11 @@ EXC_REAL_NONE(0x1100, 0x100) EXC_VIRT_NONE(0x5100, 0x100) #ifdef CONFIG_CBE_RAS -EXC_REAL_HV(cbe_system_error, 0x1200, 0x100) +EXC_REAL_BEGIN(cbe_system_error, 0x1200, 0x100) + INT_HANDLER cbe_system_error, 0x1200, ool=1, hsrr=EXC_HV, kvm=1 +EXC_REAL_END(cbe_system_error, 0x1200, 0x100) EXC_VIRT_NONE(0x5200, 0x100) -TRAMP_KVM_HV_SKIP(PACA_EXGEN, 0x1200) +INT_KVM_HANDLER cbe_system_error, 0x1200, EXC_HV, PACA_EXGEN, 1 EXC_COMMON(cbe_system_error_common, 0x1200, 
cbe_system_error_exception) #else /* CONFIG_CBE_RAS */ EXC_REAL_NONE(0x1200, 0x100) @@ -1884,37 +1778,43 @@ EXC_VIRT_NONE(0x5200, 0x100) #endif -EXC_REAL(instruction_breakpoint, 0x1300, 0x100) -EXC_VIRT(instruction_breakpoint, 0x5300, 0x100, 0x1300) -TRAMP_KVM_SKIP(PACA_EXGEN, 0x1300) +EXC_REAL_BEGIN(instruction_breakpoint, 0x1300, 0x100) + INT_HANDLER instruction_breakpoint, 0x1300, kvm=1 +EXC_REAL_END(instruction_breakpoint, 0x1300, 0x100) +EXC_VIRT_BEGIN(instruction_breakpoint, 0x5300, 0x100) + INT_HANDLER instruction_breakpoint, 0x1300, virt=1 +EXC_VIRT_END(instruction_breakpoint, 0x5300, 0x100) +INT_KVM_HANDLER instruction_breakpoint, 0x1300, EXC_STD, PACA_EXGEN, 1 EXC_COMMON(instruction_breakpoint_common, 0x1300, instruction_breakpoint_exception) + EXC_REAL_NONE(0x1400, 0x100) EXC_VIRT_NONE(0x5400, 0x100) EXC_REAL_BEGIN(denorm_exception_hv, 0x1500, 0x100) - EXCEPTION_PROLOG_0 PACA_EXGEN - EXCEPTION_PROLOG_1 EXC_HV, PACA_EXGEN, 0, 0x1500, 0, 0, 0 - + INT_HANDLER denorm_exception_hv, 0x1500, early=2, hsrr=EXC_HV #ifdef CONFIG_PPC_DENORMALISATION mfspr r10,SPRN_HSRR1 andis. r10,r10,(HSRR1_DENORM)@h /* denorm? */ bne+ denorm_assist #endif - - KVMTEST EXC_HV 0x1500 - EXCEPTION_PROLOG_2_REAL denorm_common, EXC_HV, 1 + KVMTEST denorm_exception_hv, EXC_HV 0x1500 + INT_SAVE_SRR_AND_JUMP denorm_common, EXC_HV, 1 EXC_REAL_END(denorm_exception_hv, 0x1500, 0x100) #ifdef CONFIG_PPC_DENORMALISATION EXC_VIRT_BEGIN(denorm_exception, 0x5500, 0x100) - b exc_real_0x1500_denorm_exception_hv + INT_HANDLER denorm_exception, 0x1500, 0, 2, 1, EXC_HV, PACA_EXGEN, 1, 0, 0, 0, 0 + mfspr r10,SPRN_HSRR1 + andis. r10,r10,(HSRR1_DENORM)@h /* denorm? 
*/ + bne+ denorm_assist + INT_VIRT_SAVE_SRR_AND_JUMP denorm_common, EXC_HV EXC_VIRT_END(denorm_exception, 0x5500, 0x100) #else EXC_VIRT_NONE(0x5500, 0x100) #endif -TRAMP_KVM_HV(PACA_EXGEN, 0x1500) +INT_KVM_HANDLER denorm_exception_hv, 0x1500, EXC_HV, PACA_EXGEN, 0 #ifdef CONFIG_PPC_DENORMALISATION TRAMP_REAL_BEGIN(denorm_assist) @@ -1989,9 +1889,11 @@ EXC_COMMON(denorm_common, 0x1500, unknown_exception) #ifdef CONFIG_CBE_RAS -EXC_REAL_HV(cbe_maintenance, 0x1600, 0x100) +EXC_REAL_BEGIN(cbe_maintenance, 0x1600, 0x100) + INT_HANDLER cbe_maintenance, 0x1600, ool=1, hsrr=EXC_HV, kvm=1 +EXC_REAL_END(cbe_maintenance, 0x1600, 0x100) EXC_VIRT_NONE(0x5600, 0x100) -TRAMP_KVM_HV_SKIP(PACA_EXGEN, 0x1600) +INT_KVM_HANDLER cbe_maintenance, 0x1600, EXC_HV, PACA_EXGEN, 1 EXC_COMMON(cbe_maintenance_common, 0x1600, cbe_maintenance_exception) #else /* CONFIG_CBE_RAS */ EXC_REAL_NONE(0x1600, 0x100) @@ -1999,9 +1901,13 @@ EXC_VIRT_NONE(0x5600, 0x100) #endif -EXC_REAL(altivec_assist, 0x1700, 0x100) -EXC_VIRT(altivec_assist, 0x5700, 0x100, 0x1700) -TRAMP_KVM(PACA_EXGEN, 0x1700) +EXC_REAL_BEGIN(altivec_assist, 0x1700, 0x100) + INT_HANDLER altivec_assist, 0x1700, kvm=1 +EXC_REAL_END(altivec_assist, 0x1700, 0x100) +EXC_VIRT_BEGIN(altivec_assist, 0x5700, 0x100) + INT_HANDLER altivec_assist, 0x1700, virt=1 +EXC_VIRT_END(altivec_assist, 0x5700, 0x100) +INT_KVM_HANDLER altivec_assist, 0x1700, EXC_STD, PACA_EXGEN, 0 #ifdef CONFIG_ALTIVEC EXC_COMMON(altivec_assist_common, 0x1700, altivec_assist_exception) #else @@ -2010,15 +1916,18 @@ EXC_COMMON(altivec_assist_common, 0x1700, unknown_exception) #ifdef CONFIG_CBE_RAS -EXC_REAL_HV(cbe_thermal, 0x1800, 0x100) +EXC_REAL_BEGIN(cbe_thermal, 0x1800, 0x100) + INT_HANDLER cbe_thermal, 0x1800, ool=1, hsrr=EXC_HV, kvm=1 +EXC_REAL_END(cbe_thermal, 0x1800, 0x100) EXC_VIRT_NONE(0x5800, 0x100) -TRAMP_KVM_HV_SKIP(PACA_EXGEN, 0x1800) +INT_KVM_HANDLER cbe_thermal, 0x1800, EXC_HV, PACA_EXGEN, 1 EXC_COMMON(cbe_thermal_common, 0x1800, cbe_thermal_exception) #else /* 
CONFIG_CBE_RAS */ EXC_REAL_NONE(0x1800, 0x100) EXC_VIRT_NONE(0x5800, 0x100) #endif + #ifdef CONFIG_PPC_WATCHDOG #define MASKED_DEC_HANDLER_LABEL 3f @@ -2028,7 +1937,7 @@ EXC_VIRT_NONE(0x5800, 0x100) std r12,PACA_EXGEN+EX_R12(r13); \ GET_SCRATCH0(r10); \ std r10,PACA_EXGEN+EX_R13(r13); \ - EXCEPTION_PROLOG_2_REAL soft_nmi_common, _H, 1 + INT_SAVE_SRR_AND_JUMP soft_nmi_common, _H, 1 /* * Branch to soft_nmi_interrupt using the emergency stack. The emergency @@ -2043,9 +1952,8 @@ EXC_COMMON_BEGIN(soft_nmi_common) mr r10,r1 ld r1,PACAEMERGSP(r13) subi r1,r1,INT_FRAME_SIZE - EXCEPTION_COMMON_STACK(PACA_EXGEN, 0x900) + INT_COMMON 0x900, PACA_EXGEN, 0, 1, 1, 0, 0 bl save_nvgprs - RECONCILE_IRQ_STATE(r10, r11) addi r3,r1,STACK_FRAME_OVERHEAD bl soft_nmi_interrupt b ret_from_except @@ -2287,11 +2195,20 @@ __end_interrupts: DEFINE_FIXED_SYMBOL(__end_interrupts) #ifdef CONFIG_PPC_970_NAP + /* + * Called by exception entry code if _TLF_NAPPING was set, this clears + * the NAPPING flag, and redirects the exception exit to + * power4_fixup_nap_return. 
+ */ + .globl power4_fixup_nap EXC_COMMON_BEGIN(power4_fixup_nap) andc r9,r9,r10 std r9,TI_LOCAL_FLAGS(r11) - ld r10,_LINK(r1) /* make idle task do the */ - std r10,_NIP(r1) /* equivalent of a blr */ + LOAD_REG_ADDR(r10, power4_idle_nap_return) + std r10,_NIP(r1) + blr + +power4_idle_nap_return: blr #endif @@ -2302,6 +2219,35 @@ CLOSE_FIXED_SECTION(virt_trampolines); USE_TEXT_SECTION() +/* MSR[RI] should be clear because this uses SRR[01] */ +enable_machine_check: + mflr r0 + bcl 20,31,$+4 +0: mflr r3 + addi r3,r3,(1f - 0b) + mtspr SPRN_SRR0,r3 + mfmsr r3 + ori r3,r3,MSR_ME + mtspr SPRN_SRR1,r3 + RFI_TO_KERNEL +1: mtlr r0 + blr + +/* MSR[RI] should be clear because this uses SRR[01] */ +disable_machine_check: + mflr r0 + bcl 20,31,$+4 +0: mflr r3 + addi r3,r3,(1f - 0b) + mtspr SPRN_SRR0,r3 + mfmsr r3 + li r4,MSR_ME + andc r3,r3,r4 + mtspr SPRN_SRR1,r3 + RFI_TO_KERNEL +1: mtlr r0 + blr + /* * Hash table stuff */ @@ -2310,7 +2256,7 @@ do_hash_page: #ifdef CONFIG_PPC_BOOK3S_64 lis r0,(DSISR_BAD_FAULT_64S | DSISR_DABRMATCH | DSISR_KEYFAULT)@h ori r0,r0,DSISR_BAD_FAULT_64S@l - and. r0,r4,r0 /* weird error? */ + and. r0,r5,r0 /* weird error? 
*/ bne- handle_page_fault /* if not, try to insert a HPTE */ ld r11, PACA_THREAD_INFO(r13) lwz r0,TI_PREEMPT(r11) /* If we're in an "NMI" */ @@ -2318,15 +2264,13 @@ do_hash_page: bne 77f /* then don't call hash_page now */ /* - * r3 contains the faulting address - * r4 msr - * r5 contains the trap number - * r6 contains dsisr + * r3 contains the trap number + * r4 contains the faulting address + * r5 contains dsisr + * r6 msr * * at return r3 = 0 for success, 1 for page fault, negative for error */ - mr r4,r12 - ld r6,_DSISR(r1) bl __hash_page /* build HPTE if possible */ cmpdi r3,0 /* see if __hash_page succeeded */ @@ -2336,16 +2280,15 @@ do_hash_page: /* Error */ blt- 13f - /* Reload DSISR into r4 for the DABR check below */ - ld r4,_DSISR(r1) + /* Reload DAR/DSISR into r4/r5 for the DABR check below */ + ld r4,_DAR(r1) + ld r5,_DSISR(r1) #endif /* CONFIG_PPC_BOOK3S_64 */ /* Here we have a page fault that hash_page can't handle. */ handle_page_fault: -11: andis. r0,r4,DSISR_DABRMATCH@h +11: andis. r0,r5,DSISR_DABRMATCH@h bne- handle_dabr_fault - ld r4,_DAR(r1) - ld r5,_DSISR(r1) addi r3,r1,STACK_FRAME_OVERHEAD bl do_page_fault cmpdi r3,0 @@ -2353,7 +2296,7 @@ handle_page_fault: bl save_nvgprs mr r5,r3 addi r3,r1,STACK_FRAME_OVERHEAD - lwz r4,_DAR(r1) + ld r4,_DAR(r1) bl bad_page_fault b ret_from_except @@ -2392,7 +2335,6 @@ handle_dabr_fault: * the access, or panic if there isn't a handler. 
*/ 77: bl save_nvgprs - mr r4,r3 addi r3,r1,STACK_FRAME_OVERHEAD li r5,SIGSEGV bl bad_page_fault diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 4eab97292cc2..ff0114aeba9b 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -28,24 +28,22 @@ #include <asm/debugfs.h> #include <asm/page.h> #include <asm/prom.h> -#include <asm/rtas.h> #include <asm/fadump.h> +#include <asm/fadump-internal.h> #include <asm/setup.h> static struct fw_dump fw_dump; -static struct fadump_mem_struct fdm; -static const struct fadump_mem_struct *fdm_active; -#ifdef CONFIG_CMA -static struct cma *fadump_cma; -#endif +static void __init fadump_reserve_crash_area(u64 base); + +#ifndef CONFIG_PRESERVE_FA_DUMP static DEFINE_MUTEX(fadump_mutex); -struct fad_crash_memory_ranges *crash_memory_ranges; -int crash_memory_ranges_size; -int crash_mem_ranges; -int max_crash_mem_ranges; +struct fadump_mrange_info crash_mrange_info = { "crash", NULL, 0, 0, 0 }; +struct fadump_mrange_info reserved_mrange_info = { "reserved", NULL, 0, 0, 0 }; #ifdef CONFIG_CMA +static struct cma *fadump_cma; + /* * fadump_cma_init() - Initialize CMA area from a fadump reserved memory * @@ -107,84 +105,45 @@ static int __init fadump_cma_init(void) { return 1; } #endif /* CONFIG_CMA */ /* Scan the Firmware Assisted dump configuration details. */ -int __init early_init_dt_scan_fw_dump(unsigned long node, - const char *uname, int depth, void *data) +int __init early_init_dt_scan_fw_dump(unsigned long node, const char *uname, + int depth, void *data) { - const __be32 *sections; - int i, num_sections; - int size; - const __be32 *token; - - if (depth != 1 || strcmp(uname, "rtas") != 0) + if (depth != 1) return 0; - /* - * Check if Firmware Assisted dump is supported. if yes, check - * if dump has been initiated on last reboot. 
- */ - token = of_get_flat_dt_prop(node, "ibm,configure-kernel-dump", NULL); - if (!token) + if (strcmp(uname, "rtas") == 0) { + rtas_fadump_dt_scan(&fw_dump, node); return 1; + } - fw_dump.fadump_supported = 1; - fw_dump.ibm_configure_kernel_dump = be32_to_cpu(*token); - - /* - * The 'ibm,kernel-dump' rtas node is present only if there is - * dump data waiting for us. - */ - fdm_active = of_get_flat_dt_prop(node, "ibm,kernel-dump", NULL); - if (fdm_active) - fw_dump.dump_active = 1; - - /* Get the sizes required to store dump data for the firmware provided - * dump sections. - * For each dump section type supported, a 32bit cell which defines - * the ID of a supported section followed by two 32 bit cells which - * gives teh size of the section in bytes. - */ - sections = of_get_flat_dt_prop(node, "ibm,configure-kernel-dump-sizes", - &size); - - if (!sections) + if (strcmp(uname, "ibm,opal") == 0) { + opal_fadump_dt_scan(&fw_dump, node); return 1; - - num_sections = size / (3 * sizeof(u32)); - - for (i = 0; i < num_sections; i++, sections += 3) { - u32 type = (u32)of_read_number(sections, 1); - - switch (type) { - case FADUMP_CPU_STATE_DATA: - fw_dump.cpu_state_data_size = - of_read_ulong(&sections[1], 2); - break; - case FADUMP_HPTE_REGION: - fw_dump.hpte_region_size = - of_read_ulong(&sections[1], 2); - break; - } } - return 1; + return 0; } /* * If fadump is registered, check if the memory provided * falls within boot memory area and reserved memory area.
*/ -int is_fadump_memory_area(u64 addr, ulong size) +int is_fadump_memory_area(u64 addr, unsigned long size) { - u64 d_start = fw_dump.reserve_dump_area_start; - u64 d_end = d_start + fw_dump.reserve_dump_area_size; + u64 d_start, d_end; if (!fw_dump.dump_registered) return 0; + if (!size) + return 0; + + d_start = fw_dump.reserve_dump_area_start; + d_end = d_start + fw_dump.reserve_dump_area_size; if (((addr + size) > d_start) && (addr <= d_end)) return 1; - return (addr + size) > RMA_START && addr <= fw_dump.boot_memory_size; + return (addr <= fw_dump.boot_mem_top); } int should_fadump_crash(void) @@ -200,31 +159,29 @@ int is_fadump_active(void) } /* - * Returns 1, if there are no holes in boot memory area, - * 0 otherwise. + * Returns true, if there are no holes in memory area between d_start to d_end, + * false otherwise. */ -static int is_boot_memory_area_contiguous(void) +static bool is_fadump_mem_area_contiguous(u64 d_start, u64 d_end) { struct memblock_region *reg; - unsigned long tstart, tend; - unsigned long start_pfn = PHYS_PFN(RMA_START); - unsigned long end_pfn = PHYS_PFN(RMA_START + fw_dump.boot_memory_size); - unsigned int ret = 0; + bool ret = false; + u64 start, end; for_each_memblock(memory, reg) { - tstart = max(start_pfn, memblock_region_memory_base_pfn(reg)); - tend = min(end_pfn, memblock_region_memory_end_pfn(reg)); - if (tstart < tend) { - /* Memory hole from start_pfn to tstart */ - if (tstart > start_pfn) + start = max_t(u64, d_start, reg->base); + end = min_t(u64, d_end, (reg->base + reg->size)); + if (d_start < end) { + /* Memory hole from d_start to start */ + if (start > d_start) break; - if (tend == end_pfn) { - ret = 1; + if (end == d_end) { + ret = true; break; } - start_pfn = tend + 1; + d_start = end + 1; } } @@ -232,37 +189,45 @@ static int is_boot_memory_area_contiguous(void) } /* - * Returns true, if there are no holes in reserved memory area, + * Returns true, if there are no holes in boot memory area, * false otherwise. 
*/ -static bool is_reserved_memory_area_contiguous(void) +bool is_fadump_boot_mem_contiguous(void) { - struct memblock_region *reg; - unsigned long start, end; - unsigned long d_start = fw_dump.reserve_dump_area_start; - unsigned long d_end = d_start + fw_dump.reserve_dump_area_size; - - for_each_memblock(memory, reg) { - start = max(d_start, (unsigned long)reg->base); - end = min(d_end, (unsigned long)(reg->base + reg->size)); - if (d_start < end) { - /* Memory hole from d_start to start */ - if (start > d_start) - break; + unsigned long d_start, d_end; + bool ret = false; + int i; - if (end == d_end) - return true; + for (i = 0; i < fw_dump.boot_mem_regs_cnt; i++) { + d_start = fw_dump.boot_mem_addr[i]; + d_end = d_start + fw_dump.boot_mem_sz[i]; - d_start = end + 1; - } + ret = is_fadump_mem_area_contiguous(d_start, d_end); + if (!ret) + break; } - return false; + return ret; +} + +/* + * Returns true, if there are no holes in reserved memory area, + * false otherwise. + */ +bool is_fadump_reserved_mem_contiguous(void) +{ + u64 d_start, d_end; + + d_start = fw_dump.reserve_dump_area_start; + d_end = d_start + fw_dump.reserve_dump_area_size; + return is_fadump_mem_area_contiguous(d_start, d_end); } /* Print firmware assisted dump configurations for debugging purpose. */ static void fadump_show_config(void) { + int i; + pr_debug("Support for firmware-assisted dump (fadump): %s\n", (fw_dump.fadump_supported ? 
"present" : "no support")); @@ -276,62 +241,13 @@ static void fadump_show_config(void) pr_debug("Dump section sizes:\n"); pr_debug(" CPU state data size: %lx\n", fw_dump.cpu_state_data_size); pr_debug(" HPTE region size : %lx\n", fw_dump.hpte_region_size); - pr_debug("Boot memory size : %lx\n", fw_dump.boot_memory_size); -} - -static unsigned long init_fadump_mem_struct(struct fadump_mem_struct *fdm, - unsigned long addr) -{ - if (!fdm) - return 0; - - memset(fdm, 0, sizeof(struct fadump_mem_struct)); - addr = addr & PAGE_MASK; - - fdm->header.dump_format_version = cpu_to_be32(0x00000001); - fdm->header.dump_num_sections = cpu_to_be16(3); - fdm->header.dump_status_flag = 0; - fdm->header.offset_first_dump_section = - cpu_to_be32((u32)offsetof(struct fadump_mem_struct, cpu_state_data)); - - /* - * Fields for disk dump option. - * We are not using disk dump option, hence set these fields to 0. - */ - fdm->header.dd_block_size = 0; - fdm->header.dd_block_offset = 0; - fdm->header.dd_num_blocks = 0; - fdm->header.dd_offset_disk_path = 0; - - /* set 0 to disable an automatic dump-reboot. */ - fdm->header.max_time_auto = 0; - - /* Kernel dump sections */ - /* cpu state data section. 
*/ - fdm->cpu_state_data.request_flag = cpu_to_be32(FADUMP_REQUEST_FLAG); - fdm->cpu_state_data.source_data_type = cpu_to_be16(FADUMP_CPU_STATE_DATA); - fdm->cpu_state_data.source_address = 0; - fdm->cpu_state_data.source_len = cpu_to_be64(fw_dump.cpu_state_data_size); - fdm->cpu_state_data.destination_address = cpu_to_be64(addr); - addr += fw_dump.cpu_state_data_size; - - /* hpte region section */ - fdm->hpte_region.request_flag = cpu_to_be32(FADUMP_REQUEST_FLAG); - fdm->hpte_region.source_data_type = cpu_to_be16(FADUMP_HPTE_REGION); - fdm->hpte_region.source_address = 0; - fdm->hpte_region.source_len = cpu_to_be64(fw_dump.hpte_region_size); - fdm->hpte_region.destination_address = cpu_to_be64(addr); - addr += fw_dump.hpte_region_size; - - /* RMA region section */ - fdm->rmr_region.request_flag = cpu_to_be32(FADUMP_REQUEST_FLAG); - fdm->rmr_region.source_data_type = cpu_to_be16(FADUMP_REAL_MODE_REGION); - fdm->rmr_region.source_address = cpu_to_be64(RMA_START); - fdm->rmr_region.source_len = cpu_to_be64(fw_dump.boot_memory_size); - fdm->rmr_region.destination_address = cpu_to_be64(addr); - addr += fw_dump.boot_memory_size; - - return addr; + pr_debug(" Boot memory size : %lx\n", fw_dump.boot_memory_size); + pr_debug(" Boot memory top : %llx\n", fw_dump.boot_mem_top); + pr_debug("Boot memory regions cnt: %llx\n", fw_dump.boot_mem_regs_cnt); + for (i = 0; i < fw_dump.boot_mem_regs_cnt; i++) { + pr_debug("[%03d] base = %llx, size = %llx\n", i, + fw_dump.boot_mem_addr[i], fw_dump.boot_mem_sz[i]); + } } /** @@ -349,10 +265,10 @@ static unsigned long init_fadump_mem_struct(struct fadump_mem_struct *fdm, * that is required for a kernel to boot successfully. 
* */ -static inline unsigned long fadump_calculate_reserve_size(void) +static inline u64 fadump_calculate_reserve_size(void) { + u64 base, size, bootmem_min; int ret; - unsigned long long base, size; if (fw_dump.reserve_bootvar) pr_warn("'fadump_reserve_mem=' parameter is deprecated in favor of 'crashkernel=' parameter.\n"); @@ -402,7 +318,8 @@ static inline unsigned long fadump_calculate_reserve_size(void) if (memory_limit && size > memory_limit) size = memory_limit; - return (size > MIN_BOOT_MEM ? size : MIN_BOOT_MEM); + bootmem_min = fw_dump.ops->fadump_get_bootmem_min(); + return (size > bootmem_min ? size : bootmem_min); } /* @@ -423,57 +340,136 @@ static unsigned long get_fadump_area_size(void) size += sizeof(struct elf_phdr) * (memblock_num_regions(memory) + 2); size = PAGE_ALIGN(size); + + /* This is to hold kernel metadata on platforms that support it */ + size += (fw_dump.ops->fadump_get_metadata_size ? + fw_dump.ops->fadump_get_metadata_size() : 0); return size; } -static void __init fadump_reserve_crash_area(unsigned long base, - unsigned long size) +static int __init add_boot_mem_region(unsigned long rstart, + unsigned long rsize) +{ + int i = fw_dump.boot_mem_regs_cnt++; + + if (fw_dump.boot_mem_regs_cnt > FADUMP_MAX_MEM_REGS) { + fw_dump.boot_mem_regs_cnt = FADUMP_MAX_MEM_REGS; + return 0; + } + + pr_debug("Added boot memory range[%d] [%#016lx-%#016lx)\n", + i, rstart, (rstart + rsize)); + fw_dump.boot_mem_addr[i] = rstart; + fw_dump.boot_mem_sz[i] = rsize; + return 1; +} + +/* + * Firmware usually has a hard limit on the data it can copy per region. + * Honour that by splitting a memory range into multiple regions. + */ +static int __init add_boot_mem_regions(unsigned long mstart, + unsigned long msize) +{ + unsigned long rstart, rsize, max_size; + int ret = 1; + + rstart = mstart; + max_size = fw_dump.max_copy_size ? 
fw_dump.max_copy_size : msize; + while (msize) { + if (msize > max_size) + rsize = max_size; + else + rsize = msize; + + ret = add_boot_mem_region(rstart, rsize); + if (!ret) + break; + + msize -= rsize; + rstart += rsize; + } + + return ret; +} + +static int __init fadump_get_boot_mem_regions(void) { + unsigned long base, size, cur_size, hole_size, last_end; + unsigned long mem_size = fw_dump.boot_memory_size; struct memblock_region *reg; - unsigned long mstart, mend, msize; + int ret = 1; + + fw_dump.boot_mem_regs_cnt = 0; + last_end = 0; + hole_size = 0; + cur_size = 0; for_each_memblock(memory, reg) { - mstart = max_t(unsigned long, base, reg->base); - mend = reg->base + reg->size; - mend = min(base + size, mend); - - if (mstart < mend) { - msize = mend - mstart; - memblock_reserve(mstart, msize); - pr_info("Reserved %ldMB of memory at %#016lx for saving crash dump\n", - (msize >> 20), mstart); + base = reg->base; + size = reg->size; + hole_size += (base - last_end); + + if ((cur_size + size) >= mem_size) { + size = (mem_size - cur_size); + ret = add_boot_mem_regions(base, size); + break; } + + mem_size -= size; + cur_size += size; + ret = add_boot_mem_regions(base, size); + if (!ret) + break; + + last_end = base + size; } + fw_dump.boot_mem_top = PAGE_ALIGN(fw_dump.boot_memory_size + hole_size); + + return ret; } int __init fadump_reserve_mem(void) { - unsigned long base, size, memory_boundary; + u64 base, size, mem_boundary, bootmem_min, align = PAGE_SIZE; + bool is_memblock_bottom_up = memblock_bottom_up(); + int ret = 1; if (!fw_dump.fadump_enabled) return 0; if (!fw_dump.fadump_supported) { - printk(KERN_INFO "Firmware-assisted dump is not supported on" - " this hardware\n"); - fw_dump.fadump_enabled = 0; - return 0; + pr_info("Firmware-Assisted Dump is not supported on this hardware\n"); + goto error_out; } + /* * Initialize boot memory size * If dump is active then we have already calculated the size during * first kernel. 
*/ - if (fdm_active) - fw_dump.boot_memory_size = be64_to_cpu(fdm_active->rmr_region.source_len); - else { - fw_dump.boot_memory_size = fadump_calculate_reserve_size(); + if (!fw_dump.dump_active) { + fw_dump.boot_memory_size = + PAGE_ALIGN(fadump_calculate_reserve_size()); #ifdef CONFIG_CMA - if (!fw_dump.nocma) + if (!fw_dump.nocma) { + align = FADUMP_CMA_ALIGNMENT; fw_dump.boot_memory_size = - ALIGN(fw_dump.boot_memory_size, - FADUMP_CMA_ALIGNMENT); + ALIGN(fw_dump.boot_memory_size, align); + } #endif + + bootmem_min = fw_dump.ops->fadump_get_bootmem_min(); + if (fw_dump.boot_memory_size < bootmem_min) { + pr_err("Can't enable fadump with boot memory size (0x%lx) less than 0x%llx\n", + fw_dump.boot_memory_size, bootmem_min); + goto error_out; + } + + if (!fadump_get_boot_mem_regions()) { + pr_err("Too many holes in boot memory area to enable fadump\n"); + goto error_out; + } } /* @@ -493,10 +489,13 @@ int __init fadump_reserve_mem(void) " dump, now %#016llx\n", memory_limit); } if (memory_limit) - memory_boundary = memory_limit; + mem_boundary = memory_limit; else - memory_boundary = memblock_end_of_DRAM(); + mem_boundary = memblock_end_of_DRAM(); + base = fw_dump.boot_mem_top; + size = get_fadump_area_size(); + fw_dump.reserve_dump_area_size = size; if (fw_dump.dump_active) { pr_info("Firmware-assisted dump is active.\n"); @@ -510,58 +509,55 @@ int __init fadump_reserve_mem(void) #endif /* * If last boot has crashed then reserve all the memory - * above boot_memory_size so that we don't touch it until + * above boot memory size so that we don't touch it until * dump is written to disk by userspace tool. This memory - * will be released for general use once the dump is saved. + * can be released for general use by invalidating fadump. 
*/ - base = fw_dump.boot_memory_size; - size = memory_boundary - base; - fadump_reserve_crash_area(base, size); - - fw_dump.fadumphdr_addr = - be64_to_cpu(fdm_active->rmr_region.destination_address) + - be64_to_cpu(fdm_active->rmr_region.source_len); - pr_debug("fadumphdr_addr = %pa\n", &fw_dump.fadumphdr_addr); - fw_dump.reserve_dump_area_start = base; - fw_dump.reserve_dump_area_size = size; - } else { - size = get_fadump_area_size(); + fadump_reserve_crash_area(base); + pr_debug("fadumphdr_addr = %#016lx\n", fw_dump.fadumphdr_addr); + pr_debug("Reserve dump area start address: 0x%lx\n", + fw_dump.reserve_dump_area_start); + } else { /* * Reserve memory at an offset closer to bottom of the RAM to - * minimize the impact of memory hot-remove operation. We can't - * use memblock_find_in_range() here since it doesn't allocate - * from bottom to top. + * minimize the impact of memory hot-remove operation. */ - for (base = fw_dump.boot_memory_size; - base <= (memory_boundary - size); - base += size) { - if (memblock_is_region_memory(base, size) && - !memblock_is_region_reserved(base, size)) - break; + memblock_set_bottom_up(true); + base = memblock_find_in_range(base, mem_boundary, size, align); + + /* Restore the previous allocation mode */ + memblock_set_bottom_up(is_memblock_bottom_up); + + if (!base) { + pr_err("Failed to find memory chunk for reservation!\n"); + goto error_out; } - if ((base > (memory_boundary - size)) || - memblock_reserve(base, size)) { - pr_err("Failed to reserve memory\n"); - return 0; + fw_dump.reserve_dump_area_start = base; + + /* + * Calculate the kernel metadata address and register it with + * f/w if the platform supports. 
+ */ + if (fw_dump.ops->fadump_setup_metadata && + (fw_dump.ops->fadump_setup_metadata(&fw_dump) < 0)) + goto error_out; + + if (memblock_reserve(base, size)) { + pr_err("Failed to reserve memory!\n"); + goto error_out; } - pr_info("Reserved %ldMB of memory at %ldMB for firmware-" - "assisted dump (System RAM: %ldMB)\n", - (unsigned long)(size >> 20), - (unsigned long)(base >> 20), - (unsigned long)(memblock_phys_mem_size() >> 20)); + pr_info("Reserved %lldMB of memory at %#016llx (System RAM: %lldMB)\n", + (size >> 20), base, (memblock_phys_mem_size() >> 20)); - fw_dump.reserve_dump_area_start = base; - fw_dump.reserve_dump_area_size = size; - return fadump_cma_init(); + ret = fadump_cma_init(); } - return 1; -} -unsigned long __init arch_reserved_kernel_pages(void) -{ - return memblock_reserved_size() / PAGE_SIZE; + return ret; +error_out: + fw_dump.fadump_enabled = 0; + return 0; } /* Look for fadump= cmdline option. */ @@ -596,61 +592,6 @@ static int __init early_fadump_reserve_mem(char *p) } early_param("fadump_reserve_mem", early_fadump_reserve_mem); -static int register_fw_dump(struct fadump_mem_struct *fdm) -{ - int rc, err; - unsigned int wait_time; - - pr_debug("Registering for firmware-assisted kernel dump...\n"); - - /* TODO: Add upper time limit for the delay */ - do { - rc = rtas_call(fw_dump.ibm_configure_kernel_dump, 3, 1, NULL, - FADUMP_REGISTER, fdm, - sizeof(struct fadump_mem_struct)); - - wait_time = rtas_busy_delay_time(rc); - if (wait_time) - mdelay(wait_time); - - } while (wait_time); - - err = -EIO; - switch (rc) { - default: - pr_err("Failed to register. Unknown Error(%d).\n", rc); - break; - case -1: - printk(KERN_ERR "Failed to register firmware-assisted kernel" - " dump. 
Hardware Error(%d).\n", rc); - break; - case -3: - if (!is_boot_memory_area_contiguous()) - pr_err("Can't have holes in boot memory area while registering fadump\n"); - else if (!is_reserved_memory_area_contiguous()) - pr_err("Can't have holes in reserved memory area while" - " registering fadump\n"); - - printk(KERN_ERR "Failed to register firmware-assisted kernel" - " dump. Parameter Error(%d).\n", rc); - err = -EINVAL; - break; - case -9: - printk(KERN_ERR "firmware-assisted kernel dump is already " - " registered."); - fw_dump.dump_registered = 1; - err = -EEXIST; - break; - case 0: - printk(KERN_INFO "firmware-assisted kernel dump registration" - " is successful\n"); - fw_dump.dump_registered = 1; - err = 0; - break; - } - return err; -} - void crash_fadump(struct pt_regs *regs, const char *str) { struct fadump_crash_info_header *fdh = NULL; @@ -693,71 +634,10 @@ void crash_fadump(struct pt_regs *regs, const char *str) fdh->online_mask = *cpu_online_mask; - /* Call ibm,os-term rtas call to trigger firmware assisted dump */ - rtas_os_term((char *)str); -} - -#define GPR_MASK 0xffffff0000000000 -static inline int fadump_gpr_index(u64 id) -{ - int i = -1; - char str[3]; - - if ((id & GPR_MASK) == REG_ID("GPR")) { - /* get the digits at the end */ - id &= ~GPR_MASK; - id >>= 24; - str[2] = '\0'; - str[1] = id & 0xff; - str[0] = (id >> 8) & 0xff; - sscanf(str, "%d", &i); - if (i > 31) - i = -1; - } - return i; -} - -static inline void fadump_set_regval(struct pt_regs *regs, u64 reg_id, - u64 reg_val) -{ - int i; - - i = fadump_gpr_index(reg_id); - if (i >= 0) - regs->gpr[i] = (unsigned long)reg_val; - else if (reg_id == REG_ID("NIA")) - regs->nip = (unsigned long)reg_val; - else if (reg_id == REG_ID("MSR")) - regs->msr = (unsigned long)reg_val; - else if (reg_id == REG_ID("CTR")) - regs->ctr = (unsigned long)reg_val; - else if (reg_id == REG_ID("LR")) - regs->link = (unsigned long)reg_val; - else if (reg_id == REG_ID("XER")) - regs->xer = (unsigned long)reg_val; - 
else if (reg_id == REG_ID("CR")) - regs->ccr = (unsigned long)reg_val; - else if (reg_id == REG_ID("DAR")) - regs->dar = (unsigned long)reg_val; - else if (reg_id == REG_ID("DSISR")) - regs->dsisr = (unsigned long)reg_val; -} - -static struct fadump_reg_entry* -fadump_read_registers(struct fadump_reg_entry *reg_entry, struct pt_regs *regs) -{ - memset(regs, 0, sizeof(struct pt_regs)); - - while (be64_to_cpu(reg_entry->reg_id) != REG_ID("CPUEND")) { - fadump_set_regval(regs, be64_to_cpu(reg_entry->reg_id), - be64_to_cpu(reg_entry->reg_value)); - reg_entry++; - } - reg_entry++; - return reg_entry; + fw_dump.ops->fadump_trigger(fdh, str); } -static u32 *fadump_regs_to_elf_notes(u32 *buf, struct pt_regs *regs) +u32 *fadump_regs_to_elf_notes(u32 *buf, struct pt_regs *regs) { struct elf_prstatus prstatus; @@ -772,7 +652,7 @@ static u32 *fadump_regs_to_elf_notes(u32 *buf, struct pt_regs *regs) return buf; } -static void fadump_update_elfcore_header(char *bufp) +void fadump_update_elfcore_header(char *bufp) { struct elfhdr *elf; struct elf_phdr *phdr; @@ -784,7 +664,7 @@ static void fadump_update_elfcore_header(char *bufp) phdr = (struct elf_phdr *)bufp; if (phdr->p_type == PT_NOTE) { - phdr->p_paddr = fw_dump.cpu_notes_buf; + phdr->p_paddr = __pa(fw_dump.cpu_notes_buf_vaddr); phdr->p_offset = phdr->p_paddr; phdr->p_filesz = fw_dump.cpu_notes_buf_size; phdr->p_memsz = fw_dump.cpu_notes_buf_size; @@ -792,228 +672,100 @@ static void fadump_update_elfcore_header(char *bufp) return; } -static void *fadump_cpu_notes_buf_alloc(unsigned long size) +static void *fadump_alloc_buffer(unsigned long size) { - void *vaddr; + unsigned long count, i; struct page *page; - unsigned long order, count, i; + void *vaddr; - order = get_order(size); - vaddr = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, order); + vaddr = alloc_pages_exact(size, GFP_KERNEL | __GFP_ZERO); if (!vaddr) return NULL; - count = 1 << order; + count = PAGE_ALIGN(size) / PAGE_SIZE; page = virt_to_page(vaddr); for (i = 
0; i < count; i++) - SetPageReserved(page + i); + mark_page_reserved(page + i); return vaddr; } -static void fadump_cpu_notes_buf_free(unsigned long vaddr, unsigned long size) +static void fadump_free_buffer(unsigned long vaddr, unsigned long size) { - struct page *page; - unsigned long order, count, i; - - order = get_order(size); - count = 1 << order; - page = virt_to_page(vaddr); - for (i = 0; i < count; i++) - ClearPageReserved(page + i); - __free_pages(page, order); + free_reserved_area((void *)vaddr, (void *)(vaddr + size), -1, NULL); } -/* - * Read CPU state dump data and convert it into ELF notes. - * The CPU dump starts with magic number "REGSAVE". NumCpusOffset should be - * used to access the data to allow for additional fields to be added without - * affecting compatibility. Each list of registers for a CPU starts with - * "CPUSTRT" and ends with "CPUEND". Each register entry is of 16 bytes, - * 8 Byte ASCII identifier and 8 Byte register value. The register entry - * with identifier "CPUSTRT" and "CPUEND" contains 4 byte cpu id as part - * of register value. For more details refer to PAPR document. - * - * Only for the crashing cpu we ignore the CPU dump data and get exact - * state from fadump crash info structure populated by first kernel at the - * time of crash. 
- */ -static int __init fadump_build_cpu_notes(const struct fadump_mem_struct *fdm) +s32 fadump_setup_cpu_notes_buf(u32 num_cpus) { - struct fadump_reg_save_area_header *reg_header; - struct fadump_reg_entry *reg_entry; - struct fadump_crash_info_header *fdh = NULL; - void *vaddr; - unsigned long addr; - u32 num_cpus, *note_buf; - struct pt_regs regs; - int i, rc = 0, cpu = 0; - - if (!fdm->cpu_state_data.bytes_dumped) - return -EINVAL; - - addr = be64_to_cpu(fdm->cpu_state_data.destination_address); - vaddr = __va(addr); - - reg_header = vaddr; - if (be64_to_cpu(reg_header->magic_number) != REGSAVE_AREA_MAGIC) { - printk(KERN_ERR "Unable to read register save area.\n"); - return -ENOENT; - } - pr_debug("--------CPU State Data------------\n"); - pr_debug("Magic Number: %llx\n", be64_to_cpu(reg_header->magic_number)); - pr_debug("NumCpuOffset: %x\n", be32_to_cpu(reg_header->num_cpu_offset)); - - vaddr += be32_to_cpu(reg_header->num_cpu_offset); - num_cpus = be32_to_cpu(*((__be32 *)(vaddr))); - pr_debug("NumCpus : %u\n", num_cpus); - vaddr += sizeof(u32); - reg_entry = (struct fadump_reg_entry *)vaddr; - /* Allocate buffer to hold cpu crash notes. 
*/ fw_dump.cpu_notes_buf_size = num_cpus * sizeof(note_buf_t); fw_dump.cpu_notes_buf_size = PAGE_ALIGN(fw_dump.cpu_notes_buf_size); - note_buf = fadump_cpu_notes_buf_alloc(fw_dump.cpu_notes_buf_size); - if (!note_buf) { - printk(KERN_ERR "Failed to allocate 0x%lx bytes for " - "cpu notes buffer\n", fw_dump.cpu_notes_buf_size); + fw_dump.cpu_notes_buf_vaddr = + (unsigned long)fadump_alloc_buffer(fw_dump.cpu_notes_buf_size); + if (!fw_dump.cpu_notes_buf_vaddr) { + pr_err("Failed to allocate %ld bytes for CPU notes buffer\n", + fw_dump.cpu_notes_buf_size); return -ENOMEM; } - fw_dump.cpu_notes_buf = __pa(note_buf); - - pr_debug("Allocated buffer for cpu notes of size %ld at %p\n", - (num_cpus * sizeof(note_buf_t)), note_buf); - - if (fw_dump.fadumphdr_addr) - fdh = __va(fw_dump.fadumphdr_addr); - - for (i = 0; i < num_cpus; i++) { - if (be64_to_cpu(reg_entry->reg_id) != REG_ID("CPUSTRT")) { - printk(KERN_ERR "Unable to read CPU state data\n"); - rc = -ENOENT; - goto error_out; - } - /* Lower 4 bytes of reg_value contains logical cpu id */ - cpu = be64_to_cpu(reg_entry->reg_value) & FADUMP_CPU_ID_MASK; - if (fdh && !cpumask_test_cpu(cpu, &fdh->online_mask)) { - SKIP_TO_NEXT_CPU(reg_entry); - continue; - } - pr_debug("Reading register data for cpu %d...\n", cpu); - if (fdh && fdh->crashing_cpu == cpu) { - regs = fdh->regs; - note_buf = fadump_regs_to_elf_notes(note_buf, &regs); - SKIP_TO_NEXT_CPU(reg_entry); - } else { - reg_entry++; - reg_entry = fadump_read_registers(reg_entry, &regs); - note_buf = fadump_regs_to_elf_notes(note_buf, &regs); - } - } - final_note(note_buf); - if (fdh) { - pr_debug("Updating elfcore header (%llx) with cpu notes\n", - fdh->elfcorehdr_addr); - fadump_update_elfcore_header((char *)__va(fdh->elfcorehdr_addr)); - } + pr_debug("Allocated buffer for cpu notes of size %ld at 0x%lx\n", + fw_dump.cpu_notes_buf_size, + fw_dump.cpu_notes_buf_vaddr); return 0; - -error_out: - fadump_cpu_notes_buf_free((unsigned long)__va(fw_dump.cpu_notes_buf), -
fw_dump.cpu_notes_buf_size); - fw_dump.cpu_notes_buf = 0; - fw_dump.cpu_notes_buf_size = 0; - return rc; - } -/* - * Validate and process the dump data stored by firmware before exporting - * it through '/proc/vmcore'. - */ -static int __init process_fadump(const struct fadump_mem_struct *fdm_active) +void fadump_free_cpu_notes_buf(void) { - struct fadump_crash_info_header *fdh; - int rc = 0; - - if (!fdm_active || !fw_dump.fadumphdr_addr) - return -EINVAL; - - /* Check if the dump data is valid. */ - if ((be16_to_cpu(fdm_active->header.dump_status_flag) == FADUMP_ERROR_FLAG) || - (fdm_active->cpu_state_data.error_flags != 0) || - (fdm_active->rmr_region.error_flags != 0)) { - printk(KERN_ERR "Dump taken by platform is not valid\n"); - return -EINVAL; - } - if ((fdm_active->rmr_region.bytes_dumped != - fdm_active->rmr_region.source_len) || - !fdm_active->cpu_state_data.bytes_dumped) { - printk(KERN_ERR "Dump taken by platform is incomplete\n"); - return -EINVAL; - } - - /* Validate the fadump crash info header */ - fdh = __va(fw_dump.fadumphdr_addr); - if (fdh->magic_number != FADUMP_CRASH_INFO_MAGIC) { - printk(KERN_ERR "Crash info header is not valid.\n"); - return -EINVAL; - } - - rc = fadump_build_cpu_notes(fdm_active); - if (rc) - return rc; - - /* - * We are done validating dump info and elfcore header is now ready - * to be exported. set elfcorehdr_addr so that vmcore module will - * export the elfcore header through '/proc/vmcore'. 
- */ - elfcorehdr_addr = fdh->elfcorehdr_addr; + if (!fw_dump.cpu_notes_buf_vaddr) + return; - return 0; + fadump_free_buffer(fw_dump.cpu_notes_buf_vaddr, + fw_dump.cpu_notes_buf_size); + fw_dump.cpu_notes_buf_vaddr = 0; + fw_dump.cpu_notes_buf_size = 0; } -static void free_crash_memory_ranges(void) +static void fadump_free_mem_ranges(struct fadump_mrange_info *mrange_info) { - kfree(crash_memory_ranges); - crash_memory_ranges = NULL; - crash_memory_ranges_size = 0; - max_crash_mem_ranges = 0; + kfree(mrange_info->mem_ranges); + mrange_info->mem_ranges = NULL; + mrange_info->mem_ranges_sz = 0; + mrange_info->max_mem_ranges = 0; } /* - * Allocate or reallocate crash memory ranges array in incremental units + * Allocate or reallocate mem_ranges array in incremental units * of PAGE_SIZE. */ -static int allocate_crash_memory_ranges(void) +static int fadump_alloc_mem_ranges(struct fadump_mrange_info *mrange_info) { - struct fad_crash_memory_ranges *new_array; + struct fadump_memory_range *new_array; u64 new_size; - new_size = crash_memory_ranges_size + PAGE_SIZE; - pr_debug("Allocating %llu bytes of memory for crash memory ranges\n", - new_size); + new_size = mrange_info->mem_ranges_sz + PAGE_SIZE; + pr_debug("Allocating %llu bytes of memory for %s memory ranges\n", + new_size, mrange_info->name); - new_array = krealloc(crash_memory_ranges, new_size, GFP_KERNEL); + new_array = krealloc(mrange_info->mem_ranges, new_size, GFP_KERNEL); if (new_array == NULL) { - pr_err("Insufficient memory for setting up crash memory ranges\n"); - free_crash_memory_ranges(); + pr_err("Insufficient memory for setting up %s memory ranges\n", + mrange_info->name); + fadump_free_mem_ranges(mrange_info); return -ENOMEM; } - crash_memory_ranges = new_array; - crash_memory_ranges_size = new_size; - max_crash_mem_ranges = (new_size / - sizeof(struct fad_crash_memory_ranges)); + mrange_info->mem_ranges = new_array; + mrange_info->mem_ranges_sz = new_size; + mrange_info->max_mem_ranges = (new_size / 
+ sizeof(struct fadump_memory_range)); return 0; } -static inline int fadump_add_crash_memory(unsigned long long base, - unsigned long long end) +static inline int fadump_add_mem_range(struct fadump_mrange_info *mrange_info, + u64 base, u64 end) { - u64 start, size; + struct fadump_memory_range *mem_ranges = mrange_info->mem_ranges; bool is_adjacent = false; + u64 start, size; if (base == end) return 0; @@ -1022,38 +774,41 @@ static inline int fadump_add_crash_memory(unsigned long long base, * Fold adjacent memory ranges to bring down the memory ranges/ * PT_LOAD segments count. */ - if (crash_mem_ranges) { - start = crash_memory_ranges[crash_mem_ranges - 1].base; - size = crash_memory_ranges[crash_mem_ranges - 1].size; + if (mrange_info->mem_range_cnt) { + start = mem_ranges[mrange_info->mem_range_cnt - 1].base; + size = mem_ranges[mrange_info->mem_range_cnt - 1].size; if ((start + size) == base) is_adjacent = true; } if (!is_adjacent) { /* resize the array on reaching the limit */ - if (crash_mem_ranges == max_crash_mem_ranges) { + if (mrange_info->mem_range_cnt == mrange_info->max_mem_ranges) { int ret; - ret = allocate_crash_memory_ranges(); + ret = fadump_alloc_mem_ranges(mrange_info); if (ret) return ret; + + /* Update to the new resized array */ + mem_ranges = mrange_info->mem_ranges; } start = base; - crash_memory_ranges[crash_mem_ranges].base = start; - crash_mem_ranges++; + mem_ranges[mrange_info->mem_range_cnt].base = start; + mrange_info->mem_range_cnt++; } - crash_memory_ranges[crash_mem_ranges - 1].size = (end - start); - pr_debug("crash_memory_range[%d] [%#016llx-%#016llx], %#llx bytes\n", - (crash_mem_ranges - 1), start, end - 1, (end - start)); + mem_ranges[mrange_info->mem_range_cnt - 1].size = (end - start); + pr_debug("%s_memory_range[%d] [%#016llx-%#016llx], %#llx bytes\n", + mrange_info->name, (mrange_info->mem_range_cnt - 1), + start, end - 1, (end - start)); return 0; } -static int fadump_exclude_reserved_area(unsigned long long start, - 
unsigned long long end) +static int fadump_exclude_reserved_area(u64 start, u64 end) { - unsigned long long ra_start, ra_end; + u64 ra_start, ra_end; int ret = 0; ra_start = fw_dump.reserve_dump_area_start; @@ -1061,18 +816,22 @@ static int fadump_exclude_reserved_area(unsigned long long start, if ((ra_start < end) && (ra_end > start)) { if ((start < ra_start) && (end > ra_end)) { - ret = fadump_add_crash_memory(start, ra_start); + ret = fadump_add_mem_range(&crash_mrange_info, + start, ra_start); if (ret) return ret; - ret = fadump_add_crash_memory(ra_end, end); + ret = fadump_add_mem_range(&crash_mrange_info, + ra_end, end); } else if (start < ra_start) { - ret = fadump_add_crash_memory(start, ra_start); + ret = fadump_add_mem_range(&crash_mrange_info, + start, ra_start); } else if (ra_end < end) { - ret = fadump_add_crash_memory(ra_end, end); + ret = fadump_add_mem_range(&crash_mrange_info, + ra_end, end); } } else - ret = fadump_add_crash_memory(start, end); + ret = fadump_add_mem_range(&crash_mrange_info, start, end); return ret; } @@ -1117,36 +876,36 @@ static int fadump_init_elfcore_header(char *bufp) static int fadump_setup_crash_memory_ranges(void) { struct memblock_region *reg; - unsigned long long start, end; - int ret; + u64 start, end; + int i, ret; pr_debug("Setup crash memory ranges.\n"); - crash_mem_ranges = 0; + crash_mrange_info.mem_range_cnt = 0; /* - * add the first memory chunk (RMA_START through boot_memory_size) as - * a separate memory chunk. The reason is, at the time crash firmware - * will move the content of this memory chunk to different location - * specified during fadump registration. We need to create a separate - * program header for this chunk with the correct offset. + * Boot memory region(s) registered with firmware are moved to + * different location at the time of crash. Create separate program + * header(s) for this memory chunk(s) with the correct offset. 
*/ - ret = fadump_add_crash_memory(RMA_START, fw_dump.boot_memory_size); - if (ret) - return ret; + for (i = 0; i < fw_dump.boot_mem_regs_cnt; i++) { + start = fw_dump.boot_mem_addr[i]; + end = start + fw_dump.boot_mem_sz[i]; + ret = fadump_add_mem_range(&crash_mrange_info, start, end); + if (ret) + return ret; + } for_each_memblock(memory, reg) { - start = (unsigned long long)reg->base; - end = start + (unsigned long long)reg->size; + start = (u64)reg->base; + end = start + (u64)reg->size; /* - * skip the first memory chunk that is already added (RMA_START - * through boot_memory_size). This logic needs a relook if and - * when RMA_START changes to a non-zero value. + * skip the memory chunk that is already added + * (0 through boot_memory_top). */ - BUILD_BUG_ON(RMA_START != 0); - if (start < fw_dump.boot_memory_size) { - if (end > fw_dump.boot_memory_size) - start = fw_dump.boot_memory_size; + if (start < fw_dump.boot_mem_top) { + if (end > fw_dump.boot_mem_top) + start = fw_dump.boot_mem_top; else continue; } @@ -1167,17 +926,35 @@ static int fadump_setup_crash_memory_ranges(void) */ static inline unsigned long fadump_relocate(unsigned long paddr) { - if (paddr > RMA_START && paddr < fw_dump.boot_memory_size) - return be64_to_cpu(fdm.rmr_region.destination_address) + paddr; - else - return paddr; + unsigned long raddr, rstart, rend, rlast, hole_size; + int i; + + hole_size = 0; + rlast = 0; + raddr = paddr; + for (i = 0; i < fw_dump.boot_mem_regs_cnt; i++) { + rstart = fw_dump.boot_mem_addr[i]; + rend = rstart + fw_dump.boot_mem_sz[i]; + hole_size += (rstart - rlast); + + if (paddr >= rstart && paddr < rend) { + raddr += fw_dump.boot_mem_dest_addr - hole_size; + break; + } + + rlast = rend; + } + + pr_debug("vmcoreinfo: paddr = 0x%lx, raddr = 0x%lx\n", paddr, raddr); + return raddr; } static int fadump_create_elfcore_headers(char *bufp) { - struct elfhdr *elf; + unsigned long long raddr, offset; struct elf_phdr *phdr; - int i; + struct elfhdr *elf; + int i, j; 
fadump_init_elfcore_header(bufp); elf = (struct elfhdr *)bufp; @@ -1220,12 +997,14 @@ static int fadump_create_elfcore_headers(char *bufp) (elf->e_phnum)++; /* setup PT_LOAD sections. */ - - for (i = 0; i < crash_mem_ranges; i++) { - unsigned long long mbase, msize; - mbase = crash_memory_ranges[i].base; - msize = crash_memory_ranges[i].size; - + j = 0; + offset = 0; + raddr = fw_dump.boot_mem_addr[0]; + for (i = 0; i < crash_mrange_info.mem_range_cnt; i++) { + u64 mbase, msize; + + mbase = crash_mrange_info.mem_ranges[i].base; + msize = crash_mrange_info.mem_ranges[i].size; if (!msize) continue; @@ -1235,13 +1014,17 @@ static int fadump_create_elfcore_headers(char *bufp) phdr->p_flags = PF_R|PF_W|PF_X; phdr->p_offset = mbase; - if (mbase == RMA_START) { + if (mbase == raddr) { /* - * The entire RMA region will be moved by firmware - * to the specified destination_address. Hence set - * the correct offset. + * The entire real memory region will be moved by + * firmware to the specified destination_address. + * Hence set the correct offset. */ - phdr->p_offset = be64_to_cpu(fdm.rmr_region.destination_address); + phdr->p_offset = fw_dump.boot_mem_dest_addr + offset; + if (j < (fw_dump.boot_mem_regs_cnt - 1)) { + offset += fw_dump.boot_mem_sz[j]; + raddr = fw_dump.boot_mem_addr[++j]; + } } phdr->p_paddr = mbase; @@ -1263,7 +1046,6 @@ static unsigned long init_fadump_header(unsigned long addr) if (!addr) return 0; - fw_dump.fadumphdr_addr = addr; fdh = __va(addr); addr += sizeof(struct fadump_crash_info_header); @@ -1271,7 +1053,7 @@ static unsigned long init_fadump_header(unsigned long addr) fdh->magic_number = FADUMP_CRASH_INFO_MAGIC; fdh->elfcorehdr_addr = addr; /* We will set the crashing cpu id in crash_fadump() during crash. 
*/ - fdh->crashing_cpu = CPU_UNKNOWN; + fdh->crashing_cpu = FADUMP_CPU_UNKNOWN; return addr; } @@ -1293,7 +1075,8 @@ static int register_fadump(void) if (ret) return ret; - addr = be64_to_cpu(fdm.rmr_region.destination_address) + be64_to_cpu(fdm.rmr_region.source_len); + addr = fw_dump.fadumphdr_addr; + /* Initialize fadump crash info header. */ addr = init_fadump_header(addr); vaddr = __va(addr); @@ -1302,74 +1085,27 @@ static int register_fadump(void) fadump_create_elfcore_headers(vaddr); /* register the future kernel dump with firmware. */ - return register_fw_dump(&fdm); -} - -static int fadump_unregister_dump(struct fadump_mem_struct *fdm) -{ - int rc = 0; - unsigned int wait_time; - - pr_debug("Un-register firmware-assisted dump\n"); - - /* TODO: Add upper time limit for the delay */ - do { - rc = rtas_call(fw_dump.ibm_configure_kernel_dump, 3, 1, NULL, - FADUMP_UNREGISTER, fdm, - sizeof(struct fadump_mem_struct)); - - wait_time = rtas_busy_delay_time(rc); - if (wait_time) - mdelay(wait_time); - } while (wait_time); - - if (rc) { - printk(KERN_ERR "Failed to un-register firmware-assisted dump." - " unexpected error(%d).\n", rc); - return rc; - } - fw_dump.dump_registered = 0; - return 0; -} - -static int fadump_invalidate_dump(const struct fadump_mem_struct *fdm) -{ - int rc = 0; - unsigned int wait_time; - - pr_debug("Invalidating firmware-assisted dump registration\n"); - - /* TODO: Add upper time limit for the delay */ - do { - rc = rtas_call(fw_dump.ibm_configure_kernel_dump, 3, 1, NULL, - FADUMP_INVALIDATE, fdm, - sizeof(struct fadump_mem_struct)); - - wait_time = rtas_busy_delay_time(rc); - if (wait_time) - mdelay(wait_time); - } while (wait_time); - - if (rc) { - pr_err("Failed to invalidate firmware-assisted dump registration. 
Unexpected error (%d).\n", rc); - return rc; - } - fw_dump.dump_active = 0; - fdm_active = NULL; - return 0; + pr_debug("Registering for firmware-assisted kernel dump...\n"); + return fw_dump.ops->fadump_register(&fw_dump); } void fadump_cleanup(void) { + if (!fw_dump.fadump_supported) + return; + /* Invalidate the registration only if dump is active. */ if (fw_dump.dump_active) { - /* pass the same memory dump structure provided by platform */ - fadump_invalidate_dump(fdm_active); + pr_debug("Invalidating firmware-assisted dump registration\n"); + fw_dump.ops->fadump_invalidate(&fw_dump); } else if (fw_dump.dump_registered) { /* Un-register Firmware-assisted dump if it was registered. */ - fadump_unregister_dump(&fdm); - free_crash_memory_ranges(); + fw_dump.ops->fadump_unregister(&fw_dump); + fadump_free_mem_ranges(&crash_mrange_info); } + + if (fw_dump.ops->fadump_cleanup) + fw_dump.ops->fadump_cleanup(&fw_dump); } static void fadump_free_reserved_memory(unsigned long start_pfn, @@ -1394,90 +1130,197 @@ static void fadump_free_reserved_memory(unsigned long start_pfn, /* * Skip memory holes and free memory that was actually reserved. 
*/ -static void fadump_release_reserved_area(unsigned long start, unsigned long end) +static void fadump_release_reserved_area(u64 start, u64 end) { + u64 tstart, tend, spfn, epfn; struct memblock_region *reg; - unsigned long tstart, tend; - unsigned long start_pfn = PHYS_PFN(start); - unsigned long end_pfn = PHYS_PFN(end); + spfn = PHYS_PFN(start); + epfn = PHYS_PFN(end); for_each_memblock(memory, reg) { - tstart = max(start_pfn, memblock_region_memory_base_pfn(reg)); - tend = min(end_pfn, memblock_region_memory_end_pfn(reg)); + tstart = max_t(u64, spfn, memblock_region_memory_base_pfn(reg)); + tend = min_t(u64, epfn, memblock_region_memory_end_pfn(reg)); if (tstart < tend) { fadump_free_reserved_memory(tstart, tend); - if (tend == end_pfn) + if (tend == epfn) break; - start_pfn = tend + 1; + spfn = tend; + } + } +} + +/* + * Sort the mem ranges in-place and merge adjacent ranges + * to minimize the memory ranges count. + */ +static void sort_and_merge_mem_ranges(struct fadump_mrange_info *mrange_info) +{ + struct fadump_memory_range *mem_ranges; + struct fadump_memory_range tmp_range; + u64 base, size; + int i, j, idx; + + if (!reserved_mrange_info.mem_range_cnt) + return; + + /* Sort the memory ranges */ + mem_ranges = mrange_info->mem_ranges; + for (i = 0; i < mrange_info->mem_range_cnt; i++) { + idx = i; + for (j = (i + 1); j < mrange_info->mem_range_cnt; j++) { + if (mem_ranges[idx].base > mem_ranges[j].base) + idx = j; + } + if (idx != i) { + tmp_range = mem_ranges[idx]; + mem_ranges[idx] = mem_ranges[i]; + mem_ranges[i] = tmp_range; + } + } + + /* Merge adjacent reserved ranges */ + idx = 0; + for (i = 1; i < mrange_info->mem_range_cnt; i++) { + base = mem_ranges[i-1].base; + size = mem_ranges[i-1].size; + if (mem_ranges[i].base == (base + size)) + mem_ranges[idx].size += mem_ranges[i].size; + else { + idx++; + if (i == idx) + continue; + + mem_ranges[idx] = mem_ranges[i]; } } + mrange_info->mem_range_cnt = idx + 1; } /* - * Release the memory that was 
reserved in early boot to preserve the memory - * contents. The released memory will be available for general use. + * Scan reserved-ranges to consider them while reserving/releasing + * memory for FADump. */ -static void fadump_release_memory(unsigned long begin, unsigned long end) +static inline int fadump_scan_reserved_mem_ranges(void) { - unsigned long ra_start, ra_end; + struct device_node *root; + const __be32 *prop; + int len, ret = -1; + unsigned long i; + + root = of_find_node_by_path("/"); + if (!root) + return ret; + + prop = of_get_property(root, "reserved-ranges", &len); + if (!prop) + return ret; + + /* + * Each reserved range is an (address,size) pair, 2 cells each, + * totalling 4 cells per range. + */ + for (i = 0; i < len / (sizeof(*prop) * 4); i++) { + u64 base, size; + + base = of_read_number(prop + (i * 4) + 0, 2); + size = of_read_number(prop + (i * 4) + 2, 2); + + if (size) { + ret = fadump_add_mem_range(&reserved_mrange_info, + base, base + size); + if (ret < 0) { + pr_warn("some reserved ranges are ignored!\n"); + break; + } + } + } + + return ret; +} + +/* + * Release the memory that was reserved during early boot to preserve the + * crash'ed kernel's memory contents except reserved dump area (permanent + * reservation) and reserved ranges used by F/W. The released memory will + * be available for general use. + */ +static void fadump_release_memory(u64 begin, u64 end) +{ + u64 ra_start, ra_end, tstart; + int i, ret; + + fadump_scan_reserved_mem_ranges(); ra_start = fw_dump.reserve_dump_area_start; ra_end = ra_start + fw_dump.reserve_dump_area_size; /* - * exclude the dump reserve area. Will reuse it for next - * fadump registration. + * Add reserved dump area to reserved ranges list + * and exclude all these ranges while releasing memory. 
*/ - if (begin < ra_end && end > ra_start) { - if (begin < ra_start) - fadump_release_reserved_area(begin, ra_start); - if (end > ra_end) - fadump_release_reserved_area(ra_end, end); - } else - fadump_release_reserved_area(begin, end); + ret = fadump_add_mem_range(&reserved_mrange_info, ra_start, ra_end); + if (ret != 0) { + /* + * Not enough memory to setup reserved ranges but the system is + * running shortage of memory. So, release all the memory except + * Reserved dump area (reused for next fadump registration). + */ + if (begin < ra_end && end > ra_start) { + if (begin < ra_start) + fadump_release_reserved_area(begin, ra_start); + if (end > ra_end) + fadump_release_reserved_area(ra_end, end); + } else + fadump_release_reserved_area(begin, end); + + return; + } + + /* Get the reserved ranges list in order first. */ + sort_and_merge_mem_ranges(&reserved_mrange_info); + + /* Exclude reserved ranges and release remaining memory */ + tstart = begin; + for (i = 0; i < reserved_mrange_info.mem_range_cnt; i++) { + ra_start = reserved_mrange_info.mem_ranges[i].base; + ra_end = ra_start + reserved_mrange_info.mem_ranges[i].size; + + if (tstart >= ra_end) + continue; + + if (tstart < ra_start) + fadump_release_reserved_area(tstart, ra_start); + tstart = ra_end; + } + + if (tstart < end) + fadump_release_reserved_area(tstart, end); } static void fadump_invalidate_release_mem(void) { - unsigned long reserved_area_start, reserved_area_end; - unsigned long destination_address; - mutex_lock(&fadump_mutex); if (!fw_dump.dump_active) { mutex_unlock(&fadump_mutex); return; } - destination_address = be64_to_cpu(fdm_active->cpu_state_data.destination_address); fadump_cleanup(); mutex_unlock(&fadump_mutex); + fadump_release_memory(fw_dump.boot_mem_top, memblock_end_of_DRAM()); + fadump_free_cpu_notes_buf(); + /* - * Save the current reserved memory bounds we will require them - * later for releasing the memory for general use. 
- */ - reserved_area_start = fw_dump.reserve_dump_area_start; - reserved_area_end = reserved_area_start + - fw_dump.reserve_dump_area_size; - /* - * Setup reserve_dump_area_start and its size so that we can - * reuse this reserved memory for Re-registration. + * Setup kernel metadata and initialize the kernel dump + * memory structure for FADump re-registration. */ - fw_dump.reserve_dump_area_start = destination_address; - fw_dump.reserve_dump_area_size = get_fadump_area_size(); - - fadump_release_memory(reserved_area_start, reserved_area_end); - if (fw_dump.cpu_notes_buf) { - fadump_cpu_notes_buf_free( - (unsigned long)__va(fw_dump.cpu_notes_buf), - fw_dump.cpu_notes_buf_size); - fw_dump.cpu_notes_buf = 0; - fw_dump.cpu_notes_buf_size = 0; - } - /* Initialize the kernel dump memory structure for FAD registration. */ - init_fadump_mem_struct(&fdm, fw_dump.reserve_dump_area_start); + if (fw_dump.ops->fadump_setup_metadata && + (fw_dump.ops->fadump_setup_metadata(&fw_dump) < 0)) + pr_warn("Failed to setup kernel metadata!\n"); + fw_dump.ops->fadump_init_mem_struct(&fw_dump); } static ssize_t fadump_release_memory_store(struct kobject *kobj, @@ -1528,7 +1371,7 @@ static ssize_t fadump_register_store(struct kobject *kobj, int ret = 0; int input = -1; - if (!fw_dump.fadump_enabled || fdm_active) + if (!fw_dump.fadump_enabled || fw_dump.dump_active) return -EPERM; if (kstrtoint(buf, 0, &input)) @@ -1541,13 +1384,15 @@ static ssize_t fadump_register_store(struct kobject *kobj, if (fw_dump.dump_registered == 0) { goto unlock_out; } + /* Un-register Firmware-assisted dump */ - fadump_unregister_dump(&fdm); + pr_debug("Un-register firmware-assisted dump\n"); + fw_dump.ops->fadump_unregister(&fw_dump); break; case 1: if (fw_dump.dump_registered == 1) { /* Un-register Firmware-assisted dump */ - fadump_unregister_dump(&fdm); + fw_dump.ops->fadump_unregister(&fw_dump); } /* Register Firmware-assisted dump */ ret = register_fadump(); @@ -1564,62 +1409,12 @@ unlock_out: static 
int fadump_region_show(struct seq_file *m, void *private) { - const struct fadump_mem_struct *fdm_ptr; - if (!fw_dump.fadump_enabled) return 0; mutex_lock(&fadump_mutex); - if (fdm_active) - fdm_ptr = fdm_active; - else { - mutex_unlock(&fadump_mutex); - fdm_ptr = &fdm; - } - - seq_printf(m, - "CPU : [%#016llx-%#016llx] %#llx bytes, " - "Dumped: %#llx\n", - be64_to_cpu(fdm_ptr->cpu_state_data.destination_address), - be64_to_cpu(fdm_ptr->cpu_state_data.destination_address) + - be64_to_cpu(fdm_ptr->cpu_state_data.source_len) - 1, - be64_to_cpu(fdm_ptr->cpu_state_data.source_len), - be64_to_cpu(fdm_ptr->cpu_state_data.bytes_dumped)); - seq_printf(m, - "HPTE: [%#016llx-%#016llx] %#llx bytes, " - "Dumped: %#llx\n", - be64_to_cpu(fdm_ptr->hpte_region.destination_address), - be64_to_cpu(fdm_ptr->hpte_region.destination_address) + - be64_to_cpu(fdm_ptr->hpte_region.source_len) - 1, - be64_to_cpu(fdm_ptr->hpte_region.source_len), - be64_to_cpu(fdm_ptr->hpte_region.bytes_dumped)); - seq_printf(m, - "DUMP: [%#016llx-%#016llx] %#llx bytes, " - "Dumped: %#llx\n", - be64_to_cpu(fdm_ptr->rmr_region.destination_address), - be64_to_cpu(fdm_ptr->rmr_region.destination_address) + - be64_to_cpu(fdm_ptr->rmr_region.source_len) - 1, - be64_to_cpu(fdm_ptr->rmr_region.source_len), - be64_to_cpu(fdm_ptr->rmr_region.bytes_dumped)); - - if (!fdm_active || - (fw_dump.reserve_dump_area_start == - be64_to_cpu(fdm_ptr->cpu_state_data.destination_address))) - goto out; - - /* Dump is active. Show reserved memory region. 
*/ - seq_printf(m, - " : [%#016llx-%#016llx] %#llx bytes, " - "Dumped: %#llx\n", - (unsigned long long)fw_dump.reserve_dump_area_start, - be64_to_cpu(fdm_ptr->cpu_state_data.destination_address) - 1, - be64_to_cpu(fdm_ptr->cpu_state_data.destination_address) - - fw_dump.reserve_dump_area_start, - be64_to_cpu(fdm_ptr->cpu_state_data.destination_address) - - fw_dump.reserve_dump_area_start); -out: - if (fdm_active) - mutex_unlock(&fadump_mutex); + fw_dump.ops->fadump_region_show(&fw_dump, m); + mutex_unlock(&fadump_mutex); return 0; } @@ -1671,16 +1466,15 @@ static void fadump_init_files(void) */ int __init setup_fadump(void) { - if (!fw_dump.fadump_enabled) - return 0; - - if (!fw_dump.fadump_supported) { - printk(KERN_ERR "Firmware-assisted dump is not supported on" - " this hardware\n"); + if (!fw_dump.fadump_supported) return 0; - } + fadump_init_files(); fadump_show_config(); + + if (!fw_dump.fadump_enabled) + return 1; + /* * If dump data is available then see if it is valid and prepare for * saving it to the disk. @@ -1690,14 +1484,75 @@ int __init setup_fadump(void) * if dump process fails then invalidate the registration * and release memory before proceeding for re-registration. */ - if (process_fadump(fdm_active) < 0) + if (fw_dump.ops->fadump_process(&fw_dump) < 0) fadump_invalidate_release_mem(); } /* Initialize the kernel dump memory structure for FAD registration. */ else if (fw_dump.reserve_dump_area_size) - init_fadump_mem_struct(&fdm, fw_dump.reserve_dump_area_start); - fadump_init_files(); + fw_dump.ops->fadump_init_mem_struct(&fw_dump); return 1; } subsys_initcall(setup_fadump); +#else /* !CONFIG_PRESERVE_FA_DUMP */ + +/* Scan the Firmware Assisted dump configuration details. 
*/ +int __init early_init_dt_scan_fw_dump(unsigned long node, const char *uname, + int depth, void *data) +{ + if ((depth != 1) || (strcmp(uname, "ibm,opal") != 0)) + return 0; + + opal_fadump_dt_scan(&fw_dump, node); + return 1; +} + +/* + * When dump is active but PRESERVE_FA_DUMP is enabled on the kernel, + * preserve crash data. The subsequent memory preserving kernel boot + * is likely to process this crash data. + */ +int __init fadump_reserve_mem(void) +{ + if (fw_dump.dump_active) { + /* + * If last boot has crashed then reserve all the memory + * above boot memory to preserve crash data. + */ + pr_info("Preserving crash data for processing in next boot.\n"); + fadump_reserve_crash_area(fw_dump.boot_mem_top); + } else + pr_debug("FADump-aware kernel..\n"); + + return 1; +} +#endif /* CONFIG_PRESERVE_FA_DUMP */ + +/* Preserve everything above the base address */ +static void __init fadump_reserve_crash_area(u64 base) +{ + struct memblock_region *reg; + u64 mstart, msize; + + for_each_memblock(memory, reg) { + mstart = reg->base; + msize = reg->size; + + if ((mstart + msize) < base) + continue; + + if (mstart < base) { + msize -= (base - mstart); + mstart = base; + } + + pr_info("Reserving %lluMB of memory at %#016llx for preserving crash data", + (msize >> 20), mstart); + memblock_reserve(mstart, msize); + } +} + +unsigned long __init arch_reserved_kernel_pages(void) +{ + return memblock_reserved_size() / PAGE_SIZE; +} diff --git a/arch/powerpc/kernel/fpu.S b/arch/powerpc/kernel/fpu.S index 0bb991ddd264..3235a8da6af7 100644 --- a/arch/powerpc/kernel/fpu.S +++ b/arch/powerpc/kernel/fpu.S @@ -94,6 +94,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX) /* enable use of FP after return */ #ifdef CONFIG_PPC32 mfspr r5,SPRN_SPRG_THREAD /* current task's THREAD (phys) */ +#ifdef CONFIG_VMAP_STACK + tovirt(r5, r5) +#endif lwz r4,THREAD_FPEXC_MODE(r5) ori r9,r9,MSR_FP /* enable FP for current */ or r9,r9,r4 diff --git a/arch/powerpc/kernel/fsl_booke_entry_mapping.S 
b/arch/powerpc/kernel/fsl_booke_entry_mapping.S index ea065282b303..8bccce6544b5 100644 --- a/arch/powerpc/kernel/fsl_booke_entry_mapping.S +++ b/arch/powerpc/kernel/fsl_booke_entry_mapping.S @@ -153,35 +153,24 @@ skpinv: addi r6,r6,1 /* Increment */ tlbivax 0,r9 TLBSYNC -/* - * The mapping only needs to be cache-coherent on SMP, except on - * Freescale e500mc derivatives where it's also needed for coherent DMA. - */ -#if defined(CONFIG_SMP) || defined(CONFIG_PPC_E500MC) -#define M_IF_NEEDED MAS2_M -#else -#define M_IF_NEEDED 0 -#endif - #if defined(ENTRY_MAPPING_BOOT_SETUP) -/* 6. Setup KERNELBASE mapping in TLB1[0] */ +/* 6. Setup kernstart_virt_addr mapping in TLB1[0] */ lis r6,0x1000 /* Set MAS0(TLBSEL) = TLB1(1), ESEL = 0 */ mtspr SPRN_MAS0,r6 lis r6,(MAS1_VALID|MAS1_IPROT)@h ori r6,r6,(MAS1_TSIZE(BOOK3E_PAGESZ_64M))@l mtspr SPRN_MAS1,r6 - lis r6,MAS2_VAL(PAGE_OFFSET, BOOK3E_PAGESZ_64M, M_IF_NEEDED)@h - ori r6,r6,MAS2_VAL(PAGE_OFFSET, BOOK3E_PAGESZ_64M, M_IF_NEEDED)@l + lis r6,MAS2_EPN_MASK(BOOK3E_PAGESZ_64M)@h + ori r6,r6,MAS2_EPN_MASK(BOOK3E_PAGESZ_64M)@l + and r6,r6,r20 + ori r6,r6,MAS2_M_IF_NEEDED@l mtspr SPRN_MAS2,r6 mtspr SPRN_MAS3,r8 tlbwe -/* 7. Jump to KERNELBASE mapping */ - lis r6,(KERNELBASE & ~0xfff)@h - ori r6,r6,(KERNELBASE & ~0xfff)@l - rlwinm r7,r25,0,0x03ffffff - add r6,r7,r6 +/* 7. 
Jump to kernstart_virt_addr mapping */ + mr r6,r20 #elif defined(ENTRY_MAPPING_KEXEC_SETUP) /* diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index f255e22184b4..0493fcac6409 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -34,7 +34,16 @@ #include "head_32.h" -/* 601 only have IBAT; cr0.eq is set on 601 when using this macro */ +/* 601 only have IBAT */ +#ifdef CONFIG_PPC_BOOK3S_601 +#define LOAD_BAT(n, reg, RA, RB) \ + li RA,0; \ + mtspr SPRN_IBAT##n##U,RA; \ + lwz RA,(n*16)+0(reg); \ + lwz RB,(n*16)+4(reg); \ + mtspr SPRN_IBAT##n##U,RA; \ + mtspr SPRN_IBAT##n##L,RB +#else #define LOAD_BAT(n, reg, RA, RB) \ /* see the comment for clear_bats() -- Cort */ \ li RA,0; \ @@ -44,12 +53,11 @@ lwz RB,(n*16)+4(reg); \ mtspr SPRN_IBAT##n##U,RA; \ mtspr SPRN_IBAT##n##L,RB; \ - beq 1f; \ lwz RA,(n*16)+8(reg); \ lwz RB,(n*16)+12(reg); \ mtspr SPRN_DBAT##n##U,RA; \ - mtspr SPRN_DBAT##n##L,RB; \ -1: + mtspr SPRN_DBAT##n##L,RB +#endif __HEAD .stabs "arch/powerpc/kernel/",N_SO,0,0,0f @@ -264,16 +272,21 @@ __secondary_hold_acknowledge: */ . = 0x200 DO_KVM 0x200 - mtspr SPRN_SPRG_SCRATCH0,r10 - mtspr SPRN_SPRG_SCRATCH1,r11 - mfcr r10 +MachineCheck: + EXCEPTION_PROLOG_0 +#ifdef CONFIG_VMAP_STACK + li r11, MSR_KERNEL & ~(MSR_IR | MSR_RI) /* can take DTLB miss */ + mtmsr r11 + isync +#endif #ifdef CONFIG_PPC_CHRP mfspr r11, SPRN_SPRG_THREAD + tovirt_vmstack r11, r11 lwz r11, RTAS_SP(r11) cmpwi cr1, r11, 0 bne cr1, 7f #endif /* CONFIG_PPC_CHRP */ - EXCEPTION_PROLOG_1 + EXCEPTION_PROLOG_1 for_rtas=1 7: EXCEPTION_PROLOG_2 addi r3,r1,STACK_FRAME_OVERHEAD #ifdef CONFIG_PPC_CHRP @@ -288,24 +301,21 @@ __secondary_hold_acknowledge: . = 0x300 DO_KVM 0x300 DataAccess: - EXCEPTION_PROLOG - mfspr r10,SPRN_DSISR - stw r10,_DSISR(r11) + EXCEPTION_PROLOG handle_dar_dsisr=1 + get_and_save_dar_dsisr_on_stack r4, r5, r11 +BEGIN_MMU_FTR_SECTION #ifdef CONFIG_PPC_KUAP - andis. 
r0,r10,(DSISR_BAD_FAULT_32S | DSISR_DABRMATCH | DSISR_PROTFAULT)@h + andis. r0, r5, (DSISR_BAD_FAULT_32S | DSISR_DABRMATCH | DSISR_PROTFAULT)@h #else - andis. r0,r10,(DSISR_BAD_FAULT_32S|DSISR_DABRMATCH)@h + andis. r0, r5, (DSISR_BAD_FAULT_32S | DSISR_DABRMATCH)@h #endif - bne 1f /* if not, try to put a PTE */ - mfspr r4,SPRN_DAR /* into the hash table */ - rlwinm r3,r10,32-15,21,21 /* DSISR_STORE -> _PAGE_RW */ -BEGIN_MMU_FTR_SECTION + bne handle_page_fault_tramp_2 /* if not, try to put a PTE */ + rlwinm r3, r5, 32 - 15, 21, 21 /* DSISR_STORE -> _PAGE_RW */ bl hash_page -END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE) -1: lwz r5,_DSISR(r11) /* get DSISR value */ - mfspr r4,SPRN_DAR - EXC_XFER_LITE(0x300, handle_page_fault) - + b handle_page_fault_tramp_1 +FTR_SECTION_ELSE + b handle_page_fault_tramp_2 +ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_HPTE_TABLE) /* Instruction access exception. */ . = 0x400 @@ -321,6 +331,7 @@ BEGIN_MMU_FTR_SECTION END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE) 1: mr r4,r12 andis. r5,r9,DSISR_SRR1_MATCH_32S@h /* Filter relevant SRR1 bits */ + stw r4, _DAR(r11) EXC_XFER_LITE(0x400, handle_page_fault) /* External interrupt */ @@ -330,11 +341,8 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE) . = 0x600 DO_KVM 0x600 Alignment: - EXCEPTION_PROLOG - mfspr r4,SPRN_DAR - stw r4,_DAR(r11) - mfspr r5,SPRN_DSISR - stw r5,_DSISR(r11) + EXCEPTION_PROLOG handle_dar_dsisr=1 + save_dar_dsisr_on_stack r4, r5, r11 addi r3,r1,STACK_FRAME_OVERHEAD EXC_XFER_STD(0x600, alignment_exception) @@ -557,9 +565,9 @@ DataStoreTLBMiss: cmplw 0,r1,r3 mfspr r2, SPRN_SPRG_PGDIR #ifdef CONFIG_SWAP - li r1, _PAGE_RW | _PAGE_PRESENT | _PAGE_ACCESSED + li r1, _PAGE_RW | _PAGE_DIRTY | _PAGE_PRESENT | _PAGE_ACCESSED #else - li r1, _PAGE_RW | _PAGE_PRESENT + li r1, _PAGE_RW | _PAGE_DIRTY | _PAGE_PRESENT #endif bge- 112f lis r2, (swapper_pg_dir - PAGE_OFFSET)@ha /* if kernel address, use */ @@ -637,6 +645,16 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_NEED_DTLB_SW_LRU) . 
= 0x3000 +handle_page_fault_tramp_1: + lwz r4, _DAR(r11) + lwz r5, _DSISR(r11) + /* fall through */ +handle_page_fault_tramp_2: + EXC_XFER_LITE(0x300, handle_page_fault) + +stack_overflow: + vmap_stack_overflow_exception + AltiVecUnavailable: EXCEPTION_PROLOG #ifdef CONFIG_ALTIVEC @@ -820,9 +838,6 @@ load_up_mmu: /* Load the BAT registers with the values set up by MMU_init. MMU_init takes care of whether we're on a 601 or not. */ - mfpvr r3 - srwi r3,r3,16 - cmpwi r3,1 lis r3,BATS@ha addi r3,r3,BATS@l tophys(r3,r3) @@ -897,9 +912,11 @@ start_here: bl machine_init bl __save_cpu_setup bl MMU_init +#ifdef CONFIG_KASAN BEGIN_MMU_FTR_SECTION bl MMU_init_hw_patch END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE) +#endif /* * Go back to running unmapped so we can load up new values @@ -910,6 +927,8 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE) ori r4,r4,2f@l tophys(r4,r4) li r3,MSR_KERNEL & ~(MSR_IR|MSR_DR) + + .align 4 mtspr SPRN_SRR0,r4 mtspr SPRN_SRR1,r3 SYNC @@ -996,11 +1015,8 @@ EXPORT_SYMBOL(switch_mmu_context) */ clear_bats: li r10,0 - mfspr r9,SPRN_PVR - rlwinm r9,r9,16,16,31 /* r9 = 1 for 601, 4 for 604 */ - cmpwi r9, 1 - beq 1f +#ifndef CONFIG_PPC_BOOK3S_601 mtspr SPRN_DBAT0U,r10 mtspr SPRN_DBAT0L,r10 mtspr SPRN_DBAT1U,r10 @@ -1009,7 +1025,7 @@ clear_bats: mtspr SPRN_DBAT2L,r10 mtspr SPRN_DBAT3U,r10 mtspr SPRN_DBAT3L,r10 -1: +#endif mtspr SPRN_IBAT0U,r10 mtspr SPRN_IBAT0L,r10 mtspr SPRN_IBAT1U,r10 @@ -1054,6 +1070,8 @@ _ENTRY(update_bats) rlwinm r0, r6, 0, ~MSR_RI rlwinm r0, r0, 0, ~MSR_EE mtmsr r0 + + .align 4 mtspr SPRN_SRR0, r4 mtspr SPRN_SRR1, r3 SYNC @@ -1093,6 +1111,8 @@ mmu_off: andi. r0,r3,MSR_DR|MSR_IR /* MMU enabled? 
*/ beqlr andc r3,r3,r0 + + .align 4 mtspr SPRN_SRR0,r4 mtspr SPRN_SRR1,r3 sync @@ -1104,10 +1124,7 @@ mmu_off: */ initial_bats: lis r11,PAGE_OFFSET@h - mfspr r9,SPRN_PVR - rlwinm r9,r9,16,16,31 /* r9 = 1 for 601, 4 for 604 */ - cmpwi 0,r9,1 - bne 4f +#ifdef CONFIG_PPC_BOOK3S_601 ori r11,r11,4 /* set up BAT registers for 601 */ li r8,0x7f /* valid, block length = 8MB */ mtspr SPRN_IBAT0U,r11 /* N.B. 601 has valid bit in */ @@ -1120,10 +1137,8 @@ initial_bats: addis r8,r8,0x800000@h mtspr SPRN_IBAT2U,r11 mtspr SPRN_IBAT2L,r8 - isync - blr - -4: tophys(r8,r11) +#else + tophys(r8,r11) #ifdef CONFIG_SMP ori r8,r8,0x12 /* R/W access, M=1 */ #else @@ -1135,10 +1150,10 @@ initial_bats: mtspr SPRN_DBAT0U,r11 /* bit in upper BAT register */ mtspr SPRN_IBAT0L,r8 mtspr SPRN_IBAT0U,r11 +#endif isync blr - #ifdef CONFIG_BOOTX_TEXT setup_disp_bat: /* @@ -1153,15 +1168,13 @@ setup_disp_bat: beqlr lwz r11,0(r8) lwz r8,4(r8) - mfspr r9,SPRN_PVR - rlwinm r9,r9,16,16,31 /* r9 = 1 for 601, 4 for 604 */ - cmpwi 0,r9,1 - beq 1f +#ifndef CONFIG_PPC_BOOK3S_601 mtspr SPRN_DBAT3L,r8 mtspr SPRN_DBAT3U,r11 - blr -1: mtspr SPRN_IBAT3L,r8 +#else + mtspr SPRN_IBAT3L,r8 mtspr SPRN_IBAT3U,r11 +#endif blr #endif /* CONFIG_BOOTX_TEXT */ diff --git a/arch/powerpc/kernel/head_32.h b/arch/powerpc/kernel/head_32.h index 4a692553651f..a6a5fbbf8504 100644 --- a/arch/powerpc/kernel/head_32.h +++ b/arch/powerpc/kernel/head_32.h @@ -5,46 +5,65 @@ #include <asm/ptrace.h> /* for STACK_FRAME_REGS_MARKER */ /* - * MSR_KERNEL is > 0x8000 on 4xx/Book-E since it include MSR_CE. - */ -.macro __LOAD_MSR_KERNEL r, x -.if \x >= 0x8000 - lis \r, (\x)@h - ori \r, \r, (\x)@l -.else - li \r, (\x) -.endif -.endm -#define LOAD_MSR_KERNEL(r, x) __LOAD_MSR_KERNEL r, x - -/* * Exception entry code. This code runs with address translation * turned off, i.e. using physical addresses. * We assume sprg3 has the physical address of the current * task's thread_struct. 
*/ +.macro EXCEPTION_PROLOG handle_dar_dsisr=0 + EXCEPTION_PROLOG_0 handle_dar_dsisr=\handle_dar_dsisr + EXCEPTION_PROLOG_1 + EXCEPTION_PROLOG_2 handle_dar_dsisr=\handle_dar_dsisr +.endm -.macro EXCEPTION_PROLOG +.macro EXCEPTION_PROLOG_0 handle_dar_dsisr=0 mtspr SPRN_SPRG_SCRATCH0,r10 mtspr SPRN_SPRG_SCRATCH1,r11 +#ifdef CONFIG_VMAP_STACK + mfspr r10, SPRN_SPRG_THREAD + .if \handle_dar_dsisr + mfspr r11, SPRN_DAR + stw r11, DAR(r10) + mfspr r11, SPRN_DSISR + stw r11, DSISR(r10) + .endif + mfspr r11, SPRN_SRR0 + stw r11, SRR0(r10) +#endif + mfspr r11, SPRN_SRR1 /* check whether user or kernel */ +#ifdef CONFIG_VMAP_STACK + stw r11, SRR1(r10) +#endif mfcr r10 - EXCEPTION_PROLOG_1 - EXCEPTION_PROLOG_2 + andi. r11, r11, MSR_PR .endm -.macro EXCEPTION_PROLOG_1 - mfspr r11,SPRN_SRR1 /* check whether user or kernel */ - andi. r11,r11,MSR_PR +.macro EXCEPTION_PROLOG_1 for_rtas=0 +#ifdef CONFIG_VMAP_STACK + .ifeq \for_rtas + li r11, MSR_KERNEL & ~(MSR_IR | MSR_RI) /* can take DTLB miss */ + mtmsr r11 + isync + .endif + subi r11, r1, INT_FRAME_SIZE /* use r1 if kernel */ +#else tophys(r11,r1) /* use tophys(r1) if kernel */ + subi r11, r11, INT_FRAME_SIZE /* alloc exc. frame */ +#endif beq 1f mfspr r11,SPRN_SPRG_THREAD + tovirt_vmstack r11, r11 lwz r11,TASK_STACK-THREAD(r11) - addi r11,r11,THREAD_SIZE - tophys(r11,r11) -1: subi r11,r11,INT_FRAME_SIZE /* alloc exc. 
frame */ + addi r11, r11, THREAD_SIZE - INT_FRAME_SIZE + tophys_novmstack r11, r11 +1: +#ifdef CONFIG_VMAP_STACK + mtcrf 0x7f, r11 + bt 32 - THREAD_ALIGN_SHIFT, stack_overflow +#endif .endm -.macro EXCEPTION_PROLOG_2 +.macro EXCEPTION_PROLOG_2 handle_dar_dsisr=0 stw r10,_CCR(r11) /* save registers */ stw r12,GPR12(r11) stw r9,GPR9(r11) @@ -54,16 +73,33 @@ stw r12,GPR11(r11) mflr r10 stw r10,_LINK(r11) +#ifdef CONFIG_VMAP_STACK + mfspr r12, SPRN_SPRG_THREAD + tovirt(r12, r12) + .if \handle_dar_dsisr + lwz r10, DAR(r12) + stw r10, _DAR(r11) + lwz r10, DSISR(r12) + stw r10, _DSISR(r11) + .endif + lwz r9, SRR1(r12) + lwz r12, SRR0(r12) +#else mfspr r12,SPRN_SRR0 mfspr r9,SPRN_SRR1 +#endif stw r1,GPR1(r11) stw r1,0(r11) - tovirt(r1,r11) /* set new kernel sp */ + tovirt_novmstack r1, r11 /* set new kernel sp */ #ifdef CONFIG_40x rlwinm r9,r9,0,14,12 /* clear MSR_WE (necessary?) */ #else +#ifdef CONFIG_VMAP_STACK + li r10, MSR_KERNEL & ~MSR_IR /* can take exceptions */ +#else li r10,MSR_KERNEL & ~(MSR_IR|MSR_DR) /* can take exceptions */ - MTMSRD(r10) /* (except for mach check in rtas) */ +#endif + mtmsr r10 /* (except for mach check in rtas) */ #endif stw r0,GPR0(r11) lis r10,STACK_FRAME_REGS_MARKER@ha /* exception frame marker */ @@ -75,25 +111,46 @@ .macro SYSCALL_ENTRY trapno mfspr r12,SPRN_SPRG_THREAD +#ifdef CONFIG_VMAP_STACK + mfspr r9, SPRN_SRR0 + mfspr r11, SPRN_SRR1 + stw r9, SRR0(r12) + stw r11, SRR1(r12) +#endif mfcr r10 lwz r11,TASK_STACK-THREAD(r12) - mflr r9 - addi r11,r11,THREAD_SIZE - INT_FRAME_SIZE rlwinm r10,r10,0,4,2 /* Clear SO bit in CR */ - tophys(r11,r11) + addi r11, r11, THREAD_SIZE - INT_FRAME_SIZE +#ifdef CONFIG_VMAP_STACK + li r9, MSR_KERNEL & ~(MSR_IR | MSR_RI) /* can take DTLB miss */ + mtmsr r9 + isync +#endif + tovirt_vmstack r12, r12 + tophys_novmstack r11, r11 + mflr r9 stw r10,_CCR(r11) /* save registers */ + stw r9, _LINK(r11) +#ifdef CONFIG_VMAP_STACK + lwz r10, SRR0(r12) + lwz r9, SRR1(r12) +#else mfspr r10,SPRN_SRR0 - stw 
r9,_LINK(r11) mfspr r9,SPRN_SRR1 +#endif stw r1,GPR1(r11) stw r1,0(r11) - tovirt(r1,r11) /* set new kernel sp */ + tovirt_novmstack r1, r11 /* set new kernel sp */ stw r10,_NIP(r11) #ifdef CONFIG_40x rlwinm r9,r9,0,14,12 /* clear MSR_WE (necessary?) */ #else - LOAD_MSR_KERNEL(r10, MSR_KERNEL & ~(MSR_IR|MSR_DR)) /* can take exceptions */ - MTMSRD(r10) /* (except for mach check in rtas) */ +#ifdef CONFIG_VMAP_STACK + LOAD_REG_IMMEDIATE(r10, MSR_KERNEL & ~MSR_IR) /* can take exceptions */ +#else + LOAD_REG_IMMEDIATE(r10, MSR_KERNEL & ~(MSR_IR|MSR_DR)) /* can take exceptions */ +#endif + mtmsr r10 /* (except for mach check in rtas) */ #endif lis r10,STACK_FRAME_REGS_MARKER@ha /* exception frame marker */ stw r2,GPR2(r11) @@ -131,7 +188,7 @@ #endif 3: - tovirt(r2, r2) /* set r2 to current */ + tovirt_novmstack r2, r2 /* set r2 to current */ lis r11, transfer_to_syscall@h ori r11, r11, transfer_to_syscall@l #ifdef CONFIG_TRACE_IRQFLAGS @@ -140,10 +197,10 @@ * otherwise we might risk taking an interrupt before we tell lockdep * they are enabled. 
*/ - LOAD_MSR_KERNEL(r10, MSR_KERNEL) + LOAD_REG_IMMEDIATE(r10, MSR_KERNEL) rlwimi r10, r9, 0, MSR_EE #else - LOAD_MSR_KERNEL(r10, MSR_KERNEL | MSR_EE) + LOAD_REG_IMMEDIATE(r10, MSR_KERNEL | MSR_EE) #endif #if defined(CONFIG_PPC_8xx) && defined(CONFIG_PERF_EVENTS) mtspr SPRN_NRI, r0 @@ -154,6 +211,54 @@ RFI /* jump to handler, enable MMU */ .endm +.macro save_dar_dsisr_on_stack reg1, reg2, sp +#ifndef CONFIG_VMAP_STACK + mfspr \reg1, SPRN_DAR + mfspr \reg2, SPRN_DSISR + stw \reg1, _DAR(\sp) + stw \reg2, _DSISR(\sp) +#endif +.endm + +.macro get_and_save_dar_dsisr_on_stack reg1, reg2, sp +#ifdef CONFIG_VMAP_STACK + lwz \reg1, _DAR(\sp) + lwz \reg2, _DSISR(\sp) +#else + save_dar_dsisr_on_stack \reg1, \reg2, \sp +#endif +.endm + +.macro tovirt_vmstack dst, src +#ifdef CONFIG_VMAP_STACK + tovirt(\dst, \src) +#else + .ifnc \dst, \src + mr \dst, \src + .endif +#endif +.endm + +.macro tovirt_novmstack dst, src +#ifndef CONFIG_VMAP_STACK + tovirt(\dst, \src) +#else + .ifnc \dst, \src + mr \dst, \src + .endif +#endif +.endm + +.macro tophys_novmstack dst, src +#ifndef CONFIG_VMAP_STACK + tophys(\dst, \src) +#else + .ifnc \dst, \src + mr \dst, \src + .endif +#endif +.endm + /* * Note: code which follows this uses cr0.eq (set if from kernel), * r11, r12 (SRR0), and r9 (SRR1). 
@@ -187,7 +292,7 @@ label: #define EXC_XFER_TEMPLATE(hdlr, trap, msr, tfer, ret) \ li r10,trap; \ stw r10,_TRAP(r11); \ - LOAD_MSR_KERNEL(r10, msr); \ + LOAD_REG_IMMEDIATE(r10, msr); \ bl tfer; \ .long hdlr; \ .long ret @@ -200,4 +305,28 @@ label: EXC_XFER_TEMPLATE(hdlr, n+1, MSR_KERNEL, transfer_to_handler, \ ret_from_except) +.macro vmap_stack_overflow_exception +#ifdef CONFIG_VMAP_STACK +#ifdef CONFIG_SMP + mfspr r11, SPRN_SPRG_THREAD + tovirt(r11, r11) + lwz r11, TASK_CPU - THREAD(r11) + slwi r11, r11, 3 + addis r11, r11, emergency_ctx@ha +#else + lis r11, emergency_ctx@ha +#endif + lwz r11, emergency_ctx@l(r11) + cmpwi cr1, r11, 0 + bne cr1, 1f + lis r11, init_thread_union@ha + addi r11, r11, init_thread_union@l +1: addi r11, r11, THREAD_SIZE - INT_FRAME_SIZE + EXCEPTION_PROLOG_2 + SAVE_NVGPRS(r11) + addi r3, r1, STACK_FRAME_OVERHEAD + EXC_XFER_STD(0, stack_overflow_exception) +#endif +.endm + #endif /* __HEAD_32_H__ */ diff --git a/arch/powerpc/kernel/head_40x.S b/arch/powerpc/kernel/head_40x.S index 585ea1976550..9bb663977e84 100644 --- a/arch/powerpc/kernel/head_40x.S +++ b/arch/powerpc/kernel/head_40x.S @@ -313,6 +313,7 @@ _ENTRY(saved_ksp_limit) START_EXCEPTION(0x0400, InstructionAccess) EXCEPTION_PROLOG mr r4,r12 /* Pass SRR0 as arg2 */ + stw r4, _DEAR(r11) li r5,0 /* Pass zero as arg3 */ EXC_XFER_LITE(0x400, handle_page_fault) @@ -676,6 +677,7 @@ DataAccess: mfspr r5,SPRN_ESR /* Grab the ESR, save it, pass arg3 */ stw r5,_ESR(r11) mfspr r4,SPRN_DEAR /* Grab the DEAR, save it, pass arg2 */ + stw r4, _DEAR(r11) EXC_XFER_LITE(0x300, handle_page_fault) /* Other PowerPC processors, namely those derived from the 6xx-series diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S index 91d297e696dd..ad79fddb974d 100644 --- a/arch/powerpc/kernel/head_64.S +++ b/arch/powerpc/kernel/head_64.S @@ -182,7 +182,8 @@ __secondary_hold: isync bctr #else - BUG_OPCODE +0: trap + EMIT_BUG_ENTRY 0b, __FILE__, __LINE__, 0 #endif 
CLOSE_FIXED_SECTION(first_256B) @@ -635,7 +636,7 @@ __after_prom_start: sub r5,r5,r11 #else /* just copy interrupts */ - LOAD_REG_IMMEDIATE(r5, FIXED_SYMBOL_ABS_ADDR(__end_interrupts)) + LOAD_REG_IMMEDIATE_SYM(r5, r11, FIXED_SYMBOL_ABS_ADDR(__end_interrupts)) #endif b 5f 3: @@ -998,7 +999,8 @@ start_here_common: bl start_kernel /* Not reached */ - BUG_OPCODE + trap + EMIT_BUG_ENTRY 0b, __FILE__, __LINE__, 0 /* * We put a few things here that have to be page-aligned. diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index 5ab9178c2347..9922306ae512 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -15,6 +15,7 @@ */ #include <linux/init.h> +#include <linux/magic.h> #include <asm/processor.h> #include <asm/page.h> #include <asm/mmu.h> @@ -126,56 +127,36 @@ instruction_counter: /* Machine check */ . = 0x200 MachineCheck: - EXCEPTION_PROLOG - mfspr r4,SPRN_DAR - stw r4,_DAR(r11) - li r5,RPN_PATTERN - mtspr SPRN_DAR,r5 /* Tag DAR, to be used in DTLB Error */ - mfspr r5,SPRN_DSISR - stw r5,_DSISR(r11) + EXCEPTION_PROLOG handle_dar_dsisr=1 + save_dar_dsisr_on_stack r4, r5, r11 + li r6, RPN_PATTERN + mtspr SPRN_DAR, r6 /* Tag DAR, to be used in DTLB Error */ addi r3,r1,STACK_FRAME_OVERHEAD EXC_XFER_STD(0x200, machine_check_exception) -/* Data access exception. - * This is "never generated" by the MPC8xx. - */ - . = 0x300 -DataAccess: - -/* Instruction access exception. - * This is "never generated" by the MPC8xx. - */ - . = 0x400 -InstructionAccess: - /* External interrupt */ EXCEPTION(0x500, HardwareInterrupt, do_IRQ, EXC_XFER_LITE) /* Alignment exception */ . 
= 0x600 Alignment: - EXCEPTION_PROLOG - mfspr r4,SPRN_DAR - stw r4,_DAR(r11) - li r5,RPN_PATTERN - mtspr SPRN_DAR,r5 /* Tag DAR, to be used in DTLB Error */ - mfspr r5,SPRN_DSISR - stw r5,_DSISR(r11) + EXCEPTION_PROLOG handle_dar_dsisr=1 + save_dar_dsisr_on_stack r4, r5, r11 + li r6, RPN_PATTERN + mtspr SPRN_DAR, r6 /* Tag DAR, to be used in DTLB Error */ addi r3,r1,STACK_FRAME_OVERHEAD - EXC_XFER_STD(0x600, alignment_exception) + b .Lalignment_exception_ool /* Program check exception */ EXCEPTION(0x700, ProgramCheck, program_check_exception, EXC_XFER_STD) -/* No FPU on MPC8xx. This exception is not supposed to happen. -*/ - EXCEPTION(0x800, FPUnavailable, unknown_exception, EXC_XFER_STD) - /* Decrementer */ EXCEPTION(0x900, Decrementer, timer_interrupt, EXC_XFER_LITE) - EXCEPTION(0xa00, Trap_0a, unknown_exception, EXC_XFER_STD) - EXCEPTION(0xb00, Trap_0b, unknown_exception, EXC_XFER_STD) + /* With VMAP_STACK there's not enough room for this at 0x600 */ + . = 0xa00 +.Lalignment_exception_ool: + EXC_XFER_STD(0x600, alignment_exception) /* System call */ . = 0xc00 @@ -184,25 +165,12 @@ SystemCall: /* Single step - not used on 601 */ EXCEPTION(0xd00, SingleStep, single_step_exception, EXC_XFER_STD) - EXCEPTION(0xe00, Trap_0e, unknown_exception, EXC_XFER_STD) - EXCEPTION(0xf00, Trap_0f, unknown_exception, EXC_XFER_STD) /* On the MPC8xx, this is a software emulation interrupt. It occurs * for all unimplemented and illegal instructions. */ EXCEPTION(0x1000, SoftEmu, program_check_exception, EXC_XFER_STD) -/* Called from DataStoreTLBMiss when perf TLB misses events are activated */ -#ifdef CONFIG_PERF_EVENTS - patch_site 0f, patch__dtlbmiss_perf -0: lwz r10, (dtlb_miss_counter - PAGE_OFFSET)@l(0) - addi r10, r10, 1 - stw r10, (dtlb_miss_counter - PAGE_OFFSET)@l(0) - mfspr r10, SPRN_SPRG_SCRATCH0 - mfspr r11, SPRN_SPRG_SCRATCH1 - rfi -#endif - . = 0x1100 /* * For the MPC8xx, this is a software tablewalk to load the instruction @@ -342,8 +310,8 @@ ITLBMissLinear: . 
= 0x1200 DataStoreTLBMiss: - mtspr SPRN_SPRG_SCRATCH0, r10 - mtspr SPRN_SPRG_SCRATCH1, r11 + mtspr SPRN_DAR, r10 + mtspr SPRN_M_TW, r11 mfcr r11 /* If we are faulting a kernel address, we have to use the @@ -408,10 +376,10 @@ DataStoreTLBMiss: mtspr SPRN_MD_RPN, r10 /* Update TLB entry */ /* Restore registers */ - mtspr SPRN_DAR, r11 /* Tag DAR */ -0: mfspr r10, SPRN_SPRG_SCRATCH0 - mfspr r11, SPRN_SPRG_SCRATCH1 +0: mfspr r10, SPRN_DAR + mtspr SPRN_DAR, r11 /* Tag DAR */ + mfspr r11, SPRN_M_TW rfi patch_site 0b, patch__dtlbmiss_exit_1 @@ -427,10 +395,10 @@ DTLBMissIMMR: mtspr SPRN_MD_RPN, r10 /* Update TLB entry */ li r11, RPN_PATTERN - mtspr SPRN_DAR, r11 /* Tag DAR */ -0: mfspr r10, SPRN_SPRG_SCRATCH0 - mfspr r11, SPRN_SPRG_SCRATCH1 +0: mfspr r10, SPRN_DAR + mtspr SPRN_DAR, r11 /* Tag DAR */ + mfspr r11, SPRN_M_TW rfi patch_site 0b, patch__dtlbmiss_exit_2 @@ -464,10 +432,10 @@ DTLBMissLinear: mtspr SPRN_MD_RPN, r10 /* Update TLB entry */ li r11, RPN_PATTERN - mtspr SPRN_DAR, r11 /* Tag DAR */ -0: mfspr r10, SPRN_SPRG_SCRATCH0 - mfspr r11, SPRN_SPRG_SCRATCH1 +0: mfspr r10, SPRN_DAR + mtspr SPRN_DAR, r11 /* Tag DAR */ + mfspr r11, SPRN_M_TW rfi patch_site 0b, patch__dtlbmiss_exit_3 @@ -485,6 +453,7 @@ InstructionTLBError: tlbie r4 /* 0x400 is InstructionAccess exception, needed by bad_page_fault() */ .Litlbie: + stw r4, _DAR(r11) EXC_XFER_LITE(0x400, handle_page_fault) /* This is the data TLB error on the MPC8xx. This could be due to @@ -493,58 +462,69 @@ InstructionTLBError: */ . = 0x1400 DataTLBError: - mtspr SPRN_SPRG_SCRATCH0, r10 - mtspr SPRN_SPRG_SCRATCH1, r11 - mfcr r10 - + EXCEPTION_PROLOG_0 handle_dar_dsisr=1 mfspr r11, SPRN_DAR - cmpwi cr0, r11, RPN_PATTERN - beq- FixupDAR /* must be a buggy dcbX, icbi insn. */ + cmpwi cr1, r11, RPN_PATTERN + beq- cr1, FixupDAR /* must be a buggy dcbX, icbi insn. 
*/ DARFixed:/* Return from dcbx instruction bug workaround */ +#ifdef CONFIG_VMAP_STACK + li r11, RPN_PATTERN + mtspr SPRN_DAR, r11 /* Tag DAR, to be used in DTLB Error */ +#endif EXCEPTION_PROLOG_1 - EXCEPTION_PROLOG_2 - mfspr r5,SPRN_DSISR - stw r5,_DSISR(r11) - mfspr r4,SPRN_DAR + EXCEPTION_PROLOG_2 handle_dar_dsisr=1 + get_and_save_dar_dsisr_on_stack r4, r5, r11 andis. r10,r5,DSISR_NOHPTE@h beq+ .Ldtlbie tlbie r4 .Ldtlbie: +#ifndef CONFIG_VMAP_STACK li r10,RPN_PATTERN mtspr SPRN_DAR,r10 /* Tag DAR, to be used in DTLB Error */ +#endif /* 0x300 is DataAccess exception, needed by bad_page_fault() */ EXC_XFER_LITE(0x300, handle_page_fault) - EXCEPTION(0x1500, Trap_15, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x1600, Trap_16, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x1700, Trap_17, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x1800, Trap_18, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x1900, Trap_19, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x1a00, Trap_1a, unknown_exception, EXC_XFER_STD) - EXCEPTION(0x1b00, Trap_1b, unknown_exception, EXC_XFER_STD) +/* Called from DataStoreTLBMiss when perf TLB misses events are activated */ +#ifdef CONFIG_PERF_EVENTS + patch_site 0f, patch__dtlbmiss_perf +0: lwz r10, (dtlb_miss_counter - PAGE_OFFSET)@l(0) + addi r10, r10, 1 + stw r10, (dtlb_miss_counter - PAGE_OFFSET)@l(0) + mfspr r10, SPRN_DAR + mtspr SPRN_DAR, r11 /* Tag DAR */ + mfspr r11, SPRN_M_TW + rfi +#endif + +stack_overflow: + vmap_stack_overflow_exception /* On the MPC8xx, these next four traps are used for development * support of breakpoints and such. Someday I will get around to * using them. */ - . 
= 0x1c00 -DataBreakpoint: - mtspr SPRN_SPRG_SCRATCH0, r10 - mtspr SPRN_SPRG_SCRATCH1, r11 - mfcr r10 - mfspr r11, SPRN_SRR0 - cmplwi cr0, r11, (.Ldtlbie - PAGE_OFFSET)@l - cmplwi cr7, r11, (.Litlbie - PAGE_OFFSET)@l - beq- cr0, 11f - beq- cr7, 11f +do_databreakpoint: EXCEPTION_PROLOG_1 - EXCEPTION_PROLOG_2 + EXCEPTION_PROLOG_2 handle_dar_dsisr=1 addi r3,r1,STACK_FRAME_OVERHEAD mfspr r4,SPRN_BAR stw r4,_DAR(r11) +#ifdef CONFIG_VMAP_STACK + lwz r5,_DSISR(r11) +#else mfspr r5,SPRN_DSISR +#endif EXC_XFER_STD(0x1c00, do_break) -11: + + . = 0x1c00 +DataBreakpoint: + EXCEPTION_PROLOG_0 handle_dar_dsisr=1 + mfspr r11, SPRN_SRR0 + cmplwi cr1, r11, (.Ldtlbie - PAGE_OFFSET)@l + cmplwi cr7, r11, (.Litlbie - PAGE_OFFSET)@l + cror 4*cr1+eq, 4*cr1+eq, 4*cr7+eq + bne cr1, do_databreakpoint mtcr r10 mfspr r10, SPRN_SPRG_SCRATCH0 mfspr r11, SPRN_SPRG_SCRATCH1 @@ -574,17 +554,15 @@ InstructionBreakpoint: * by decoding the registers used by the dcbx instruction and adding them. * DAR is set to the calculated address. */ - /* define if you don't want to use self modifying code */ -#define NO_SELF_MODIFYING_CODE FixupDAR:/* Entry point for dcbx workaround. */ mtspr SPRN_M_TW, r10 /* fetch instruction from memory. */ mfspr r10, SPRN_SRR0 mtspr SPRN_MD_EPN, r10 rlwinm r11, r10, 16, 0xfff8 - cmpli cr0, r11, PAGE_OFFSET@h + cmpli cr1, r11, PAGE_OFFSET@h mfspr r11, SPRN_M_TWB /* Get level 1 table */ - blt+ 3f + blt+ cr1, 3f rlwinm r11, r10, 16, 0xfff8 0: cmpli cr7, r11, (PAGE_OFFSET + 0x1800000)@h @@ -599,7 +577,7 @@ FixupDAR:/* Entry point for dcbx workaround. */ 3: lwz r11, (swapper_pg_dir-PAGE_OFFSET)@l(r11) /* Get the level 1 entry */ mtspr SPRN_MD_TWC, r11 - mtcr r11 + mtcrf 0x01, r11 mfspr r11, SPRN_MD_TWC lwz r11, 0(r11) /* Get the pte */ bt 28,200f /* bit 28 = Large page (8M) */ @@ -612,16 +590,16 @@ FixupDAR:/* Entry point for dcbx workaround. 
*/ * no need to include them here */ xoris r10, r11, 0x7c00 /* check if major OP code is 31 */ rlwinm r10, r10, 0, 21, 5 - cmpwi cr0, r10, 2028 /* Is dcbz? */ - beq+ 142f - cmpwi cr0, r10, 940 /* Is dcbi? */ - beq+ 142f - cmpwi cr0, r10, 108 /* Is dcbst? */ - beq+ 144f /* Fix up store bit! */ - cmpwi cr0, r10, 172 /* Is dcbf? */ - beq+ 142f - cmpwi cr0, r10, 1964 /* Is icbi? */ - beq+ 142f + cmpwi cr1, r10, 2028 /* Is dcbz? */ + beq+ cr1, 142f + cmpwi cr1, r10, 940 /* Is dcbi? */ + beq+ cr1, 142f + cmpwi cr1, r10, 108 /* Is dcbst? */ + beq+ cr1, 144f /* Fix up store bit! */ + cmpwi cr1, r10, 172 /* Is dcbf? */ + beq+ cr1, 142f + cmpwi cr1, r10, 1964 /* Is icbi? */ + beq+ cr1, 142f 141: mfspr r10,SPRN_M_TW b DARFixed /* Nope, go back to normal TLB processing */ @@ -639,27 +617,6 @@ FixupDAR:/* Entry point for dcbx workaround. */ rlwinm r10, r10,0,7,5 /* Clear store bit for buggy dcbst insn */ mtspr SPRN_DSISR, r10 142: /* continue, it was a dcbx, dcbi instruction. */ -#ifndef NO_SELF_MODIFYING_CODE - andis. r10,r11,0x1f /* test if reg RA is r0 */ - li r10,modified_instr@l - dcbtst r0,r10 /* touch for store */ - rlwinm r11,r11,0,0,20 /* Zero lower 10 bits */ - oris r11,r11,640 /* Transform instr. to a "add r10,RA,RB" */ - ori r11,r11,532 - stw r11,0(r10) /* store add/and instruction */ - dcbf 0,r10 /* flush new instr. to memory. */ - icbi 0,r10 /* invalidate instr. cache line */ - mfspr r11, SPRN_SPRG_SCRATCH1 /* restore r11 */ - mfspr r10, SPRN_SPRG_SCRATCH0 /* restore r10 */ - isync /* Wait until new instr is loaded from memory */ -modified_instr: - .space 4 /* this is where the add instr. 
is stored */ - bne+ 143f - subf r10,r0,r10 /* r10=r10-r0, only if reg RA is r0 */ -143: mtdar r10 /* store faulting EA in DAR */ - mfspr r10,SPRN_M_TW - b DARFixed /* Go back to normal TLB handling */ -#else mfctr r10 mtdar r10 /* save ctr reg in DAR */ rlwinm r10, r11, 24, 24, 28 /* offset into jump table for reg RB */ @@ -701,8 +658,9 @@ modified_instr: add r10, r10, r30 ;b 151f add r10, r10, r31 151: - rlwinm. r11,r11,19,24,28 /* offset into jump table for reg RA */ - beq 152f /* if reg RA is zero, don't add it */ + rlwinm r11,r11,19,24,28 /* offset into jump table for reg RA */ + cmpwi cr1, r11, 0 + beq cr1, 152f /* if reg RA is zero, don't add it */ addi r11, r11, 150b@l /* add start of table */ mtctr r11 /* load ctr with jump address */ rlwinm r11,r11,0,16,10 /* make sure we don't execute this more than once */ @@ -710,7 +668,14 @@ modified_instr: 152: mfdar r11 mtctr r11 /* restore ctr reg from DAR */ +#ifdef CONFIG_VMAP_STACK + mfspr r11, SPRN_SPRG_THREAD + stw r10, DAR(r11) + mfspr r10, SPRN_DSISR + stw r10, DSISR(r11) +#else mtdar r10 /* save fault EA to DAR */ +#endif mfspr r10,SPRN_M_TW b DARFixed /* Go back to normal TLB handling */ @@ -723,7 +688,6 @@ modified_instr: add r10, r10, r11 /* add it */ mfctr r11 /* restore r11 */ b 151b -#endif /* * This is where the main kernel code starts. 
@@ -741,6 +705,9 @@ start_here: /* stack */ lis r1,init_thread_union@ha addi r1,r1,init_thread_union@l + lis r0, STACK_END_MAGIC@h + ori r0, r0, STACK_END_MAGIC@l + stw r0, 0(r1) li r0,0 stwu r0,THREAD_SIZE-STACK_FRAME_OVERHEAD(r1) diff --git a/arch/powerpc/kernel/head_booke.h b/arch/powerpc/kernel/head_booke.h index 2ae635df9026..37fc84ed90e3 100644 --- a/arch/powerpc/kernel/head_booke.h +++ b/arch/powerpc/kernel/head_booke.h @@ -467,6 +467,7 @@ label: mfspr r5,SPRN_ESR; /* Grab the ESR and save it */ \ stw r5,_ESR(r11); \ mfspr r4,SPRN_DEAR; /* Grab the DEAR */ \ + stw r4, _DEAR(r11); \ EXC_XFER_LITE(0x0300, handle_page_fault) #define INSTRUCTION_STORAGE_EXCEPTION \ @@ -475,6 +476,7 @@ label: mfspr r5,SPRN_ESR; /* Grab the ESR and save it */ \ stw r5,_ESR(r11); \ mr r4,r12; /* Pass SRR0 as arg2 */ \ + stw r4, _DEAR(r11); \ li r5,0; /* Pass zero as arg3 */ \ EXC_XFER_LITE(0x0400, handle_page_fault) diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S index adf0505dbe02..840af004041e 100644 --- a/arch/powerpc/kernel/head_fsl_booke.S +++ b/arch/powerpc/kernel/head_fsl_booke.S @@ -155,6 +155,8 @@ _ENTRY(_start); */ _ENTRY(__early_start) + LOAD_REG_ADDR_PIC(r20, kernstart_virt_addr) + lwz r20,0(r20) #define ENTRY_MAPPING_BOOT_SETUP #include "fsl_booke_entry_mapping.S" @@ -238,6 +240,9 @@ set_ivor: bl early_init +#ifdef CONFIG_KASAN + bl kasan_early_init +#endif #ifdef CONFIG_RELOCATABLE mr r3,r30 mr r4,r31 @@ -264,9 +269,6 @@ set_ivor: /* * Decide what sort of machine this is and initialize the MMU. 
*/ -#ifdef CONFIG_KASAN - bl kasan_early_init -#endif mr r3,r30 mr r4,r31 bl machine_init @@ -277,8 +279,8 @@ set_ivor: ori r6, r6, swapper_pg_dir@l lis r5, abatron_pteptrs@h ori r5, r5, abatron_pteptrs@l - lis r4, KERNELBASE@h - ori r4, r4, KERNELBASE@l + lis r3, kernstart_virt_addr@ha + lwz r4, kernstart_virt_addr@l(r3) stw r5, 0(r4) /* Save abatron_pteptrs at a fixed location */ stw r6, 0(r5) @@ -376,6 +378,7 @@ interrupt_base: mfspr r4,SPRN_DEAR /* Grab the DEAR, save it, pass arg2 */ andis. r10,r5,(ESR_ILK|ESR_DLK)@h bne 1f + stw r4, _DEAR(r11) EXC_XFER_LITE(0x0300, handle_page_fault) 1: addi r3,r1,STACK_FRAME_OVERHEAD @@ -1067,7 +1070,12 @@ __secondary_start: mr r5,r25 /* phys kernel start */ rlwinm r5,r5,0,~0x3ffffff /* aligned 64M */ subf r4,r5,r4 /* memstart_addr - phys kernel start */ - li r5,0 /* no device tree */ + lis r7,KERNELBASE@h + ori r7,r7,KERNELBASE@l + cmpw r20,r7 /* if kernstart_virt_addr != KERNELBASE, randomized */ + beq 2f + li r4,0 +2: li r5,0 /* no device tree */ li r6,0 /* not boot cpu */ bl restore_to_as0 @@ -1115,6 +1123,54 @@ __secondary_hold_acknowledge: #endif /* + * Create a 64M tlb by address and entry + * r3 - entry + * r4 - virtual address + * r5/r6 - physical address + */ +_GLOBAL(create_kaslr_tlb_entry) + lis r7,0x1000 /* Set MAS0(TLBSEL) = 1 */ + rlwimi r7,r3,16,4,15 /* Setup MAS0 = TLBSEL | ESEL(r6) */ + mtspr SPRN_MAS0,r7 /* Write MAS0 */ + + lis r3,(MAS1_VALID|MAS1_IPROT)@h + ori r3,r3,(MAS1_TSIZE(BOOK3E_PAGESZ_64M))@l + mtspr SPRN_MAS1,r3 /* Write MAS1 */ + + lis r3,MAS2_EPN_MASK(BOOK3E_PAGESZ_64M)@h + ori r3,r3,MAS2_EPN_MASK(BOOK3E_PAGESZ_64M)@l + and r3,r3,r4 + ori r3,r3,MAS2_M_IF_NEEDED@l + mtspr SPRN_MAS2,r3 /* Write MAS2(EPN) */ + +#ifdef CONFIG_PHYS_64BIT + ori r8,r6,(MAS3_SW|MAS3_SR|MAS3_SX) + mtspr SPRN_MAS3,r8 /* Write MAS3(RPN) */ + mtspr SPRN_MAS7,r5 +#else + ori r8,r5,(MAS3_SW|MAS3_SR|MAS3_SX) + mtspr SPRN_MAS3,r8 /* Write MAS3(RPN) */ +#endif + + tlbwe /* Write TLB */ + isync + sync + blr + +/* + * Return to 
the start of the relocated kernel and run again + * r3 - virtual address of fdt + * r4 - entry of the kernel + */ +_GLOBAL(reloc_kernel_entry) + mfmsr r7 + rlwinm r7, r7, 0, ~(MSR_IS | MSR_DS) + + mtspr SPRN_SRR0,r4 + mtspr SPRN_SRR1,r7 + rfi + +/* * Create a tlb entry with the same effective and physical address as * the tlb entry used by the current running code. But set the TS to 1. * Then switch to the address space 1. It will return with the r3 set to diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c index c8d1fa2e9d53..2462cd7c565c 100644 --- a/arch/powerpc/kernel/hw_breakpoint.c +++ b/arch/powerpc/kernel/hw_breakpoint.c @@ -127,15 +127,61 @@ int arch_bp_generic_fields(int type, int *gen_bp_type) } /* + * Watchpoint match range is always doubleword(8 bytes) aligned on + * powerpc. If the given range is crossing doubleword boundary, we + * need to increase the length such that next doubleword also get + * covered. Ex, + * + * address len = 6 bytes + * |=========. + * |------------v--|------v--------| + * | | | | | | | | | | | | | | | | | + * |---------------|---------------| + * <---8 bytes---> + * + * In this case, we should configure hw as: + * start_addr = address & ~HW_BREAKPOINT_ALIGN + * len = 16 bytes + * + * @start_addr and @end_addr are inclusive. 
+ */ +static int hw_breakpoint_validate_len(struct arch_hw_breakpoint *hw) +{ + u16 max_len = DABR_MAX_LEN; + u16 hw_len; + unsigned long start_addr, end_addr; + + start_addr = hw->address & ~HW_BREAKPOINT_ALIGN; + end_addr = (hw->address + hw->len - 1) | HW_BREAKPOINT_ALIGN; + hw_len = end_addr - start_addr + 1; + + if (dawr_enabled()) { + max_len = DAWR_MAX_LEN; + /* DAWR region can't cross 512 bytes boundary */ + if ((start_addr >> 9) != (end_addr >> 9)) + return -EINVAL; + } else if (IS_ENABLED(CONFIG_PPC_8xx)) { + /* 8xx can setup a range without limitation */ + max_len = U16_MAX; + } + + if (hw_len > max_len) + return -EINVAL; + + hw->hw_len = hw_len; + return 0; +} + +/* * Validate the arch-specific HW Breakpoint register settings */ int hw_breakpoint_arch_parse(struct perf_event *bp, const struct perf_event_attr *attr, struct arch_hw_breakpoint *hw) { - int ret = -EINVAL, length_max; + int ret = -EINVAL; - if (!bp) + if (!bp || !attr->bp_len) return ret; hw->type = HW_BRK_TYPE_TRANSLATE; @@ -155,26 +201,10 @@ int hw_breakpoint_arch_parse(struct perf_event *bp, hw->address = attr->bp_addr; hw->len = attr->bp_len; - /* - * Since breakpoint length can be a maximum of HW_BREAKPOINT_LEN(8) - * and breakpoint addresses are aligned to nearest double-word - * HW_BREAKPOINT_ALIGN by rounding off to the lower address, the - * 'symbolsize' should satisfy the check below. 
- */ if (!ppc_breakpoint_available()) return -ENODEV; - length_max = 8; /* DABR */ - if (dawr_enabled()) { - length_max = 512 ; /* 64 doublewords */ - /* DAWR region can't cross 512 boundary */ - if ((attr->bp_addr >> 9) != - ((attr->bp_addr + attr->bp_len - 1) >> 9)) - return -EINVAL; - } - if (hw->len > - (length_max - (hw->address & HW_BREAKPOINT_ALIGN))) - return -EINVAL; - return 0; + + return hw_breakpoint_validate_len(hw); } /* @@ -195,20 +225,80 @@ void thread_change_pc(struct task_struct *tsk, struct pt_regs *regs) tsk->thread.last_hit_ubp = NULL; } +static bool dar_within_range(unsigned long dar, struct arch_hw_breakpoint *info) +{ + return ((info->address <= dar) && (dar - info->address < info->len)); +} + +static bool +dar_range_overlaps(unsigned long dar, int size, struct arch_hw_breakpoint *info) +{ + return ((dar <= info->address + info->len - 1) && + (dar + size - 1 >= info->address)); +} + /* * Handle debug exception notifications. */ +static bool stepping_handler(struct pt_regs *regs, struct perf_event *bp, + struct arch_hw_breakpoint *info) +{ + unsigned int instr = 0; + int ret, type, size; + struct instruction_op op; + unsigned long addr = info->address; + + if (__get_user_inatomic(instr, (unsigned int *)regs->nip)) + goto fail; + + ret = analyse_instr(&op, regs, instr); + type = GETTYPE(op.type); + size = GETSIZE(op.type); + + if (!ret && (type == LARX || type == STCX)) { + printk_ratelimited("Breakpoint hit on instruction that can't be emulated." + " Breakpoint at 0x%lx will be disabled.\n", addr); + goto disable; + } + + /* + * If it's extraneous event, we still need to emulate/single- + * step the instruction, but we don't generate an event. 
+ */ + if (size && !dar_range_overlaps(regs->dar, size, info)) + info->type |= HW_BRK_TYPE_EXTRANEOUS_IRQ; + + /* Do not emulate user-space instructions, instead single-step them */ + if (user_mode(regs)) { + current->thread.last_hit_ubp = bp; + regs->msr |= MSR_SE; + return false; + } + + if (!emulate_step(regs, instr)) + goto fail; + + return true; + +fail: + /* + * We've failed in reliably handling the hw-breakpoint. Unregister + * it and throw a warning message to let the user know about it. + */ + WARN(1, "Unable to handle hardware breakpoint. Breakpoint at " + "0x%lx will be disabled.", addr); + +disable: + perf_event_disable_inatomic(bp); + return false; +} + int hw_breakpoint_handler(struct die_args *args) { int rc = NOTIFY_STOP; struct perf_event *bp; struct pt_regs *regs = args->regs; -#ifndef CONFIG_PPC_8xx - int stepped = 1; - unsigned int instr; -#endif struct arch_hw_breakpoint *info; - unsigned long dar = regs->dar; /* Disable breakpoints during exception handling */ hw_breakpoint_disable(); @@ -240,43 +330,14 @@ int hw_breakpoint_handler(struct die_args *args) goto out; } - /* - * Verify if dar lies within the address range occupied by the symbol - * being watched to filter extraneous exceptions. If it doesn't, - * we still need to single-step the instruction, but we don't - * generate an event. - */ info->type &= ~HW_BRK_TYPE_EXTRANEOUS_IRQ; - if (!((bp->attr.bp_addr <= dar) && - (dar - bp->attr.bp_addr < bp->attr.bp_len))) + if (!dar_within_range(regs->dar, info)) info->type |= HW_BRK_TYPE_EXTRANEOUS_IRQ; -#ifndef CONFIG_PPC_8xx - /* Do not emulate user-space instructions, instead single-step them */ - if (user_mode(regs)) { - current->thread.last_hit_ubp = bp; - regs->msr |= MSR_SE; + if (!IS_ENABLED(CONFIG_PPC_8xx) && !stepping_handler(regs, bp, info)) goto out; - } - - stepped = 0; - instr = 0; - if (!__get_user_inatomic(instr, (unsigned int *) regs->nip)) - stepped = emulate_step(regs, instr); /* - * emulate_step() could not execute it. 
We've failed in reliably - * handling the hw-breakpoint. Unregister it and throw a warning - * message to let the user know about it. - */ - if (!stepped) { - WARN(1, "Unable to handle hardware breakpoint. Breakpoint at " - "0x%lx will be disabled.", info->address); - perf_event_disable_inatomic(bp); - goto out; - } -#endif - /* * As a policy, the callback is invoked in a 'trigger-after-execute' * fashion */ diff --git a/arch/powerpc/kernel/idle.c b/arch/powerpc/kernel/idle.c index a36fd053c3db..422e31d2f5a2 100644 --- a/arch/powerpc/kernel/idle.c +++ b/arch/powerpc/kernel/idle.c @@ -77,6 +77,31 @@ void arch_cpu_idle(void) int powersave_nap; +#ifdef CONFIG_PPC_970_NAP +void power4_idle(void) +{ + if (!cpu_has_feature(CPU_FTR_CAN_NAP)) + return; + + if (!powersave_nap) + return; + + if (!prep_irq_for_idle()) + return; + + if (cpu_has_feature(CPU_FTR_ALTIVEC)) + asm volatile("DSSALL ; sync" ::: "memory"); + + power4_idle_nap(); + + /* + * power4_idle_nap returns with interrupts enabled (soft and hard), + * so we return to our caller with interrupts enabled too. Our caller + * can cope with either interrupts disabled or enabled upon return. + */ +} +#endif + #ifdef CONFIG_SYSCTL /* * Register the sysctl to set/clear powersave_nap.
diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S index d32751994a62..22f249b6f58d 100644 --- a/arch/powerpc/kernel/idle_book3s.S +++ b/arch/powerpc/kernel/idle_book3s.S @@ -15,7 +15,9 @@ #include <asm/asm-offsets.h> #include <asm/ppc-opcode.h> #include <asm/cpuidle.h> +#include <asm/thread_info.h> /* TLF_NAPPING */ +#ifdef CONFIG_PPC_P7_NAP /* * Desired PSSCR in r3 * @@ -181,4 +183,22 @@ _GLOBAL(isa206_idle_insn_mayloss) bne 2f IDLE_STATE_ENTER_SEQ_NORET(PPC_SLEEP) 2: IDLE_STATE_ENTER_SEQ_NORET(PPC_WINKLE) +#endif +#ifdef CONFIG_PPC_970_NAP +_GLOBAL(power4_idle_nap) + LOAD_REG_IMMEDIATE(r7, MSR_KERNEL|MSR_EE|MSR_POW) + ld r9,PACA_THREAD_INFO(r13) + ld r8,TI_LOCAL_FLAGS(r9) + ori r8,r8,_TLF_NAPPING + std r8,TI_LOCAL_FLAGS(r9) + /* + * NAPPING bit is set, from this point onward power4_fixup_nap + * will cause exceptions to return to power4_idle_nap_return. + */ +1: sync + isync + mtmsrd r7 + isync + b 1b +#endif diff --git a/arch/powerpc/kernel/idle_power4.S b/arch/powerpc/kernel/idle_power4.S deleted file mode 100644 index 33c625329078..000000000000 --- a/arch/powerpc/kernel/idle_power4.S +++ /dev/null @@ -1,83 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * This file contains the power_save function for 970-family CPUs. 
- */ - -#include <linux/threads.h> -#include <asm/processor.h> -#include <asm/page.h> -#include <asm/cputable.h> -#include <asm/thread_info.h> -#include <asm/ppc_asm.h> -#include <asm/asm-offsets.h> -#include <asm/irqflags.h> -#include <asm/hw_irq.h> -#include <asm/feature-fixups.h> - -#undef DEBUG - - .text - -_GLOBAL(power4_idle) -BEGIN_FTR_SECTION - blr -END_FTR_SECTION_IFCLR(CPU_FTR_CAN_NAP) - /* Now check if user or arch enabled NAP mode */ - LOAD_REG_ADDRBASE(r3,powersave_nap) - lwz r4,ADDROFF(powersave_nap)(r3) - cmpwi 0,r4,0 - beqlr - - /* This sequence is similar to prep_irq_for_idle() */ - - /* Hard disable interrupts */ - mfmsr r7 - rldicl r0,r7,48,1 - rotldi r0,r0,16 - mtmsrd r0,1 - - /* Check if something happened while soft-disabled */ - lbz r0,PACAIRQHAPPENED(r13) - cmpwi cr0,r0,0 - bne- 2f - - /* - * Soft-enable interrupts. This will make power4_fixup_nap return - * to our caller with interrupts enabled (soft and hard). The caller - * can cope with either interrupts disabled or enabled upon return. - */ -#ifdef CONFIG_TRACE_IRQFLAGS - /* Tell the tracer interrupts are on, because idle responds to them. 
*/ - mflr r0 - std r0,16(r1) - stdu r1,-128(r1) - bl trace_hardirqs_on - addi r1,r1,128 - ld r0,16(r1) - mtlr r0 - mfmsr r7 -#endif /* CONFIG_TRACE_IRQFLAGS */ - - li r0,IRQS_ENABLED - stb r0,PACAIRQSOFTMASK(r13) /* we'll hard-enable shortly */ -BEGIN_FTR_SECTION - DSSALL - sync -END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) - ld r9, PACA_THREAD_INFO(r13) - ld r8,TI_LOCAL_FLAGS(r9) /* set napping bit */ - ori r8,r8,_TLF_NAPPING /* so when we take an exception */ - std r8,TI_LOCAL_FLAGS(r9) /* it will return to our caller */ - ori r7,r7,MSR_EE - oris r7,r7,MSR_POW@h -1: sync - isync - mtmsrd r7 - isync - b 1b - -2: /* Return if an interrupt had happened while soft disabled */ - /* Set the HARD_DIS flag because interrupts are now hard disabled */ - ori r0,r0,PACA_IRQ_HARD_DIS - stb r0,PACAIRQHAPPENED(r13) - blr diff --git a/arch/powerpc/kernel/ima_arch.c b/arch/powerpc/kernel/ima_arch.c new file mode 100644 index 000000000000..e34116255ced --- /dev/null +++ b/arch/powerpc/kernel/ima_arch.c @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2019 IBM Corporation + * Author: Nayna Jain + */ + +#include <linux/ima.h> +#include <asm/secure_boot.h> + +bool arch_ima_get_secureboot(void) +{ + return is_ppc_secureboot_enabled(); +} + +/* + * The "secure_rules" are enabled only on "secureboot" enabled systems. + * These rules verify the file signatures against known good values. + * The "appraise_type=imasig|modsig" option allows the known good signature + * to be stored as an xattr or as an appended signature. + * + * To avoid duplicate signature verification as much as possible, the IMA + * policy rule for module appraisal is added only if CONFIG_MODULE_SIG_FORCE + * is not enabled. 
+ */ +static const char *const secure_rules[] = { + "appraise func=KEXEC_KERNEL_CHECK appraise_flag=check_blacklist appraise_type=imasig|modsig", +#ifndef CONFIG_MODULE_SIG_FORCE + "appraise func=MODULE_CHECK appraise_flag=check_blacklist appraise_type=imasig|modsig", +#endif + NULL +}; + +/* + * The "trusted_rules" are enabled only on "trustedboot" enabled systems. + * These rules add the kexec kernel image and kernel modules file hashes to + * the IMA measurement list. + */ +static const char *const trusted_rules[] = { + "measure func=KEXEC_KERNEL_CHECK", + "measure func=MODULE_CHECK", + NULL +}; + +/* + * The "secure_and_trusted_rules" contains rules for both the secure boot and + * trusted boot. The "template=ima-modsig" option includes the appended + * signature, when available, in the IMA measurement list. + */ +static const char *const secure_and_trusted_rules[] = { + "measure func=KEXEC_KERNEL_CHECK template=ima-modsig", + "measure func=MODULE_CHECK template=ima-modsig", + "appraise func=KEXEC_KERNEL_CHECK appraise_flag=check_blacklist appraise_type=imasig|modsig", +#ifndef CONFIG_MODULE_SIG_FORCE + "appraise func=MODULE_CHECK appraise_flag=check_blacklist appraise_type=imasig|modsig", +#endif + NULL +}; + +/* + * Returns the relevant IMA arch-specific policies based on the system secure + * boot state. 
+ */ +const char *const *arch_get_ima_policy(void) +{ + if (is_ppc_secureboot_enabled()) { + if (IS_ENABLED(CONFIG_MODULE_SIG)) + set_module_sig_enforced(); + + if (is_ppc_trustedboot_enabled()) + return secure_and_trusted_rules; + else + return secure_rules; + } else if (is_ppc_trustedboot_enabled()) { + return trusted_rules; + } + + return NULL; +} diff --git a/arch/powerpc/kernel/ima_kexec.c b/arch/powerpc/kernel/ima_kexec.c deleted file mode 100644 index 720e50e490b6..000000000000 --- a/arch/powerpc/kernel/ima_kexec.c +++ /dev/null @@ -1,219 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Copyright (C) 2016 IBM Corporation - * - * Authors: - * Thiago Jung Bauermann <bauerman@linux.vnet.ibm.com> - */ - -#include <linux/slab.h> -#include <linux/kexec.h> -#include <linux/of.h> -#include <linux/memblock.h> -#include <linux/libfdt.h> - -static int get_addr_size_cells(int *addr_cells, int *size_cells) -{ - struct device_node *root; - - root = of_find_node_by_path("/"); - if (!root) - return -EINVAL; - - *addr_cells = of_n_addr_cells(root); - *size_cells = of_n_size_cells(root); - - of_node_put(root); - - return 0; -} - -static int do_get_kexec_buffer(const void *prop, int len, unsigned long *addr, - size_t *size) -{ - int ret, addr_cells, size_cells; - - ret = get_addr_size_cells(&addr_cells, &size_cells); - if (ret) - return ret; - - if (len < 4 * (addr_cells + size_cells)) - return -ENOENT; - - *addr = of_read_number(prop, addr_cells); - *size = of_read_number(prop + 4 * addr_cells, size_cells); - - return 0; -} - -/** - * ima_get_kexec_buffer - get IMA buffer from the previous kernel - * @addr: On successful return, set to point to the buffer contents. - * @size: On successful return, set to the buffer size. - * - * Return: 0 on success, negative errno on error. 
- */ -int ima_get_kexec_buffer(void **addr, size_t *size) -{ - int ret, len; - unsigned long tmp_addr; - size_t tmp_size; - const void *prop; - - prop = of_get_property(of_chosen, "linux,ima-kexec-buffer", &len); - if (!prop) - return -ENOENT; - - ret = do_get_kexec_buffer(prop, len, &tmp_addr, &tmp_size); - if (ret) - return ret; - - *addr = __va(tmp_addr); - *size = tmp_size; - - return 0; -} - -/** - * ima_free_kexec_buffer - free memory used by the IMA buffer - */ -int ima_free_kexec_buffer(void) -{ - int ret; - unsigned long addr; - size_t size; - struct property *prop; - - prop = of_find_property(of_chosen, "linux,ima-kexec-buffer", NULL); - if (!prop) - return -ENOENT; - - ret = do_get_kexec_buffer(prop->value, prop->length, &addr, &size); - if (ret) - return ret; - - ret = of_remove_property(of_chosen, prop); - if (ret) - return ret; - - return memblock_free(addr, size); - -} - -/** - * remove_ima_buffer - remove the IMA buffer property and reservation from @fdt - * - * The IMA measurement buffer is of no use to a subsequent kernel, so we always - * remove it from the device tree. - */ -void remove_ima_buffer(void *fdt, int chosen_node) -{ - int ret, len; - unsigned long addr; - size_t size; - const void *prop; - - prop = fdt_getprop(fdt, chosen_node, "linux,ima-kexec-buffer", &len); - if (!prop) - return; - - ret = do_get_kexec_buffer(prop, len, &addr, &size); - fdt_delprop(fdt, chosen_node, "linux,ima-kexec-buffer"); - if (ret) - return; - - ret = delete_fdt_mem_rsv(fdt, addr, size); - if (!ret) - pr_debug("Removed old IMA buffer reservation.\n"); -} - -#ifdef CONFIG_IMA_KEXEC -/** - * arch_ima_add_kexec_buffer - do arch-specific steps to add the IMA buffer - * - * Architectures should use this function to pass on the IMA buffer - * information to the next kernel. - * - * Return: 0 on success, negative errno on error. 
- */ -int arch_ima_add_kexec_buffer(struct kimage *image, unsigned long load_addr, - size_t size) -{ - image->arch.ima_buffer_addr = load_addr; - image->arch.ima_buffer_size = size; - - return 0; -} - -static int write_number(void *p, u64 value, int cells) -{ - if (cells == 1) { - u32 tmp; - - if (value > U32_MAX) - return -EINVAL; - - tmp = cpu_to_be32(value); - memcpy(p, &tmp, sizeof(tmp)); - } else if (cells == 2) { - u64 tmp; - - tmp = cpu_to_be64(value); - memcpy(p, &tmp, sizeof(tmp)); - } else - return -EINVAL; - - return 0; -} - -/** - * setup_ima_buffer - add IMA buffer information to the fdt - * @image: kexec image being loaded. - * @fdt: Flattened device tree for the next kernel. - * @chosen_node: Offset to the chosen node. - * - * Return: 0 on success, or negative errno on error. - */ -int setup_ima_buffer(const struct kimage *image, void *fdt, int chosen_node) -{ - int ret, addr_cells, size_cells, entry_size; - u8 value[16]; - - remove_ima_buffer(fdt, chosen_node); - if (!image->arch.ima_buffer_size) - return 0; - - ret = get_addr_size_cells(&addr_cells, &size_cells); - if (ret) - return ret; - - entry_size = 4 * (addr_cells + size_cells); - - if (entry_size > sizeof(value)) - return -EINVAL; - - ret = write_number(value, image->arch.ima_buffer_addr, addr_cells); - if (ret) - return ret; - - ret = write_number(value + 4 * addr_cells, image->arch.ima_buffer_size, - size_cells); - if (ret) - return ret; - - ret = fdt_setprop(fdt, chosen_node, "linux,ima-kexec-buffer", value, - entry_size); - if (ret < 0) - return -EINVAL; - - ret = fdt_add_mem_rsv(fdt, image->arch.ima_buffer_addr, - image->arch.ima_buffer_size); - if (ret) - return -EINVAL; - - pr_debug("IMA buffer at 0x%llx, size = 0x%zx\n", - image->arch.ima_buffer_addr, image->arch.ima_buffer_size); - - return 0; -} -#endif /* CONFIG_IMA_KEXEC */ diff --git a/arch/powerpc/kernel/io-workarounds.c b/arch/powerpc/kernel/io-workarounds.c index fbd2d0007c52..0276bc8c8969 100644 --- 
a/arch/powerpc/kernel/io-workarounds.c +++ b/arch/powerpc/kernel/io-workarounds.c @@ -149,8 +149,8 @@ static const struct ppc_pci_io iowa_pci_io = { }; #ifdef CONFIG_PPC_INDIRECT_MMIO -static void __iomem *iowa_ioremap(phys_addr_t addr, unsigned long size, - pgprot_t prot, void *caller) +void __iomem *iowa_ioremap(phys_addr_t addr, unsigned long size, + pgprot_t prot, void *caller) { struct iowa_bus *bus; void __iomem *res = __ioremap_caller(addr, size, prot, caller); @@ -163,20 +163,17 @@ static void __iomem *iowa_ioremap(phys_addr_t addr, unsigned long size, } return res; } -#else /* CONFIG_PPC_INDIRECT_MMIO */ -#define iowa_ioremap NULL #endif /* !CONFIG_PPC_INDIRECT_MMIO */ +bool io_workaround_inited; + /* Enable IO workaround */ static void io_workaround_init(void) { - static int io_workaround_inited; - if (io_workaround_inited) return; ppc_pci_io = iowa_pci_io; - ppc_md.ioremap = iowa_ioremap; - io_workaround_inited = 1; + io_workaround_inited = true; } /* Register new bus to support workaround */ diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 0a67ce9f827e..9704f3f76e63 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -633,11 +633,54 @@ static void iommu_table_clear(struct iommu_table *tbl) #endif } +static void iommu_table_reserve_pages(struct iommu_table *tbl, + unsigned long res_start, unsigned long res_end) +{ + int i; + + WARN_ON_ONCE(res_end < res_start); + /* + * Reserve page 0 so it will not be used for any mappings. + * This avoids buggy drivers that consider page 0 to be invalid + * to crash the machine or even lose data. 
+ */ + if (tbl->it_offset == 0) + set_bit(0, tbl->it_map); + + tbl->it_reserved_start = res_start; + tbl->it_reserved_end = res_end; + + /* Check if res_start..res_end isn't empty and overlaps the table */ + if (res_start && res_end && + (tbl->it_offset + tbl->it_size < res_start || + res_end < tbl->it_offset)) + return; + + for (i = tbl->it_reserved_start; i < tbl->it_reserved_end; ++i) + set_bit(i - tbl->it_offset, tbl->it_map); +} + +static void iommu_table_release_pages(struct iommu_table *tbl) +{ + int i; + + /* + * In case we have reserved the first bit, we should not emit + * the warning below. + */ + if (tbl->it_offset == 0) + clear_bit(0, tbl->it_map); + + for (i = tbl->it_reserved_start; i < tbl->it_reserved_end; ++i) + clear_bit(i - tbl->it_offset, tbl->it_map); +} + /* * Build a iommu_table structure. This contains a bit map which * is used to manage allocation of the tce space. */ -struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid) +struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid, + unsigned long res_start, unsigned long res_end) { unsigned long sz; static int welcomed = 0; @@ -656,13 +699,7 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid) tbl->it_map = page_address(page); memset(tbl->it_map, 0, sz); - /* - * Reserve page 0 so it will not be used for any mappings. - * This avoids buggy drivers that consider page 0 to be invalid - * to crash the machine or even lose data. - */ - if (tbl->it_offset == 0) - set_bit(0, tbl->it_map); + iommu_table_reserve_pages(tbl, res_start, res_end); /* We only split the IOMMU table if we have 1GB or more of space */ if ((tbl->it_size << tbl->it_page_shift) >= (1UL * 1024 * 1024 * 1024)) @@ -714,12 +751,7 @@ static void iommu_table_free(struct kref *kref) return; } - /* - * In case we have reserved the first bit, we should not emit - * the warning below. 
- */ - if (tbl->it_offset == 0) - clear_bit(0, tbl->it_map); + iommu_table_release_pages(tbl); /* verify that table contains no entries */ if (!bitmap_empty(tbl->it_map, tbl->it_size)) @@ -981,29 +1013,32 @@ int iommu_tce_check_gpa(unsigned long page_shift, unsigned long gpa) } EXPORT_SYMBOL_GPL(iommu_tce_check_gpa); -long iommu_tce_xchg(struct mm_struct *mm, struct iommu_table *tbl, +extern long iommu_tce_xchg_no_kill(struct mm_struct *mm, + struct iommu_table *tbl, unsigned long entry, unsigned long *hpa, enum dma_data_direction *direction) { long ret; unsigned long size = 0; - ret = tbl->it_ops->exchange(tbl, entry, hpa, direction); - + ret = tbl->it_ops->xchg_no_kill(tbl, entry, hpa, direction, false); if (!ret && ((*direction == DMA_FROM_DEVICE) || (*direction == DMA_BIDIRECTIONAL)) && !mm_iommu_is_devmem(mm, *hpa, tbl->it_page_shift, &size)) SetPageDirty(pfn_to_page(*hpa >> PAGE_SHIFT)); - /* if (unlikely(ret)) - pr_err("iommu_tce: %s failed on hwaddr=%lx ioba=%lx kva=%lx ret=%d\n", - __func__, hwaddr, entry << tbl->it_page_shift, - hwaddr, ret); */ - return ret; } -EXPORT_SYMBOL_GPL(iommu_tce_xchg); +EXPORT_SYMBOL_GPL(iommu_tce_xchg_no_kill); + +void iommu_tce_kill(struct iommu_table *tbl, + unsigned long entry, unsigned long pages) +{ + if (tbl->it_ops->tce_kill) + tbl->it_ops->tce_kill(tbl, entry, pages, false); +} +EXPORT_SYMBOL_GPL(iommu_tce_kill); int iommu_take_ownership(struct iommu_table *tbl) { @@ -1017,22 +1052,21 @@ int iommu_take_ownership(struct iommu_table *tbl) * requires exchange() callback defined so if it is not * implemented, we disallow taking ownership over the table. 
*/ - if (!tbl->it_ops->exchange) + if (!tbl->it_ops->xchg_no_kill) return -EINVAL; spin_lock_irqsave(&tbl->large_pool.lock, flags); for (i = 0; i < tbl->nr_pools; i++) spin_lock(&tbl->pools[i].lock); - if (tbl->it_offset == 0) - clear_bit(0, tbl->it_map); + iommu_table_release_pages(tbl); if (!bitmap_empty(tbl->it_map, tbl->it_size)) { pr_err("iommu_tce: it_map is not empty"); ret = -EBUSY; - /* Restore bit#0 set by iommu_init_table() */ - if (tbl->it_offset == 0) - set_bit(0, tbl->it_map); + /* Undo iommu_table_release_pages, i.e. restore bit#0, etc */ + iommu_table_reserve_pages(tbl, tbl->it_reserved_start, + tbl->it_reserved_end); } else { memset(tbl->it_map, 0xff, sz); } @@ -1055,9 +1089,8 @@ void iommu_release_ownership(struct iommu_table *tbl) memset(tbl->it_map, 0, sz); - /* Restore bit#0 set by iommu_init_table() */ - if (tbl->it_offset == 0) - set_bit(0, tbl->it_map); + iommu_table_reserve_pages(tbl, tbl->it_reserved_start, + tbl->it_reserved_end); for (i = 0; i < tbl->nr_pools; i++) spin_unlock(&tbl->pools[i].lock); diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 5645bc9cbc09..5c9b11878555 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -50,6 +50,7 @@ #include <linux/debugfs.h> #include <linux/of.h> #include <linux/of_irq.h> +#include <linux/vmalloc.h> #include <linux/uaccess.h> #include <asm/io.h> @@ -619,8 +620,6 @@ void __do_irq(struct pt_regs *regs) trace_irq_entry(regs); - check_stack_overflow(); - /* * Query the platform PIC for the interrupt & ack it. * @@ -652,6 +651,8 @@ void do_IRQ(struct pt_regs *regs) irqsp = hardirq_ctx[raw_smp_processor_id()]; sirqsp = softirq_ctx[raw_smp_processor_id()]; + check_stack_overflow(); + /* Already there ? 
*/ if (unlikely(cursp == irqsp || cursp == sirqsp)) { __do_irq(regs); @@ -664,8 +665,29 @@ void do_IRQ(struct pt_regs *regs) set_irq_regs(old_regs); } +static void *__init alloc_vm_stack(void) +{ + return __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN, VMALLOC_START, + VMALLOC_END, THREADINFO_GFP, PAGE_KERNEL, + 0, NUMA_NO_NODE, (void*)_RET_IP_); +} + +static void __init vmap_irqstack_init(void) +{ + int i; + + for_each_possible_cpu(i) { + softirq_ctx[i] = alloc_vm_stack(); + hardirq_ctx[i] = alloc_vm_stack(); + } +} + + void __init init_IRQ(void) { + if (IS_ENABLED(CONFIG_VMAP_STACK)) + vmap_irqstack_init(); + if (ppc_md.init_IRQ) ppc_md.init_IRQ(); } diff --git a/arch/powerpc/kernel/kexec_elf_64.c b/arch/powerpc/kernel/kexec_elf_64.c deleted file mode 100644 index 83cf7b852876..000000000000 --- a/arch/powerpc/kernel/kexec_elf_64.c +++ /dev/null @@ -1,660 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Load ELF vmlinux file for the kexec_file_load syscall. - * - * Copyright (C) 2004 Adam Litke (agl@us.ibm.com) - * Copyright (C) 2004 IBM Corp. - * Copyright (C) 2005 R Sharada (sharada@in.ibm.com) - * Copyright (C) 2006 Mohan Kumar M (mohan@in.ibm.com) - * Copyright (C) 2016 IBM Corporation - * - * Based on kexec-tools' kexec-elf-exec.c and kexec-elf-ppc64.c. - * Heavily modified for the kernel by - * Thiago Jung Bauermann <bauerman@linux.vnet.ibm.com>. - */ - -#define pr_fmt(fmt) "kexec_elf: " fmt - -#include <linux/elf.h> -#include <linux/kexec.h> -#include <linux/libfdt.h> -#include <linux/module.h> -#include <linux/of_fdt.h> -#include <linux/slab.h> -#include <linux/types.h> - -#define PURGATORY_STACK_SIZE (16 * 1024) - -#define elf_addr_to_cpu elf64_to_cpu - -#ifndef Elf_Rel -#define Elf_Rel Elf64_Rel -#endif /* Elf_Rel */ - -struct elf_info { - /* - * Where the ELF binary contents are kept. - * Memory managed by the user of the struct. 
- */ - const char *buffer; - - const struct elfhdr *ehdr; - const struct elf_phdr *proghdrs; - struct elf_shdr *sechdrs; -}; - -static inline bool elf_is_elf_file(const struct elfhdr *ehdr) -{ - return memcmp(ehdr->e_ident, ELFMAG, SELFMAG) == 0; -} - -static uint64_t elf64_to_cpu(const struct elfhdr *ehdr, uint64_t value) -{ - if (ehdr->e_ident[EI_DATA] == ELFDATA2LSB) - value = le64_to_cpu(value); - else if (ehdr->e_ident[EI_DATA] == ELFDATA2MSB) - value = be64_to_cpu(value); - - return value; -} - -static uint16_t elf16_to_cpu(const struct elfhdr *ehdr, uint16_t value) -{ - if (ehdr->e_ident[EI_DATA] == ELFDATA2LSB) - value = le16_to_cpu(value); - else if (ehdr->e_ident[EI_DATA] == ELFDATA2MSB) - value = be16_to_cpu(value); - - return value; -} - -static uint32_t elf32_to_cpu(const struct elfhdr *ehdr, uint32_t value) -{ - if (ehdr->e_ident[EI_DATA] == ELFDATA2LSB) - value = le32_to_cpu(value); - else if (ehdr->e_ident[EI_DATA] == ELFDATA2MSB) - value = be32_to_cpu(value); - - return value; -} - -/** - * elf_is_ehdr_sane - check that it is safe to use the ELF header - * @buf_len: size of the buffer in which the ELF file is loaded. - */ -static bool elf_is_ehdr_sane(const struct elfhdr *ehdr, size_t buf_len) -{ - if (ehdr->e_phnum > 0 && ehdr->e_phentsize != sizeof(struct elf_phdr)) { - pr_debug("Bad program header size.\n"); - return false; - } else if (ehdr->e_shnum > 0 && - ehdr->e_shentsize != sizeof(struct elf_shdr)) { - pr_debug("Bad section header size.\n"); - return false; - } else if (ehdr->e_ident[EI_VERSION] != EV_CURRENT || - ehdr->e_version != EV_CURRENT) { - pr_debug("Unknown ELF version.\n"); - return false; - } - - if (ehdr->e_phoff > 0 && ehdr->e_phnum > 0) { - size_t phdr_size; - - /* - * e_phnum is at most 65535 so calculating the size of the - * program header cannot overflow. - */ - phdr_size = sizeof(struct elf_phdr) * ehdr->e_phnum; - - /* Sanity check the program header table location. 
*/ - if (ehdr->e_phoff + phdr_size < ehdr->e_phoff) { - pr_debug("Program headers at invalid location.\n"); - return false; - } else if (ehdr->e_phoff + phdr_size > buf_len) { - pr_debug("Program headers truncated.\n"); - return false; - } - } - - if (ehdr->e_shoff > 0 && ehdr->e_shnum > 0) { - size_t shdr_size; - - /* - * e_shnum is at most 65536 so calculating - * the size of the section header cannot overflow. - */ - shdr_size = sizeof(struct elf_shdr) * ehdr->e_shnum; - - /* Sanity check the section header table location. */ - if (ehdr->e_shoff + shdr_size < ehdr->e_shoff) { - pr_debug("Section headers at invalid location.\n"); - return false; - } else if (ehdr->e_shoff + shdr_size > buf_len) { - pr_debug("Section headers truncated.\n"); - return false; - } - } - - return true; -} - -static int elf_read_ehdr(const char *buf, size_t len, struct elfhdr *ehdr) -{ - struct elfhdr *buf_ehdr; - - if (len < sizeof(*buf_ehdr)) { - pr_debug("Buffer is too small to hold ELF header.\n"); - return -ENOEXEC; - } - - memset(ehdr, 0, sizeof(*ehdr)); - memcpy(ehdr->e_ident, buf, sizeof(ehdr->e_ident)); - if (!elf_is_elf_file(ehdr)) { - pr_debug("No ELF header magic.\n"); - return -ENOEXEC; - } - - if (ehdr->e_ident[EI_CLASS] != ELF_CLASS) { - pr_debug("Not a supported ELF class.\n"); - return -ENOEXEC; - } else if (ehdr->e_ident[EI_DATA] != ELFDATA2LSB && - ehdr->e_ident[EI_DATA] != ELFDATA2MSB) { - pr_debug("Not a supported ELF data format.\n"); - return -ENOEXEC; - } - - buf_ehdr = (struct elfhdr *) buf; - if (elf16_to_cpu(ehdr, buf_ehdr->e_ehsize) != sizeof(*buf_ehdr)) { - pr_debug("Bad ELF header size.\n"); - return -ENOEXEC; - } - - ehdr->e_type = elf16_to_cpu(ehdr, buf_ehdr->e_type); - ehdr->e_machine = elf16_to_cpu(ehdr, buf_ehdr->e_machine); - ehdr->e_version = elf32_to_cpu(ehdr, buf_ehdr->e_version); - ehdr->e_entry = elf_addr_to_cpu(ehdr, buf_ehdr->e_entry); - ehdr->e_phoff = elf_addr_to_cpu(ehdr, buf_ehdr->e_phoff); - ehdr->e_shoff = elf_addr_to_cpu(ehdr, 
buf_ehdr->e_shoff); - ehdr->e_flags = elf32_to_cpu(ehdr, buf_ehdr->e_flags); - ehdr->e_phentsize = elf16_to_cpu(ehdr, buf_ehdr->e_phentsize); - ehdr->e_phnum = elf16_to_cpu(ehdr, buf_ehdr->e_phnum); - ehdr->e_shentsize = elf16_to_cpu(ehdr, buf_ehdr->e_shentsize); - ehdr->e_shnum = elf16_to_cpu(ehdr, buf_ehdr->e_shnum); - ehdr->e_shstrndx = elf16_to_cpu(ehdr, buf_ehdr->e_shstrndx); - - return elf_is_ehdr_sane(ehdr, len) ? 0 : -ENOEXEC; -} - -/** - * elf_is_phdr_sane - check that it is safe to use the program header - * @buf_len: size of the buffer in which the ELF file is loaded. - */ -static bool elf_is_phdr_sane(const struct elf_phdr *phdr, size_t buf_len) -{ - - if (phdr->p_offset + phdr->p_filesz < phdr->p_offset) { - pr_debug("ELF segment location wraps around.\n"); - return false; - } else if (phdr->p_offset + phdr->p_filesz > buf_len) { - pr_debug("ELF segment not in file.\n"); - return false; - } else if (phdr->p_paddr + phdr->p_memsz < phdr->p_paddr) { - pr_debug("ELF segment address wraps around.\n"); - return false; - } - - return true; -} - -static int elf_read_phdr(const char *buf, size_t len, struct elf_info *elf_info, - int idx) -{ - /* Override the const in proghdrs, we are the ones doing the loading. */ - struct elf_phdr *phdr = (struct elf_phdr *) &elf_info->proghdrs[idx]; - const char *pbuf; - struct elf_phdr *buf_phdr; - - pbuf = buf + elf_info->ehdr->e_phoff + (idx * sizeof(*buf_phdr)); - buf_phdr = (struct elf_phdr *) pbuf; - - phdr->p_type = elf32_to_cpu(elf_info->ehdr, buf_phdr->p_type); - phdr->p_offset = elf_addr_to_cpu(elf_info->ehdr, buf_phdr->p_offset); - phdr->p_paddr = elf_addr_to_cpu(elf_info->ehdr, buf_phdr->p_paddr); - phdr->p_vaddr = elf_addr_to_cpu(elf_info->ehdr, buf_phdr->p_vaddr); - phdr->p_flags = elf32_to_cpu(elf_info->ehdr, buf_phdr->p_flags); - - /* - * The following fields have a type equivalent to Elf_Addr - * both in 32 bit and 64 bit ELF. 
- */ - phdr->p_filesz = elf_addr_to_cpu(elf_info->ehdr, buf_phdr->p_filesz); - phdr->p_memsz = elf_addr_to_cpu(elf_info->ehdr, buf_phdr->p_memsz); - phdr->p_align = elf_addr_to_cpu(elf_info->ehdr, buf_phdr->p_align); - - return elf_is_phdr_sane(phdr, len) ? 0 : -ENOEXEC; -} - -/** - * elf_read_phdrs - read the program headers from the buffer - * - * This function assumes that the program header table was checked for sanity. - * Use elf_is_ehdr_sane() if it wasn't. - */ -static int elf_read_phdrs(const char *buf, size_t len, - struct elf_info *elf_info) -{ - size_t phdr_size, i; - const struct elfhdr *ehdr = elf_info->ehdr; - - /* - * e_phnum is at most 65535 so calculating the size of the - * program header cannot overflow. - */ - phdr_size = sizeof(struct elf_phdr) * ehdr->e_phnum; - - elf_info->proghdrs = kzalloc(phdr_size, GFP_KERNEL); - if (!elf_info->proghdrs) - return -ENOMEM; - - for (i = 0; i < ehdr->e_phnum; i++) { - int ret; - - ret = elf_read_phdr(buf, len, elf_info, i); - if (ret) { - kfree(elf_info->proghdrs); - elf_info->proghdrs = NULL; - return ret; - } - } - - return 0; -} - -/** - * elf_is_shdr_sane - check that it is safe to use the section header - * @buf_len: size of the buffer in which the ELF file is loaded. - */ -static bool elf_is_shdr_sane(const struct elf_shdr *shdr, size_t buf_len) -{ - bool size_ok; - - /* SHT_NULL headers have undefined values, so we can't check them. */ - if (shdr->sh_type == SHT_NULL) - return true; - - /* Now verify sh_entsize */ - switch (shdr->sh_type) { - case SHT_SYMTAB: - size_ok = shdr->sh_entsize == sizeof(Elf_Sym); - break; - case SHT_RELA: - size_ok = shdr->sh_entsize == sizeof(Elf_Rela); - break; - case SHT_DYNAMIC: - size_ok = shdr->sh_entsize == sizeof(Elf_Dyn); - break; - case SHT_REL: - size_ok = shdr->sh_entsize == sizeof(Elf_Rel); - break; - case SHT_NOTE: - case SHT_PROGBITS: - case SHT_HASH: - case SHT_NOBITS: - default: - /* - * This is a section whose entsize requirements - * I don't care about. 
If I don't know about - * the section I can't care about it's entsize - * requirements. - */ - size_ok = true; - break; - } - - if (!size_ok) { - pr_debug("ELF section with wrong entry size.\n"); - return false; - } else if (shdr->sh_addr + shdr->sh_size < shdr->sh_addr) { - pr_debug("ELF section address wraps around.\n"); - return false; - } - - if (shdr->sh_type != SHT_NOBITS) { - if (shdr->sh_offset + shdr->sh_size < shdr->sh_offset) { - pr_debug("ELF section location wraps around.\n"); - return false; - } else if (shdr->sh_offset + shdr->sh_size > buf_len) { - pr_debug("ELF section not in file.\n"); - return false; - } - } - - return true; -} - -static int elf_read_shdr(const char *buf, size_t len, struct elf_info *elf_info, - int idx) -{ - struct elf_shdr *shdr = &elf_info->sechdrs[idx]; - const struct elfhdr *ehdr = elf_info->ehdr; - const char *sbuf; - struct elf_shdr *buf_shdr; - - sbuf = buf + ehdr->e_shoff + idx * sizeof(*buf_shdr); - buf_shdr = (struct elf_shdr *) sbuf; - - shdr->sh_name = elf32_to_cpu(ehdr, buf_shdr->sh_name); - shdr->sh_type = elf32_to_cpu(ehdr, buf_shdr->sh_type); - shdr->sh_addr = elf_addr_to_cpu(ehdr, buf_shdr->sh_addr); - shdr->sh_offset = elf_addr_to_cpu(ehdr, buf_shdr->sh_offset); - shdr->sh_link = elf32_to_cpu(ehdr, buf_shdr->sh_link); - shdr->sh_info = elf32_to_cpu(ehdr, buf_shdr->sh_info); - - /* - * The following fields have a type equivalent to Elf_Addr - * both in 32 bit and 64 bit ELF. - */ - shdr->sh_flags = elf_addr_to_cpu(ehdr, buf_shdr->sh_flags); - shdr->sh_size = elf_addr_to_cpu(ehdr, buf_shdr->sh_size); - shdr->sh_addralign = elf_addr_to_cpu(ehdr, buf_shdr->sh_addralign); - shdr->sh_entsize = elf_addr_to_cpu(ehdr, buf_shdr->sh_entsize); - - return elf_is_shdr_sane(shdr, len) ? 0 : -ENOEXEC; -} - -/** - * elf_read_shdrs - read the section headers from the buffer - * - * This function assumes that the section header table was checked for sanity. - * Use elf_is_ehdr_sane() if it wasn't. 
- */ -static int elf_read_shdrs(const char *buf, size_t len, - struct elf_info *elf_info) -{ - size_t shdr_size, i; - - /* - * e_shnum is at most 65536 so calculating - * the size of the section header cannot overflow. - */ - shdr_size = sizeof(struct elf_shdr) * elf_info->ehdr->e_shnum; - - elf_info->sechdrs = kzalloc(shdr_size, GFP_KERNEL); - if (!elf_info->sechdrs) - return -ENOMEM; - - for (i = 0; i < elf_info->ehdr->e_shnum; i++) { - int ret; - - ret = elf_read_shdr(buf, len, elf_info, i); - if (ret) { - kfree(elf_info->sechdrs); - elf_info->sechdrs = NULL; - return ret; - } - } - - return 0; -} - -/** - * elf_read_from_buffer - read ELF file and sets up ELF header and ELF info - * @buf: Buffer to read ELF file from. - * @len: Size of @buf. - * @ehdr: Pointer to existing struct which will be populated. - * @elf_info: Pointer to existing struct which will be populated. - * - * This function allows reading ELF files with different byte order than - * the kernel, byte-swapping the fields as needed. - * - * Return: - * On success returns 0, and the caller should call elf_free_info(elf_info) to - * free the memory allocated for the section and program headers. 
- */ -int elf_read_from_buffer(const char *buf, size_t len, struct elfhdr *ehdr, - struct elf_info *elf_info) -{ - int ret; - - ret = elf_read_ehdr(buf, len, ehdr); - if (ret) - return ret; - - elf_info->buffer = buf; - elf_info->ehdr = ehdr; - if (ehdr->e_phoff > 0 && ehdr->e_phnum > 0) { - ret = elf_read_phdrs(buf, len, elf_info); - if (ret) - return ret; - } - if (ehdr->e_shoff > 0 && ehdr->e_shnum > 0) { - ret = elf_read_shdrs(buf, len, elf_info); - if (ret) { - kfree(elf_info->proghdrs); - return ret; - } - } - - return 0; -} - -/** - * elf_free_info - free memory allocated by elf_read_from_buffer - */ -void elf_free_info(struct elf_info *elf_info) -{ - kfree(elf_info->proghdrs); - kfree(elf_info->sechdrs); - memset(elf_info, 0, sizeof(*elf_info)); -} -/** - * build_elf_exec_info - read ELF executable and check that we can use it - */ -static int build_elf_exec_info(const char *buf, size_t len, struct elfhdr *ehdr, - struct elf_info *elf_info) -{ - int i; - int ret; - - ret = elf_read_from_buffer(buf, len, ehdr, elf_info); - if (ret) - return ret; - - /* Big endian vmlinux has type ET_DYN. */ - if (ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN) { - pr_err("Not an ELF executable.\n"); - goto error; - } else if (!elf_info->proghdrs) { - pr_err("No ELF program header.\n"); - goto error; - } - - for (i = 0; i < ehdr->e_phnum; i++) { - /* - * Kexec does not support loading interpreters. - * In addition this check keeps us from attempting - * to kexec ordinay executables. - */ - if (elf_info->proghdrs[i].p_type == PT_INTERP) { - pr_err("Requires an ELF interpreter.\n"); - goto error; - } - } - - return 0; -error: - elf_free_info(elf_info); - return -ENOEXEC; -} - -static int elf64_probe(const char *buf, unsigned long len) -{ - struct elfhdr ehdr; - struct elf_info elf_info; - int ret; - - ret = build_elf_exec_info(buf, len, &ehdr, &elf_info); - if (ret) - return ret; - - elf_free_info(&elf_info); - - return elf_check_arch(&ehdr) ? 
0 : -ENOEXEC; -} - -/** - * elf_exec_load - load ELF executable image - * @lowest_load_addr: On return, will be the address where the first PT_LOAD - * section will be loaded in memory. - * - * Return: - * 0 on success, negative value on failure. - */ -static int elf_exec_load(struct kimage *image, struct elfhdr *ehdr, - struct elf_info *elf_info, - unsigned long *lowest_load_addr) -{ - unsigned long base = 0, lowest_addr = UINT_MAX; - int ret; - size_t i; - struct kexec_buf kbuf = { .image = image, .buf_max = ppc64_rma_size, - .top_down = false }; - - /* Read in the PT_LOAD segments. */ - for (i = 0; i < ehdr->e_phnum; i++) { - unsigned long load_addr; - size_t size; - const struct elf_phdr *phdr; - - phdr = &elf_info->proghdrs[i]; - if (phdr->p_type != PT_LOAD) - continue; - - size = phdr->p_filesz; - if (size > phdr->p_memsz) - size = phdr->p_memsz; - - kbuf.buffer = (void *) elf_info->buffer + phdr->p_offset; - kbuf.bufsz = size; - kbuf.memsz = phdr->p_memsz; - kbuf.buf_align = phdr->p_align; - kbuf.buf_min = phdr->p_paddr + base; - kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; - ret = kexec_add_buffer(&kbuf); - if (ret) - goto out; - load_addr = kbuf.mem; - - if (load_addr < lowest_addr) - lowest_addr = load_addr; - } - - /* Update entry point to reflect new load address. 
*/ - ehdr->e_entry += base; - - *lowest_load_addr = lowest_addr; - ret = 0; - out: - return ret; -} - -static void *elf64_load(struct kimage *image, char *kernel_buf, - unsigned long kernel_len, char *initrd, - unsigned long initrd_len, char *cmdline, - unsigned long cmdline_len) -{ - int ret; - unsigned int fdt_size; - unsigned long kernel_load_addr; - unsigned long initrd_load_addr = 0, fdt_load_addr; - void *fdt; - const void *slave_code; - struct elfhdr ehdr; - struct elf_info elf_info; - struct kexec_buf kbuf = { .image = image, .buf_min = 0, - .buf_max = ppc64_rma_size }; - struct kexec_buf pbuf = { .image = image, .buf_min = 0, - .buf_max = ppc64_rma_size, .top_down = true, - .mem = KEXEC_BUF_MEM_UNKNOWN }; - - ret = build_elf_exec_info(kernel_buf, kernel_len, &ehdr, &elf_info); - if (ret) - goto out; - - ret = elf_exec_load(image, &ehdr, &elf_info, &kernel_load_addr); - if (ret) - goto out; - - pr_debug("Loaded the kernel at 0x%lx\n", kernel_load_addr); - - ret = kexec_load_purgatory(image, &pbuf); - if (ret) { - pr_err("Loading purgatory failed.\n"); - goto out; - } - - pr_debug("Loaded purgatory at 0x%lx\n", pbuf.mem); - - if (initrd != NULL) { - kbuf.buffer = initrd; - kbuf.bufsz = kbuf.memsz = initrd_len; - kbuf.buf_align = PAGE_SIZE; - kbuf.top_down = false; - kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; - ret = kexec_add_buffer(&kbuf); - if (ret) - goto out; - initrd_load_addr = kbuf.mem; - - pr_debug("Loaded initrd at 0x%lx\n", initrd_load_addr); - } - - fdt_size = fdt_totalsize(initial_boot_params) * 2; - fdt = kmalloc(fdt_size, GFP_KERNEL); - if (!fdt) { - pr_err("Not enough memory for the device tree.\n"); - ret = -ENOMEM; - goto out; - } - ret = fdt_open_into(initial_boot_params, fdt, fdt_size); - if (ret < 0) { - pr_err("Error setting up the new device tree.\n"); - ret = -EINVAL; - goto out; - } - - ret = setup_new_fdt(image, fdt, initrd_load_addr, initrd_len, cmdline); - if (ret) - goto out; - - fdt_pack(fdt); - - kbuf.buffer = fdt; - kbuf.bufsz = 
kbuf.memsz = fdt_size; - kbuf.buf_align = PAGE_SIZE; - kbuf.top_down = true; - kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; - ret = kexec_add_buffer(&kbuf); - if (ret) - goto out; - fdt_load_addr = kbuf.mem; - - pr_debug("Loaded device tree at 0x%lx\n", fdt_load_addr); - - slave_code = elf_info.buffer + elf_info.proghdrs[0].p_offset; - ret = setup_purgatory(image, slave_code, fdt, kernel_load_addr, - fdt_load_addr); - if (ret) - pr_err("Error setting up the purgatory.\n"); - -out: - elf_free_info(&elf_info); - - /* Make kimage_file_post_load_cleanup free the fdt buffer for us. */ - return ret ? ERR_PTR(ret) : fdt; -} - -const struct kexec_file_ops kexec_elf64_ops = { - .probe = elf64_probe, - .load = elf64_load, -}; diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c index b7b3a5e4e224..617eba82531c 100644 --- a/arch/powerpc/kernel/kvm.c +++ b/arch/powerpc/kernel/kvm.c @@ -64,16 +64,17 @@ #define KVM_INST_MTSRIN 0x7c0001e4 static bool kvm_patching_worked = true; -char kvm_tmp[1024 * 1024]; +extern char kvm_tmp[]; +extern char kvm_tmp_end[]; static int kvm_tmp_index; -static inline void kvm_patch_ins(u32 *inst, u32 new_inst) +static void __init kvm_patch_ins(u32 *inst, u32 new_inst) { *inst = new_inst; flush_icache_range((ulong)inst, (ulong)inst + 4); } -static void kvm_patch_ins_ll(u32 *inst, long addr, u32 rt) +static void __init kvm_patch_ins_ll(u32 *inst, long addr, u32 rt) { #ifdef CONFIG_64BIT kvm_patch_ins(inst, KVM_INST_LD | rt | (addr & 0x0000fffc)); @@ -82,7 +83,7 @@ static void kvm_patch_ins_ll(u32 *inst, long addr, u32 rt) #endif } -static void kvm_patch_ins_ld(u32 *inst, long addr, u32 rt) +static void __init kvm_patch_ins_ld(u32 *inst, long addr, u32 rt) { #ifdef CONFIG_64BIT kvm_patch_ins(inst, KVM_INST_LD | rt | (addr & 0x0000fffc)); @@ -91,12 +92,12 @@ static void kvm_patch_ins_ld(u32 *inst, long addr, u32 rt) #endif } -static void kvm_patch_ins_lwz(u32 *inst, long addr, u32 rt) +static void __init kvm_patch_ins_lwz(u32 *inst, long addr, u32 rt) 
{ kvm_patch_ins(inst, KVM_INST_LWZ | rt | (addr & 0x0000ffff)); } -static void kvm_patch_ins_std(u32 *inst, long addr, u32 rt) +static void __init kvm_patch_ins_std(u32 *inst, long addr, u32 rt) { #ifdef CONFIG_64BIT kvm_patch_ins(inst, KVM_INST_STD | rt | (addr & 0x0000fffc)); @@ -105,17 +106,17 @@ static void kvm_patch_ins_std(u32 *inst, long addr, u32 rt) #endif } -static void kvm_patch_ins_stw(u32 *inst, long addr, u32 rt) +static void __init kvm_patch_ins_stw(u32 *inst, long addr, u32 rt) { kvm_patch_ins(inst, KVM_INST_STW | rt | (addr & 0x0000fffc)); } -static void kvm_patch_ins_nop(u32 *inst) +static void __init kvm_patch_ins_nop(u32 *inst) { kvm_patch_ins(inst, KVM_INST_NOP); } -static void kvm_patch_ins_b(u32 *inst, int addr) +static void __init kvm_patch_ins_b(u32 *inst, int addr) { #if defined(CONFIG_RELOCATABLE) && defined(CONFIG_PPC_BOOK3S) /* On relocatable kernels interrupts handlers and our code @@ -128,11 +129,11 @@ static void kvm_patch_ins_b(u32 *inst, int addr) kvm_patch_ins(inst, KVM_INST_B | (addr & KVM_INST_B_MASK)); } -static u32 *kvm_alloc(int len) +static u32 * __init kvm_alloc(int len) { u32 *p; - if ((kvm_tmp_index + len) > ARRAY_SIZE(kvm_tmp)) { + if ((kvm_tmp_index + len) > (kvm_tmp_end - kvm_tmp)) { printk(KERN_ERR "KVM: No more space (%d + %d)\n", kvm_tmp_index, len); kvm_patching_worked = false; @@ -151,7 +152,7 @@ extern u32 kvm_emulate_mtmsrd_orig_ins_offs; extern u32 kvm_emulate_mtmsrd_len; extern u32 kvm_emulate_mtmsrd[]; -static void kvm_patch_ins_mtmsrd(u32 *inst, u32 rt) +static void __init kvm_patch_ins_mtmsrd(u32 *inst, u32 rt) { u32 *p; int distance_start; @@ -204,7 +205,7 @@ extern u32 kvm_emulate_mtmsr_orig_ins_offs; extern u32 kvm_emulate_mtmsr_len; extern u32 kvm_emulate_mtmsr[]; -static void kvm_patch_ins_mtmsr(u32 *inst, u32 rt) +static void __init kvm_patch_ins_mtmsr(u32 *inst, u32 rt) { u32 *p; int distance_start; @@ -265,7 +266,7 @@ extern u32 kvm_emulate_wrtee_orig_ins_offs; extern u32 kvm_emulate_wrtee_len; 
extern u32 kvm_emulate_wrtee[]; -static void kvm_patch_ins_wrtee(u32 *inst, u32 rt, int imm_one) +static void __init kvm_patch_ins_wrtee(u32 *inst, u32 rt, int imm_one) { u32 *p; int distance_start; @@ -322,7 +323,7 @@ extern u32 kvm_emulate_wrteei_0_branch_offs; extern u32 kvm_emulate_wrteei_0_len; extern u32 kvm_emulate_wrteei_0[]; -static void kvm_patch_ins_wrteei_0(u32 *inst) +static void __init kvm_patch_ins_wrteei_0(u32 *inst) { u32 *p; int distance_start; @@ -363,7 +364,7 @@ extern u32 kvm_emulate_mtsrin_orig_ins_offs; extern u32 kvm_emulate_mtsrin_len; extern u32 kvm_emulate_mtsrin[]; -static void kvm_patch_ins_mtsrin(u32 *inst, u32 rt, u32 rb) +static void __init kvm_patch_ins_mtsrin(u32 *inst, u32 rt, u32 rb) { u32 *p; int distance_start; @@ -399,7 +400,7 @@ static void kvm_patch_ins_mtsrin(u32 *inst, u32 rt, u32 rb) #endif -static void kvm_map_magic_page(void *data) +static void __init kvm_map_magic_page(void *data) { u32 *features = data; @@ -414,7 +415,7 @@ static void kvm_map_magic_page(void *data) *features = out[0]; } -static void kvm_check_ins(u32 *inst, u32 features) +static void __init kvm_check_ins(u32 *inst, u32 features) { u32 _inst = *inst; u32 inst_no_rt = _inst & ~KVM_MASK_RT; @@ -658,7 +659,7 @@ static void kvm_check_ins(u32 *inst, u32 features) extern u32 kvm_template_start[]; extern u32 kvm_template_end[]; -static void kvm_use_magic_page(void) +static void __init kvm_use_magic_page(void) { u32 *p; u32 *start, *end; @@ -699,25 +700,13 @@ static void kvm_use_magic_page(void) kvm_patching_worked ? "worked" : "failed"); } -static __init void kvm_free_tmp(void) -{ - /* - * Inform kmemleak about the hole in the .bss section since the - * corresponding pages will be unmapped with DEBUG_PAGEALLOC=y. 
- */ - kmemleak_free_part(&kvm_tmp[kvm_tmp_index], - ARRAY_SIZE(kvm_tmp) - kvm_tmp_index); - free_reserved_area(&kvm_tmp[kvm_tmp_index], - &kvm_tmp[ARRAY_SIZE(kvm_tmp)], -1, NULL); -} - static int __init kvm_guest_init(void) { if (!kvm_para_available()) - goto free_tmp; + return 0; if (!epapr_paravirt_enabled) - goto free_tmp; + return 0; if (kvm_para_has_feature(KVM_FEATURE_MAGIC_PAGE)) kvm_use_magic_page(); @@ -727,9 +716,6 @@ static int __init kvm_guest_init(void) powersave_nap = 1; #endif -free_tmp: - kvm_free_tmp(); - return 0; } diff --git a/arch/powerpc/kernel/kvm_emul.S b/arch/powerpc/kernel/kvm_emul.S index eb2568f583ae..7af6f8b50c5d 100644 --- a/arch/powerpc/kernel/kvm_emul.S +++ b/arch/powerpc/kernel/kvm_emul.S @@ -192,6 +192,8 @@ kvm_emulate_mtmsr_orig_ins_offs: kvm_emulate_mtmsr_len: .long (kvm_emulate_mtmsr_end - kvm_emulate_mtmsr) / 4 +#ifdef CONFIG_BOOKE + /* also used for wrteei 1 */ .global kvm_emulate_wrtee kvm_emulate_wrtee: @@ -285,6 +287,10 @@ kvm_emulate_wrteei_0_branch_offs: kvm_emulate_wrteei_0_len: .long (kvm_emulate_wrteei_0_end - kvm_emulate_wrteei_0) / 4 +#endif /* CONFIG_BOOKE */ + +#ifdef CONFIG_PPC_BOOK3S_32 + .global kvm_emulate_mtsrin kvm_emulate_mtsrin: @@ -334,5 +340,15 @@ kvm_emulate_mtsrin_orig_ins_offs: kvm_emulate_mtsrin_len: .long (kvm_emulate_mtsrin_end - kvm_emulate_mtsrin) / 4 +#endif /* CONFIG_PPC_BOOK3S_32 */ + + .balign 4 + .global kvm_tmp +kvm_tmp: + .space (64 * 1024) + +.global kvm_tmp_end +kvm_tmp_end: + .global kvm_template_end kvm_template_end: diff --git a/arch/powerpc/kernel/legacy_serial.c b/arch/powerpc/kernel/legacy_serial.c index 7cea5978f21f..f061e06e9f51 100644 --- a/arch/powerpc/kernel/legacy_serial.c +++ b/arch/powerpc/kernel/legacy_serial.c @@ -479,8 +479,10 @@ static void __init fixup_port_irq(int index, port->irq = virq; #ifdef CONFIG_SERIAL_8250_FSL - if (of_device_is_compatible(np, "fsl,ns16550")) + if (of_device_is_compatible(np, "fsl,ns16550")) { port->handle_irq = fsl8250_handle_irq; + 
port->has_sysrq = IS_ENABLED(CONFIG_SERIAL_8250_CONSOLE); + } #endif } diff --git a/arch/powerpc/kernel/machine_kexec.c b/arch/powerpc/kernel/machine_kexec.c deleted file mode 100644 index c4ed328a7b96..000000000000 --- a/arch/powerpc/kernel/machine_kexec.c +++ /dev/null @@ -1,279 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Code to handle transition of Linux booting another kernel. - * - * Copyright (C) 2002-2003 Eric Biederman <ebiederm@xmission.com> - * GameCube/ppc32 port Copyright (C) 2004 Albert Herranz - * Copyright (C) 2005 IBM Corporation. - */ - -#include <linux/kexec.h> -#include <linux/reboot.h> -#include <linux/threads.h> -#include <linux/memblock.h> -#include <linux/of.h> -#include <linux/irq.h> -#include <linux/ftrace.h> - -#include <asm/kdump.h> -#include <asm/machdep.h> -#include <asm/pgalloc.h> -#include <asm/prom.h> -#include <asm/sections.h> - -void machine_kexec_mask_interrupts(void) { - unsigned int i; - struct irq_desc *desc; - - for_each_irq_desc(i, desc) { - struct irq_chip *chip; - - chip = irq_desc_get_chip(desc); - if (!chip) - continue; - - if (chip->irq_eoi && irqd_irq_inprogress(&desc->irq_data)) - chip->irq_eoi(&desc->irq_data); - - if (chip->irq_mask) - chip->irq_mask(&desc->irq_data); - - if (chip->irq_disable && !irqd_irq_disabled(&desc->irq_data)) - chip->irq_disable(&desc->irq_data); - } -} - -void machine_crash_shutdown(struct pt_regs *regs) -{ - default_machine_crash_shutdown(regs); -} - -/* - * Do what every setup is needed on image and the - * reboot code buffer to allow us to avoid allocations - * later. 
- */ -int machine_kexec_prepare(struct kimage *image) -{ - if (ppc_md.machine_kexec_prepare) - return ppc_md.machine_kexec_prepare(image); - else - return default_machine_kexec_prepare(image); -} - -void machine_kexec_cleanup(struct kimage *image) -{ -} - -void arch_crash_save_vmcoreinfo(void) -{ - -#ifdef CONFIG_NEED_MULTIPLE_NODES - VMCOREINFO_SYMBOL(node_data); - VMCOREINFO_LENGTH(node_data, MAX_NUMNODES); -#endif -#ifndef CONFIG_NEED_MULTIPLE_NODES - VMCOREINFO_SYMBOL(contig_page_data); -#endif -#if defined(CONFIG_PPC64) && defined(CONFIG_SPARSEMEM_VMEMMAP) - VMCOREINFO_SYMBOL(vmemmap_list); - VMCOREINFO_SYMBOL(mmu_vmemmap_psize); - VMCOREINFO_SYMBOL(mmu_psize_defs); - VMCOREINFO_STRUCT_SIZE(vmemmap_backing); - VMCOREINFO_OFFSET(vmemmap_backing, list); - VMCOREINFO_OFFSET(vmemmap_backing, phys); - VMCOREINFO_OFFSET(vmemmap_backing, virt_addr); - VMCOREINFO_STRUCT_SIZE(mmu_psize_def); - VMCOREINFO_OFFSET(mmu_psize_def, shift); -#endif -} - -/* - * Do not allocate memory (or fail in any way) in machine_kexec(). - * We are past the point of no return, committed to rebooting now. - */ -void machine_kexec(struct kimage *image) -{ - int save_ftrace_enabled; - - save_ftrace_enabled = __ftrace_enabled_save(); - this_cpu_disable_ftrace(); - - if (ppc_md.machine_kexec) - ppc_md.machine_kexec(image); - else - default_machine_kexec(image); - - this_cpu_enable_ftrace(); - __ftrace_enabled_restore(save_ftrace_enabled); - - /* Fall back to normal restart if we're still alive. 
*/ - machine_restart(NULL); - for(;;); -} - -void __init reserve_crashkernel(void) -{ - unsigned long long crash_size, crash_base; - int ret; - - /* use common parsing */ - ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), - &crash_size, &crash_base); - if (ret == 0 && crash_size > 0) { - crashk_res.start = crash_base; - crashk_res.end = crash_base + crash_size - 1; - } - - if (crashk_res.end == crashk_res.start) { - crashk_res.start = crashk_res.end = 0; - return; - } - - /* We might have got these values via the command line or the - * device tree, either way sanitise them now. */ - - crash_size = resource_size(&crashk_res); - -#ifndef CONFIG_NONSTATIC_KERNEL - if (crashk_res.start != KDUMP_KERNELBASE) - printk("Crash kernel location must be 0x%x\n", - KDUMP_KERNELBASE); - - crashk_res.start = KDUMP_KERNELBASE; -#else - if (!crashk_res.start) { -#ifdef CONFIG_PPC64 - /* - * On 64bit we split the RMO in half but cap it at half of - * a small SLB (128MB) since the crash kernel needs to place - * itself and some stacks to be in the first segment. 
- */ - crashk_res.start = min(0x8000000ULL, (ppc64_rma_size / 2)); -#else - crashk_res.start = KDUMP_KERNELBASE; -#endif - } - - crash_base = PAGE_ALIGN(crashk_res.start); - if (crash_base != crashk_res.start) { - printk("Crash kernel base must be aligned to 0x%lx\n", - PAGE_SIZE); - crashk_res.start = crash_base; - } - -#endif - crash_size = PAGE_ALIGN(crash_size); - crashk_res.end = crashk_res.start + crash_size - 1; - - /* The crash region must not overlap the current kernel */ - if (overlaps_crashkernel(__pa(_stext), _end - _stext)) { - printk(KERN_WARNING - "Crash kernel can not overlap current kernel\n"); - crashk_res.start = crashk_res.end = 0; - return; - } - - /* Crash kernel trumps memory limit */ - if (memory_limit && memory_limit <= crashk_res.end) { - memory_limit = crashk_res.end + 1; - printk("Adjusted memory limit for crashkernel, now 0x%llx\n", - memory_limit); - } - - printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " - "for crashkernel (System RAM: %ldMB)\n", - (unsigned long)(crash_size >> 20), - (unsigned long)(crashk_res.start >> 20), - (unsigned long)(memblock_phys_mem_size() >> 20)); - - if (!memblock_is_region_memory(crashk_res.start, crash_size) || - memblock_reserve(crashk_res.start, crash_size)) { - pr_err("Failed to reserve memory for crashkernel!\n"); - crashk_res.start = crashk_res.end = 0; - return; - } -} - -int overlaps_crashkernel(unsigned long start, unsigned long size) -{ - return (start + size) > crashk_res.start && start <= crashk_res.end; -} - -/* Values we need to export to the second kernel via the device tree. 
*/ -static phys_addr_t kernel_end; -static phys_addr_t crashk_base; -static phys_addr_t crashk_size; -static unsigned long long mem_limit; - -static struct property kernel_end_prop = { - .name = "linux,kernel-end", - .length = sizeof(phys_addr_t), - .value = &kernel_end, -}; - -static struct property crashk_base_prop = { - .name = "linux,crashkernel-base", - .length = sizeof(phys_addr_t), - .value = &crashk_base -}; - -static struct property crashk_size_prop = { - .name = "linux,crashkernel-size", - .length = sizeof(phys_addr_t), - .value = &crashk_size, -}; - -static struct property memory_limit_prop = { - .name = "linux,memory-limit", - .length = sizeof(unsigned long long), - .value = &mem_limit, -}; - -#define cpu_to_be_ulong __PASTE(cpu_to_be, BITS_PER_LONG) - -static void __init export_crashk_values(struct device_node *node) -{ - /* There might be existing crash kernel properties, but we can't - * be sure what's in them, so remove them. */ - of_remove_property(node, of_find_property(node, - "linux,crashkernel-base", NULL)); - of_remove_property(node, of_find_property(node, - "linux,crashkernel-size", NULL)); - - if (crashk_res.start != 0) { - crashk_base = cpu_to_be_ulong(crashk_res.start), - of_add_property(node, &crashk_base_prop); - crashk_size = cpu_to_be_ulong(resource_size(&crashk_res)); - of_add_property(node, &crashk_size_prop); - } - - /* - * memory_limit is required by the kexec-tools to limit the - * crash regions to the actual memory used. 
- */ - mem_limit = cpu_to_be_ulong(memory_limit); - of_update_property(node, &memory_limit_prop); -} - -static int __init kexec_setup(void) -{ - struct device_node *node; - - node = of_find_node_by_path("/chosen"); - if (!node) - return -ENOENT; - - /* remove any stale properties so ours can be found */ - of_remove_property(node, of_find_property(node, kernel_end_prop.name, NULL)); - - /* information needed by userspace when using default_machine_kexec */ - kernel_end = cpu_to_be_ulong(__pa(_end)); - of_add_property(node, &kernel_end_prop); - - export_crashk_values(node); - - of_node_put(node); - return 0; -} -late_initcall(kexec_setup); diff --git a/arch/powerpc/kernel/machine_kexec_32.c b/arch/powerpc/kernel/machine_kexec_32.c deleted file mode 100644 index bf9f1f906d64..000000000000 --- a/arch/powerpc/kernel/machine_kexec_32.c +++ /dev/null @@ -1,69 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * PPC32 code to handle Linux booting another kernel. - * - * Copyright (C) 2002-2003 Eric Biederman <ebiederm@xmission.com> - * GameCube/ppc32 port Copyright (C) 2004 Albert Herranz - * Copyright (C) 2005 IBM Corporation. - */ - -#include <linux/kexec.h> -#include <linux/mm.h> -#include <linux/string.h> -#include <asm/cacheflush.h> -#include <asm/hw_irq.h> -#include <asm/io.h> - -typedef void (*relocate_new_kernel_t)( - unsigned long indirection_page, - unsigned long reboot_code_buffer, - unsigned long start_address) __noreturn; - -/* - * This is a generic machine_kexec function suitable at least for - * non-OpenFirmware embedded platforms. - * It merely copies the image relocation code to the control page and - * jumps to it. - * A platform specific function may just call this one. 
- */ -void default_machine_kexec(struct kimage *image) -{ - extern const unsigned int relocate_new_kernel_size; - unsigned long page_list; - unsigned long reboot_code_buffer, reboot_code_buffer_phys; - relocate_new_kernel_t rnk; - - /* Interrupts aren't acceptable while we reboot */ - local_irq_disable(); - - /* mask each interrupt so we are in a more sane state for the - * kexec kernel */ - machine_kexec_mask_interrupts(); - - page_list = image->head; - - /* we need both effective and real address here */ - reboot_code_buffer = - (unsigned long)page_address(image->control_code_page); - reboot_code_buffer_phys = virt_to_phys((void *)reboot_code_buffer); - - /* copy our kernel relocation code to the control code page */ - memcpy((void *)reboot_code_buffer, relocate_new_kernel, - relocate_new_kernel_size); - - flush_icache_range(reboot_code_buffer, - reboot_code_buffer + KEXEC_CONTROL_PAGE_SIZE); - printk(KERN_INFO "Bye!\n"); - - if (!IS_ENABLED(CONFIG_FSL_BOOKE) && !IS_ENABLED(CONFIG_44x)) - relocate_new_kernel(page_list, reboot_code_buffer_phys, image->start); - - /* now call it */ - rnk = (relocate_new_kernel_t) reboot_code_buffer; - (*rnk)(page_list, reboot_code_buffer_phys, image->start); -} - -int default_machine_kexec_prepare(struct kimage *image) -{ - return 0; -} diff --git a/arch/powerpc/kernel/machine_kexec_64.c b/arch/powerpc/kernel/machine_kexec_64.c deleted file mode 100644 index 18481b0e2788..000000000000 --- a/arch/powerpc/kernel/machine_kexec_64.c +++ /dev/null @@ -1,408 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * PPC64 code to handle Linux booting another kernel. - * - * Copyright (C) 2004-2005, IBM Corp. 
- * - * Created by: Milton D Miller II - */ - - -#include <linux/kexec.h> -#include <linux/smp.h> -#include <linux/thread_info.h> -#include <linux/init_task.h> -#include <linux/errno.h> -#include <linux/kernel.h> -#include <linux/cpu.h> -#include <linux/hardirq.h> - -#include <asm/page.h> -#include <asm/current.h> -#include <asm/machdep.h> -#include <asm/cacheflush.h> -#include <asm/firmware.h> -#include <asm/paca.h> -#include <asm/mmu.h> -#include <asm/sections.h> /* _end */ -#include <asm/prom.h> -#include <asm/smp.h> -#include <asm/hw_breakpoint.h> -#include <asm/asm-prototypes.h> - -int default_machine_kexec_prepare(struct kimage *image) -{ - int i; - unsigned long begin, end; /* limits of segment */ - unsigned long low, high; /* limits of blocked memory range */ - struct device_node *node; - const unsigned long *basep; - const unsigned int *sizep; - - /* - * Since we use the kernel fault handlers and paging code to - * handle the virtual mode, we must make sure no destination - * overlaps kernel static data or bss. - */ - for (i = 0; i < image->nr_segments; i++) - if (image->segment[i].mem < __pa(_end)) - return -ETXTBSY; - - /* We also should not overwrite the tce tables */ - for_each_node_by_type(node, "pci") { - basep = of_get_property(node, "linux,tce-base", NULL); - sizep = of_get_property(node, "linux,tce-size", NULL); - if (basep == NULL || sizep == NULL) - continue; - - low = *basep; - high = low + (*sizep); - - for (i = 0; i < image->nr_segments; i++) { - begin = image->segment[i].mem; - end = begin + image->segment[i].memsz; - - if ((begin < high) && (end > low)) - return -ETXTBSY; - } - } - - return 0; -} - -static void copy_segments(unsigned long ind) -{ - unsigned long entry; - unsigned long *ptr; - void *dest; - void *addr; - - /* - * We rely on kexec_load to create a lists that properly - * initializes these pointers before they are used. - * We will still crash if the list is wrong, but at least - * the compiler will be quiet. 
- */ - ptr = NULL; - dest = NULL; - - for (entry = ind; !(entry & IND_DONE); entry = *ptr++) { - addr = __va(entry & PAGE_MASK); - - switch (entry & IND_FLAGS) { - case IND_DESTINATION: - dest = addr; - break; - case IND_INDIRECTION: - ptr = addr; - break; - case IND_SOURCE: - copy_page(dest, addr); - dest += PAGE_SIZE; - } - } -} - -void kexec_copy_flush(struct kimage *image) -{ - long i, nr_segments = image->nr_segments; - struct kexec_segment ranges[KEXEC_SEGMENT_MAX]; - - /* save the ranges on the stack to efficiently flush the icache */ - memcpy(ranges, image->segment, sizeof(ranges)); - - /* - * After this call we may not use anything allocated in dynamic - * memory, including *image. - * - * Only globals and the stack are allowed. - */ - copy_segments(image->head); - - /* - * we need to clear the icache for all dest pages sometime, - * including ones that were in place on the original copy - */ - for (i = 0; i < nr_segments; i++) - flush_icache_range((unsigned long)__va(ranges[i].mem), - (unsigned long)__va(ranges[i].mem + ranges[i].memsz)); -} - -#ifdef CONFIG_SMP - -static int kexec_all_irq_disabled = 0; - -static void kexec_smp_down(void *arg) -{ - local_irq_disable(); - hard_irq_disable(); - - mb(); /* make sure our irqs are disabled before we say they are */ - get_paca()->kexec_state = KEXEC_STATE_IRQS_OFF; - while(kexec_all_irq_disabled == 0) - cpu_relax(); - mb(); /* make sure all irqs are disabled before this */ - hw_breakpoint_disable(); - /* - * Now every CPU has IRQs off, we can clear out any pending - * IPIs and be sure that no more will come in after this. - */ - if (ppc_md.kexec_cpu_down) - ppc_md.kexec_cpu_down(0, 1); - - kexec_smp_wait(); - /* NOTREACHED */ -} - -static void kexec_prepare_cpus_wait(int wait_state) -{ - int my_cpu, i, notified=-1; - - hw_breakpoint_disable(); - my_cpu = get_cpu(); - /* Make sure each CPU has at least made it to the state we need. 
- * - * FIXME: There is a (slim) chance of a problem if not all of the CPUs - * are correctly onlined. If somehow we start a CPU on boot with RTAS - * start-cpu, but somehow that CPU doesn't write callin_cpu_map[] in - * time, the boot CPU will timeout. If it does eventually execute - * stuff, the secondary will start up (paca_ptrs[]->cpu_start was - * written) and get into a peculiar state. - * If the platform supports smp_ops->take_timebase(), the secondary CPU - * will probably be spinning in there. If not (i.e. pseries), the - * secondary will continue on and try to online itself/idle/etc. If it - * survives that, we need to find these - * possible-but-not-online-but-should-be CPUs and chaperone them into - * kexec_smp_wait(). - */ - for_each_online_cpu(i) { - if (i == my_cpu) - continue; - - while (paca_ptrs[i]->kexec_state < wait_state) { - barrier(); - if (i != notified) { - printk(KERN_INFO "kexec: waiting for cpu %d " - "(physical %d) to enter %i state\n", - i, paca_ptrs[i]->hw_cpu_id, wait_state); - notified = i; - } - } - } - mb(); -} - -/* - * We need to make sure each present CPU is online. The next kernel will scan - * the device tree and assume primary threads are online and query secondary - * threads via RTAS to online them if required. If we don't online primary - * threads, they will be stuck. However, we also online secondary threads as we - * may be using 'cede offline'. In this case RTAS doesn't see the secondary - * threads as offline -- and again, these CPUs will be stuck. - * - * So, we online all CPUs that should be running, including secondary threads. 
- */ -static void wake_offline_cpus(void) -{ - int cpu = 0; - - for_each_present_cpu(cpu) { - if (!cpu_online(cpu)) { - printk(KERN_INFO "kexec: Waking offline cpu %d.\n", - cpu); - WARN_ON(cpu_up(cpu)); - } - } -} - -static void kexec_prepare_cpus(void) -{ - wake_offline_cpus(); - smp_call_function(kexec_smp_down, NULL, /* wait */0); - local_irq_disable(); - hard_irq_disable(); - - mb(); /* make sure IRQs are disabled before we say they are */ - get_paca()->kexec_state = KEXEC_STATE_IRQS_OFF; - - kexec_prepare_cpus_wait(KEXEC_STATE_IRQS_OFF); - /* we are sure every CPU has IRQs off at this point */ - kexec_all_irq_disabled = 1; - - /* - * Before removing MMU mappings make sure all CPUs have entered real - * mode: - */ - kexec_prepare_cpus_wait(KEXEC_STATE_REAL_MODE); - - /* after we tell the others to go down */ - if (ppc_md.kexec_cpu_down) - ppc_md.kexec_cpu_down(0, 0); - - put_cpu(); -} - -#else /* ! SMP */ - -static void kexec_prepare_cpus(void) -{ - /* - * move the secondarys to us so that we can copy - * the new kernel 0-0x100 safely - * - * do this if kexec in setup.c ? - * - * We need to release the cpus if we are ever going from an - * UP to an SMP kernel. - */ - smp_release_cpus(); - if (ppc_md.kexec_cpu_down) - ppc_md.kexec_cpu_down(0, 0); - local_irq_disable(); - hard_irq_disable(); -} - -#endif /* SMP */ - -/* - * kexec thread structure and stack. - * - * We need to make sure that this is 16384-byte aligned due to the - * way process stacks are handled. It also must be statically allocated - * or allocated as part of the kimage, because everything else may be - * overwritten when we copy the kexec image. We piggyback on the - * "init_task" linker section here to statically allocate a stack. - * - * We could use a smaller stack if we don't care about anything using - * current, but that audit has not been performed. 
- */ -static union thread_union kexec_stack __init_task_data = - { }; - -/* - * For similar reasons to the stack above, the kexecing CPU needs to be on a - * static PACA; we switch to kexec_paca. - */ -struct paca_struct kexec_paca; - -/* Our assembly helper, in misc_64.S */ -extern void kexec_sequence(void *newstack, unsigned long start, - void *image, void *control, - void (*clear_all)(void), - bool copy_with_mmu_off) __noreturn; - -/* too late to fail here */ -void default_machine_kexec(struct kimage *image) -{ - bool copy_with_mmu_off; - - /* prepare control code if any */ - - /* - * If the kexec boot is the normal one, need to shutdown other cpus - * into our wait loop and quiesce interrupts. - * Otherwise, in the case of crashed mode (crashing_cpu >= 0), - * stopping other CPUs and collecting their pt_regs is done before - * using debugger IPI. - */ - - if (!kdump_in_progress()) - kexec_prepare_cpus(); - - printk("kexec: Starting switchover sequence.\n"); - - /* switch to a staticly allocated stack. Based on irq stack code. - * We setup preempt_count to avoid using VMX in memcpy. - * XXX: the task struct will likely be invalid once we do the copy! - */ - current_thread_info()->flags = 0; - current_thread_info()->preempt_count = HARDIRQ_OFFSET; - - /* We need a static PACA, too; copy this CPU's PACA over and switch to - * it. Also poison per_cpu_offset and NULL lppaca to catch anyone using - * non-static data. - */ - memcpy(&kexec_paca, get_paca(), sizeof(struct paca_struct)); - kexec_paca.data_offset = 0xedeaddeadeeeeeeeUL; -#ifdef CONFIG_PPC_PSERIES - kexec_paca.lppaca_ptr = NULL; -#endif - paca_ptrs[kexec_paca.paca_index] = &kexec_paca; - - setup_paca(&kexec_paca); - - /* - * The lppaca should be unregistered at this point so the HV won't - * touch it. In the case of a crash, none of the lppacas are - * unregistered so there is not much we can do about it here. 
- */ - - /* - * On Book3S, the copy must happen with the MMU off if we are either - * using Radix page tables or we are not in an LPAR since we can - * overwrite the page tables while copying. - * - * In an LPAR, we keep the MMU on otherwise we can't access beyond - * the RMA. On BookE there is no real MMU off mode, so we have to - * keep it enabled as well (but then we have bolted TLB entries). - */ -#ifdef CONFIG_PPC_BOOK3E - copy_with_mmu_off = false; -#else - copy_with_mmu_off = radix_enabled() || - !(firmware_has_feature(FW_FEATURE_LPAR) || - firmware_has_feature(FW_FEATURE_PS3_LV1)); -#endif - - /* Some things are best done in assembly. Finding globals with - * a toc is easier in C, so pass in what we can. - */ - kexec_sequence(&kexec_stack, image->start, image, - page_address(image->control_code_page), - mmu_cleanup_all, copy_with_mmu_off); - /* NOTREACHED */ -} - -#ifdef CONFIG_PPC_BOOK3S_64 -/* Values we need to export to the second kernel via the device tree. */ -static unsigned long htab_base; -static unsigned long htab_size; - -static struct property htab_base_prop = { - .name = "linux,htab-base", - .length = sizeof(unsigned long), - .value = &htab_base, -}; - -static struct property htab_size_prop = { - .name = "linux,htab-size", - .length = sizeof(unsigned long), - .value = &htab_size, -}; - -static int __init export_htab_values(void) -{ - struct device_node *node; - - /* On machines with no htab htab_address is NULL */ - if (!htab_address) - return -ENODEV; - - node = of_find_node_by_path("/chosen"); - if (!node) - return -ENODEV; - - /* remove any stale propertys so ours can be found */ - of_remove_property(node, of_find_property(node, htab_base_prop.name, NULL)); - of_remove_property(node, of_find_property(node, htab_size_prop.name, NULL)); - - htab_base = cpu_to_be64(__pa(htab_address)); - of_add_property(node, &htab_base_prop); - htab_size = cpu_to_be64(htab_size_bytes); - of_add_property(node, &htab_size_prop); - - of_node_put(node); - return 0; 
-} -late_initcall(export_htab_values); -#endif /* CONFIG_PPC_BOOK3S_64 */ diff --git a/arch/powerpc/kernel/machine_kexec_file_64.c b/arch/powerpc/kernel/machine_kexec_file_64.c deleted file mode 100644 index 143c91724617..000000000000 --- a/arch/powerpc/kernel/machine_kexec_file_64.c +++ /dev/null @@ -1,254 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * ppc64 code to implement the kexec_file_load syscall - * - * Copyright (C) 2004 Adam Litke (agl@us.ibm.com) - * Copyright (C) 2004 IBM Corp. - * Copyright (C) 2004,2005 Milton D Miller II, IBM Corporation - * Copyright (C) 2005 R Sharada (sharada@in.ibm.com) - * Copyright (C) 2006 Mohan Kumar M (mohan@in.ibm.com) - * Copyright (C) 2016 IBM Corporation - * - * Based on kexec-tools' kexec-elf-ppc64.c, fs2dt.c. - * Heavily modified for the kernel by - * Thiago Jung Bauermann <bauerman@linux.vnet.ibm.com>. - */ - -#include <linux/slab.h> -#include <linux/kexec.h> -#include <linux/of_fdt.h> -#include <linux/libfdt.h> -#include <asm/ima.h> - -#define SLAVE_CODE_SIZE 256 - -const struct kexec_file_ops * const kexec_file_loaders[] = { - &kexec_elf64_ops, - NULL -}; - -int arch_kexec_kernel_image_probe(struct kimage *image, void *buf, - unsigned long buf_len) -{ - /* We don't support crash kernels yet. */ - if (image->type == KEXEC_TYPE_CRASH) - return -EOPNOTSUPP; - - return kexec_image_probe_default(image, buf, buf_len); -} - -/** - * setup_purgatory - initialize the purgatory's global variables - * @image: kexec image. - * @slave_code: Slave code for the purgatory. - * @fdt: Flattened device tree for the next kernel. - * @kernel_load_addr: Address where the kernel is loaded. - * @fdt_load_addr: Address where the flattened device tree is loaded. - * - * Return: 0 on success, or negative errno on error. 
- */ -int setup_purgatory(struct kimage *image, const void *slave_code, - const void *fdt, unsigned long kernel_load_addr, - unsigned long fdt_load_addr) -{ - unsigned int *slave_code_buf, master_entry; - int ret; - - slave_code_buf = kmalloc(SLAVE_CODE_SIZE, GFP_KERNEL); - if (!slave_code_buf) - return -ENOMEM; - - /* Get the slave code from the new kernel and put it in purgatory. */ - ret = kexec_purgatory_get_set_symbol(image, "purgatory_start", - slave_code_buf, SLAVE_CODE_SIZE, - true); - if (ret) { - kfree(slave_code_buf); - return ret; - } - - master_entry = slave_code_buf[0]; - memcpy(slave_code_buf, slave_code, SLAVE_CODE_SIZE); - slave_code_buf[0] = master_entry; - ret = kexec_purgatory_get_set_symbol(image, "purgatory_start", - slave_code_buf, SLAVE_CODE_SIZE, - false); - kfree(slave_code_buf); - - ret = kexec_purgatory_get_set_symbol(image, "kernel", &kernel_load_addr, - sizeof(kernel_load_addr), false); - if (ret) - return ret; - ret = kexec_purgatory_get_set_symbol(image, "dt_offset", &fdt_load_addr, - sizeof(fdt_load_addr), false); - if (ret) - return ret; - - return 0; -} - -/** - * delete_fdt_mem_rsv - delete memory reservation with given address and size - * - * Return: 0 on success, or negative errno on error. - */ -int delete_fdt_mem_rsv(void *fdt, unsigned long start, unsigned long size) -{ - int i, ret, num_rsvs = fdt_num_mem_rsv(fdt); - - for (i = 0; i < num_rsvs; i++) { - uint64_t rsv_start, rsv_size; - - ret = fdt_get_mem_rsv(fdt, i, &rsv_start, &rsv_size); - if (ret) { - pr_err("Malformed device tree.\n"); - return -EINVAL; - } - - if (rsv_start == start && rsv_size == size) { - ret = fdt_del_mem_rsv(fdt, i); - if (ret) { - pr_err("Error deleting device tree reservation.\n"); - return -EINVAL; - } - - return 0; - } - } - - return -ENOENT; -} - -/* - * setup_new_fdt - modify /chosen and memory reservation for the next kernel - * @image: kexec image being loaded. - * @fdt: Flattened device tree for the next kernel. 
- * @initrd_load_addr: Address where the next initrd will be loaded. - * @initrd_len: Size of the next initrd, or 0 if there will be none. - * @cmdline: Command line for the next kernel, or NULL if there will - * be none. - * - * Return: 0 on success, or negative errno on error. - */ -int setup_new_fdt(const struct kimage *image, void *fdt, - unsigned long initrd_load_addr, unsigned long initrd_len, - const char *cmdline) -{ - int ret, chosen_node; - const void *prop; - - /* Remove memory reservation for the current device tree. */ - ret = delete_fdt_mem_rsv(fdt, __pa(initial_boot_params), - fdt_totalsize(initial_boot_params)); - if (ret == 0) - pr_debug("Removed old device tree reservation.\n"); - else if (ret != -ENOENT) - return ret; - - chosen_node = fdt_path_offset(fdt, "/chosen"); - if (chosen_node == -FDT_ERR_NOTFOUND) { - chosen_node = fdt_add_subnode(fdt, fdt_path_offset(fdt, "/"), - "chosen"); - if (chosen_node < 0) { - pr_err("Error creating /chosen.\n"); - return -EINVAL; - } - } else if (chosen_node < 0) { - pr_err("Malformed device tree: error reading /chosen.\n"); - return -EINVAL; - } - - /* Did we boot using an initrd? */ - prop = fdt_getprop(fdt, chosen_node, "linux,initrd-start", NULL); - if (prop) { - uint64_t tmp_start, tmp_end, tmp_size; - - tmp_start = fdt64_to_cpu(*((const fdt64_t *) prop)); - - prop = fdt_getprop(fdt, chosen_node, "linux,initrd-end", NULL); - if (!prop) { - pr_err("Malformed device tree.\n"); - return -EINVAL; - } - tmp_end = fdt64_to_cpu(*((const fdt64_t *) prop)); - - /* - * kexec reserves exact initrd size, while firmware may - * reserve a multiple of PAGE_SIZE, so check for both. 
- */ - tmp_size = tmp_end - tmp_start; - ret = delete_fdt_mem_rsv(fdt, tmp_start, tmp_size); - if (ret == -ENOENT) - ret = delete_fdt_mem_rsv(fdt, tmp_start, - round_up(tmp_size, PAGE_SIZE)); - if (ret == 0) - pr_debug("Removed old initrd reservation.\n"); - else if (ret != -ENOENT) - return ret; - - /* If there's no new initrd, delete the old initrd's info. */ - if (initrd_len == 0) { - ret = fdt_delprop(fdt, chosen_node, - "linux,initrd-start"); - if (ret) { - pr_err("Error deleting linux,initrd-start.\n"); - return -EINVAL; - } - - ret = fdt_delprop(fdt, chosen_node, "linux,initrd-end"); - if (ret) { - pr_err("Error deleting linux,initrd-end.\n"); - return -EINVAL; - } - } - } - - if (initrd_len) { - ret = fdt_setprop_u64(fdt, chosen_node, - "linux,initrd-start", - initrd_load_addr); - if (ret < 0) - goto err; - - /* initrd-end is the first address after the initrd image. */ - ret = fdt_setprop_u64(fdt, chosen_node, "linux,initrd-end", - initrd_load_addr + initrd_len); - if (ret < 0) - goto err; - - ret = fdt_add_mem_rsv(fdt, initrd_load_addr, initrd_len); - if (ret) { - pr_err("Error reserving initrd memory: %s\n", - fdt_strerror(ret)); - return -EINVAL; - } - } - - if (cmdline != NULL) { - ret = fdt_setprop_string(fdt, chosen_node, "bootargs", cmdline); - if (ret < 0) - goto err; - } else { - ret = fdt_delprop(fdt, chosen_node, "bootargs"); - if (ret && ret != -FDT_ERR_NOTFOUND) { - pr_err("Error deleting bootargs.\n"); - return -EINVAL; - } - } - - ret = setup_ima_buffer(image, fdt, chosen_node); - if (ret) { - pr_err("Error setting up the new device tree.\n"); - return ret; - } - - ret = fdt_setprop(fdt, chosen_node, "linux,booted-from-kexec", NULL, 0); - if (ret) - goto err; - - return 0; - -err: - pr_err("Error setting up the new device tree.\n"); - return -EINVAL; -} diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index b18df633eae9..34c1001e9e8b 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -33,13 +33,18 @@ 
static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_ue_event_queue); static void machine_check_process_queued_event(struct irq_work *work); -void machine_check_ue_event(struct machine_check_event *evt); +static void machine_check_ue_irq_work(struct irq_work *work); +static void machine_check_ue_event(struct machine_check_event *evt); static void machine_process_ue_event(struct work_struct *work); static struct irq_work mce_event_process_work = { .func = machine_check_process_queued_event, }; +static struct irq_work mce_ue_event_irq_work = { + .func = machine_check_ue_irq_work, +}; + DECLARE_WORK(mce_ue_event_work, machine_process_ue_event); static void mce_set_error_info(struct machine_check_event *mce, @@ -144,6 +149,7 @@ void save_mce_event(struct pt_regs *regs, long handled, if (phys_addr != ULONG_MAX) { mce->u.ue_error.physical_address_provided = true; mce->u.ue_error.physical_address = phys_addr; + mce->u.ue_error.ignore_event = mce_err->ignore_event; machine_check_ue_event(mce); } } @@ -199,11 +205,15 @@ void release_mce_event(void) get_mce_event(NULL, true); } +static void machine_check_ue_irq_work(struct irq_work *work) +{ + schedule_work(&mce_ue_event_work); +} /* * Queue up the MCE event which then can be handled later. */ -void machine_check_ue_event(struct machine_check_event *evt) +static void machine_check_ue_event(struct machine_check_event *evt) { int index; @@ -216,7 +226,7 @@ void machine_check_ue_event(struct machine_check_event *evt) memcpy(this_cpu_ptr(&mce_ue_event_queue[index]), evt, sizeof(*evt)); /* Queue work to process this event later. */ - schedule_work(&mce_ue_event_work); + irq_work_queue(&mce_ue_event_irq_work); } /* @@ -257,8 +267,17 @@ static void machine_process_ue_event(struct work_struct *work) /* * This should probably queued elsewhere, but * oh! 
well + * + * Don't report this machine check because the caller has a + * asked us to ignore the event, it has a fixup handler which + * will do the appropriate error handling and reporting. */ if (evt->error_type == MCE_ERROR_TYPE_UE) { + if (evt->u.ue_error.ignore_event) { + __this_cpu_dec(mce_ue_count); + continue; + } + if (evt->u.ue_error.physical_address_provided) { unsigned long pfn; @@ -292,6 +311,12 @@ static void machine_check_process_queued_event(struct irq_work *work) while (__this_cpu_read(mce_queue_count) > 0) { index = __this_cpu_read(mce_queue_count) - 1; evt = this_cpu_ptr(&mce_event_queue[index]); + + if (evt->error_type == MCE_ERROR_TYPE_UE && + evt->u.ue_error.ignore_event) { + __this_cpu_dec(mce_queue_count); + continue; + } machine_check_print_event_info(evt, false, false); __this_cpu_dec(mce_queue_count); } @@ -300,7 +325,7 @@ static void machine_check_process_queued_event(struct irq_work *work) void machine_check_print_event_info(struct machine_check_event *evt, bool user_mode, bool in_guest) { - const char *level, *sevstr, *subtype, *err_type; + const char *level, *sevstr, *subtype, *err_type, *initiator; uint64_t ea = 0, pa = 0; int n = 0; char dar_str[50]; @@ -385,6 +410,28 @@ void machine_check_print_event_info(struct machine_check_event *evt, break; } + switch(evt->initiator) { + case MCE_INITIATOR_CPU: + initiator = "CPU"; + break; + case MCE_INITIATOR_PCI: + initiator = "PCI"; + break; + case MCE_INITIATOR_ISA: + initiator = "ISA"; + break; + case MCE_INITIATOR_MEMORY: + initiator = "Memory"; + break; + case MCE_INITIATOR_POWERMGM: + initiator = "Power Management"; + break; + case MCE_INITIATOR_UNKNOWN: + default: + initiator = "Unknown"; + break; + } + switch (evt->error_type) { case MCE_ERROR_TYPE_UE: err_type = "UE"; @@ -451,6 +498,14 @@ void machine_check_print_event_info(struct machine_check_event *evt, if (evt->u.link_error.effective_address_provided) ea = evt->u.link_error.effective_address; break; + case MCE_ERROR_TYPE_DCACHE: 
+ err_type = "D-Cache"; + subtype = "Unknown"; + break; + case MCE_ERROR_TYPE_ICACHE: + err_type = "I-Cache"; + subtype = "Unknown"; + break; default: case MCE_ERROR_TYPE_UNKNOWN: err_type = "Unknown"; @@ -483,9 +538,17 @@ void machine_check_print_event_info(struct machine_check_event *evt, level, evt->cpu, evt->srr0, (void *)evt->srr0, pa_str); } + printk("%sMCE: CPU%d: Initiator %s\n", level, evt->cpu, initiator); + subtype = evt->error_class < ARRAY_SIZE(mc_error_class) ? mc_error_class[evt->error_class] : "Unknown"; printk("%sMCE: CPU%d: %s\n", level, evt->cpu, subtype); + +#ifdef CONFIG_PPC_BOOK3S_64 + /* Display faulty slb contents for SLB errors. */ + if (evt->error_type == MCE_ERROR_TYPE_SLB) + slb_dump_contents(local_paca->mce_faulty_slbs); +#endif } EXPORT_SYMBOL_GPL(machine_check_print_event_info); diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c index a814d2dfb5b0..1cbf7f1a4e3d 100644 --- a/arch/powerpc/kernel/mce_power.c +++ b/arch/powerpc/kernel/mce_power.c @@ -11,6 +11,7 @@ #include <linux/types.h> #include <linux/ptrace.h> +#include <linux/extable.h> #include <asm/mmu.h> #include <asm/mce.h> #include <asm/machdep.h> @@ -18,6 +19,7 @@ #include <asm/pte-walk.h> #include <asm/sstep.h> #include <asm/exception-64s.h> +#include <asm/extable.h> /* * Convert an address related to an mm to a PFN. 
NOTE: we are in real @@ -26,7 +28,8 @@ unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr) { pte_t *ptep; - unsigned long flags; + unsigned int shift; + unsigned long pfn, flags; struct mm_struct *mm; if (user_mode(regs)) @@ -35,14 +38,23 @@ unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr) mm = &init_mm; local_irq_save(flags); - if (mm == current->mm) - ptep = find_current_mm_pte(mm->pgd, addr, NULL, NULL); - else - ptep = find_init_mm_pte(addr, NULL); + ptep = __find_linux_pte(mm->pgd, addr, NULL, &shift); + + if (!ptep || pte_special(*ptep)) { + pfn = ULONG_MAX; + goto out; + } + + if (shift <= PAGE_SHIFT) + pfn = pte_pfn(*ptep); + else { + unsigned long rpnmask = (1ul << shift) - PAGE_SIZE; + pfn = pte_pfn(__pte(pte_val(*ptep) | (addr & rpnmask))); + } + +out: local_irq_restore(flags); - if (!ptep || pte_special(*ptep)) - return ULONG_MAX; - return pte_pfn(*ptep); + return pfn; } /* flush SLBs and reload */ @@ -344,7 +356,7 @@ static const struct mce_derror_table mce_p9_derror_table[] = { MCE_INITIATOR_CPU, MCE_SEV_SEVERE, true }, { 0, false, 0, 0, 0, 0, 0 } }; -static int mce_find_instr_ea_and_pfn(struct pt_regs *regs, uint64_t *addr, +static int mce_find_instr_ea_and_phys(struct pt_regs *regs, uint64_t *addr, uint64_t *phys_addr) { /* @@ -397,6 +409,8 @@ static int mce_handle_ierror(struct pt_regs *regs, /* attempt to correct the error */ switch (table[i].error_type) { case MCE_ERROR_TYPE_SLB: + if (local_paca->in_mce == 1) + slb_save_contents(local_paca->mce_faulty_slbs); handled = mce_flush(MCE_FLUSH_SLB); break; case MCE_ERROR_TYPE_ERAT: @@ -482,6 +496,8 @@ static int mce_handle_derror(struct pt_regs *regs, /* attempt to correct the error */ switch (table[i].error_type) { case MCE_ERROR_TYPE_SLB: + if (local_paca->in_mce == 1) + slb_save_contents(local_paca->mce_faulty_slbs); if (mce_flush(MCE_FLUSH_SLB)) handled = 1; break; @@ -541,7 +557,8 @@ static int mce_handle_derror(struct pt_regs *regs, * kernel/exception-64s.h */ 
if (get_paca()->in_mce < MAX_MCE_DEPTH) - mce_find_instr_ea_and_pfn(regs, addr, phys_addr); + mce_find_instr_ea_and_phys(regs, addr, + phys_addr); } found = 1; } @@ -558,9 +575,18 @@ static int mce_handle_derror(struct pt_regs *regs, return 0; } -static long mce_handle_ue_error(struct pt_regs *regs) +static long mce_handle_ue_error(struct pt_regs *regs, + struct mce_error_info *mce_err) { long handled = 0; + const struct exception_table_entry *entry; + + entry = search_kernel_exception_table(regs->nip); + if (entry) { + mce_err->ignore_event = true; + regs->nip = extable_fixup(entry); + return 1; + } /* * On specific SCOM read via MMIO we may get a machine check @@ -593,7 +619,7 @@ static long mce_handle_error(struct pt_regs *regs, &phys_addr); if (!handled && mce_err.error_type == MCE_ERROR_TYPE_UE) - handled = mce_handle_ue_error(regs); + handled = mce_handle_ue_error(regs, &mce_err); save_mce_event(regs, handled, &mce_err, regs->nip, addr, phys_addr); diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S index fe4bd321730e..d80212be8698 100644 --- a/arch/powerpc/kernel/misc_32.S +++ b/arch/powerpc/kernel/misc_32.S @@ -6,11 +6,6 @@ * Largely rewritten by Cort Dougan (cort@cs.nmt.edu) * and Paul Mackerras. * - * kexec bits: - * Copyright (C) 2002-2003 Eric Biederman <ebiederm@xmission.com> - * GameCube/ppc32 port Copyright (C) 2004 Albert Herranz - * PPC44x port. 
Copyright (C) 2011, IBM Corporation - * Author: Suzuki Poulose <suzuki@in.ibm.com> */ #include <linux/sys.h> @@ -25,7 +20,6 @@ #include <asm/thread_info.h> #include <asm/asm-offsets.h> #include <asm/processor.h> -#include <asm/kexec.h> #include <asm/bug.h> #include <asm/ptrace.h> #include <asm/export.h> @@ -292,22 +286,20 @@ _GLOBAL(flush_instruction_cache) iccci 0,r3 #endif #elif defined(CONFIG_FSL_BOOKE) -BEGIN_FTR_SECTION +#ifdef CONFIG_E200 mfspr r3,SPRN_L1CSR0 ori r3,r3,L1CSR0_CFI|L1CSR0_CLFC /* msync; isync recommended here */ mtspr SPRN_L1CSR0,r3 isync blr -END_FTR_SECTION_IFSET(CPU_FTR_UNIFIED_ID_CACHE) +#endif mfspr r3,SPRN_L1CSR1 ori r3,r3,L1CSR1_ICFI|L1CSR1_ICLFR mtspr SPRN_L1CSR1,r3 +#elif defined(CONFIG_PPC_BOOK3S_601) + blr /* for 601, do nothing */ #else - mfspr r3,SPRN_PVR - rlwinm r3,r3,16,16,31 - cmpwi 0,r3,1 - beqlr /* for 601, do nothing */ /* 603/604 processor - use invalidate-all bit in HID0 */ mfspr r3,SPRN_HID0 ori r3,r3,HID0_ICFI @@ -319,123 +311,6 @@ EXPORT_SYMBOL(flush_instruction_cache) #endif /* CONFIG_PPC_8xx */ /* - * Write any modified data cache blocks out to memory - * and invalidate the corresponding instruction cache blocks. - * This is a no-op on the 601. - * - * flush_icache_range(unsigned long start, unsigned long stop) - */ -_GLOBAL(flush_icache_range) -BEGIN_FTR_SECTION - PURGE_PREFETCHED_INS - blr /* for 601, do nothing */ -END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE) - rlwinm r3,r3,0,0,31 - L1_CACHE_SHIFT - subf r4,r3,r4 - addi r4,r4,L1_CACHE_BYTES - 1 - srwi. r4,r4,L1_CACHE_SHIFT - beqlr - mtctr r4 - mr r6,r3 -1: dcbst 0,r3 - addi r3,r3,L1_CACHE_BYTES - bdnz 1b - sync /* wait for dcbst's to get to ram */ -#ifndef CONFIG_44x - mtctr r4 -2: icbi 0,r6 - addi r6,r6,L1_CACHE_BYTES - bdnz 2b -#else - /* Flash invalidate on 44x because we are passed kmapped addresses and - this doesn't work for userspace pages due to the virtually tagged - icache. Sigh. 
*/ - iccci 0, r0 -#endif - sync /* additional sync needed on g4 */ - isync - blr -_ASM_NOKPROBE_SYMBOL(flush_icache_range) -EXPORT_SYMBOL(flush_icache_range) - -/* - * Flush a particular page from the data cache to RAM. - * Note: this is necessary because the instruction cache does *not* - * snoop from the data cache. - * This is a no-op on the 601 which has a unified cache. - * - * void __flush_dcache_icache(void *page) - */ -_GLOBAL(__flush_dcache_icache) -BEGIN_FTR_SECTION - PURGE_PREFETCHED_INS - blr -END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE) - rlwinm r3,r3,0,0,31-PAGE_SHIFT /* Get page base address */ - li r4,PAGE_SIZE/L1_CACHE_BYTES /* Number of lines in a page */ - mtctr r4 - mr r6,r3 -0: dcbst 0,r3 /* Write line to ram */ - addi r3,r3,L1_CACHE_BYTES - bdnz 0b - sync -#ifdef CONFIG_44x - /* We don't flush the icache on 44x. Those have a virtual icache - * and we don't have access to the virtual address here (it's - * not the page vaddr but where it's mapped in user space). The - * flushing of the icache on these is handled elsewhere, when - * a change in the address space occurs, before returning to - * user space - */ -BEGIN_MMU_FTR_SECTION - blr -END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_44x) -#endif /* CONFIG_44x */ - mtctr r4 -1: icbi 0,r6 - addi r6,r6,L1_CACHE_BYTES - bdnz 1b - sync - isync - blr - -#ifndef CONFIG_BOOKE -/* - * Flush a particular page from the data cache to RAM, identified - * by its physical address. We turn off the MMU so we can just use - * the physical address (this may be a highmem page without a kernel - * mapping). 
- * - * void __flush_dcache_icache_phys(unsigned long physaddr) - */ -_GLOBAL(__flush_dcache_icache_phys) -BEGIN_FTR_SECTION - PURGE_PREFETCHED_INS - blr /* for 601, do nothing */ -END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE) - mfmsr r10 - rlwinm r0,r10,0,28,26 /* clear DR */ - mtmsr r0 - isync - rlwinm r3,r3,0,0,31-PAGE_SHIFT /* Get page base address */ - li r4,PAGE_SIZE/L1_CACHE_BYTES /* Number of lines in a page */ - mtctr r4 - mr r6,r3 -0: dcbst 0,r3 /* Write line to ram */ - addi r3,r3,L1_CACHE_BYTES - bdnz 0b - sync - mtctr r4 -1: icbi 0,r6 - addi r6,r6,L1_CACHE_BYTES - bdnz 1b - sync - mtmsr r10 /* restore DR */ - isync - blr -#endif /* CONFIG_BOOKE */ - -/* * Copy a whole page. We use the dcbz instruction on the destination * to reduce memory traffic (it eliminates the unnecessary reads of * the destination into cache). This requires that the destination @@ -452,7 +327,12 @@ END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE) stwu r9,16(r3) _GLOBAL(copy_page) + rlwinm r5, r3, 0, L1_CACHE_BYTES - 1 addi r3,r3,-4 + +0: twnei r5, 0 /* WARN if r3 is not cache aligned */ + EMIT_BUG_ENTRY 0b,__FILE__,__LINE__, BUGFLAG_WARNING + addi r4,r4,-4 li r5,4 @@ -608,488 +488,3 @@ _GLOBAL(start_secondary_resume) */ _GLOBAL(__main) blr - -#ifdef CONFIG_KEXEC_CORE - /* - * Must be relocatable PIC code callable as a C function. 
- */ - .globl relocate_new_kernel -relocate_new_kernel: - /* r3 = page_list */ - /* r4 = reboot_code_buffer */ - /* r5 = start_address */ - -#ifdef CONFIG_FSL_BOOKE - - mr r29, r3 - mr r30, r4 - mr r31, r5 - -#define ENTRY_MAPPING_KEXEC_SETUP -#include "fsl_booke_entry_mapping.S" -#undef ENTRY_MAPPING_KEXEC_SETUP - - mr r3, r29 - mr r4, r30 - mr r5, r31 - - li r0, 0 -#elif defined(CONFIG_44x) - - /* Save our parameters */ - mr r29, r3 - mr r30, r4 - mr r31, r5 - -#ifdef CONFIG_PPC_47x - /* Check for 47x cores */ - mfspr r3,SPRN_PVR - srwi r3,r3,16 - cmplwi cr0,r3,PVR_476FPE@h - beq setup_map_47x - cmplwi cr0,r3,PVR_476@h - beq setup_map_47x - cmplwi cr0,r3,PVR_476_ISS@h - beq setup_map_47x -#endif /* CONFIG_PPC_47x */ - -/* - * Code for setting up 1:1 mapping for PPC440x for KEXEC - * - * We cannot switch off the MMU on PPC44x. - * So we: - * 1) Invalidate all the mappings except the one we are running from. - * 2) Create a tmp mapping for our code in the other address space(TS) and - * jump to it. Invalidate the entry we started in. - * 3) Create a 1:1 mapping for 0-2GiB in chunks of 256M in original TS. - * 4) Jump to the 1:1 mapping in original TS. - * 5) Invalidate the tmp mapping. - * - * - Based on the kexec support code for FSL BookE - * - */ - - /* - * Load the PID with kernel PID (0). - * Also load our MSR_IS and TID to MMUCR for TLB search. - */ - li r3, 0 - mtspr SPRN_PID, r3 - mfmsr r4 - andi. r4,r4,MSR_IS@l - beq wmmucr - oris r3,r3,PPC44x_MMUCR_STS@h -wmmucr: - mtspr SPRN_MMUCR,r3 - sync - - /* - * Invalidate all the TLB entries except the current entry - * where we are running from - */ - bl 0f /* Find our address */ -0: mflr r5 /* Make it accessible */ - tlbsx r23,0,r5 /* Find entry we are in */ - li r4,0 /* Start at TLB entry 0 */ - li r3,0 /* Set PAGEID inval value */ -1: cmpw r23,r4 /* Is this our entry? 
*/ - beq skip /* If so, skip the inval */ - tlbwe r3,r4,PPC44x_TLB_PAGEID /* If not, inval the entry */ -skip: - addi r4,r4,1 /* Increment */ - cmpwi r4,64 /* Are we done? */ - bne 1b /* If not, repeat */ - isync - - /* Create a temp mapping and jump to it */ - andi. r6, r23, 1 /* Find the index to use */ - addi r24, r6, 1 /* r24 will contain 1 or 2 */ - - mfmsr r9 /* get the MSR */ - rlwinm r5, r9, 27, 31, 31 /* Extract the MSR[IS] */ - xori r7, r5, 1 /* Use the other address space */ - - /* Read the current mapping entries */ - tlbre r3, r23, PPC44x_TLB_PAGEID - tlbre r4, r23, PPC44x_TLB_XLAT - tlbre r5, r23, PPC44x_TLB_ATTRIB - - /* Save our current XLAT entry */ - mr r25, r4 - - /* Extract the TLB PageSize */ - li r10, 1 /* r10 will hold PageSize */ - rlwinm r11, r3, 0, 24, 27 /* bits 24-27 */ - - /* XXX: As of now we use 256M, 4K pages */ - cmpwi r11, PPC44x_TLB_256M - bne tlb_4k - rotlwi r10, r10, 28 /* r10 = 256M */ - b write_out -tlb_4k: - cmpwi r11, PPC44x_TLB_4K - bne default - rotlwi r10, r10, 12 /* r10 = 4K */ - b write_out -default: - rotlwi r10, r10, 10 /* r10 = 1K */ - -write_out: - /* - * Write out the tmp 1:1 mapping for this code in other address space - * Fixup EPN = RPN , TS=other address space - */ - insrwi r3, r7, 1, 23 /* Bit 23 is TS for PAGEID field */ - - /* Write out the tmp mapping entries */ - tlbwe r3, r24, PPC44x_TLB_PAGEID - tlbwe r4, r24, PPC44x_TLB_XLAT - tlbwe r5, r24, PPC44x_TLB_ATTRIB - - subi r11, r10, 1 /* PageOffset Mask = PageSize - 1 */ - not r10, r11 /* Mask for PageNum */ - - /* Switch to other address space in MSR */ - insrwi r9, r7, 1, 26 /* Set MSR[IS] = r7 */ - - bl 1f -1: mflr r8 - addi r8, r8, (2f-1b) /* Find the target offset */ - - /* Jump to the tmp mapping */ - mtspr SPRN_SRR0, r8 - mtspr SPRN_SRR1, r9 - rfi - -2: - /* Invalidate the entry we were executing from */ - li r3, 0 - tlbwe r3, r23, PPC44x_TLB_PAGEID - - /* attribute fields. 
rwx for SUPERVISOR mode */ - li r5, 0 - ori r5, r5, (PPC44x_TLB_SW | PPC44x_TLB_SR | PPC44x_TLB_SX | PPC44x_TLB_G) - - /* Create 1:1 mapping in 256M pages */ - xori r7, r7, 1 /* Revert back to Original TS */ - - li r8, 0 /* PageNumber */ - li r6, 3 /* TLB Index, start at 3 */ - -next_tlb: - rotlwi r3, r8, 28 /* Create EPN (bits 0-3) */ - mr r4, r3 /* RPN = EPN */ - ori r3, r3, (PPC44x_TLB_VALID | PPC44x_TLB_256M) /* SIZE = 256M, Valid */ - insrwi r3, r7, 1, 23 /* Set TS from r7 */ - - tlbwe r3, r6, PPC44x_TLB_PAGEID /* PageID field : EPN, V, SIZE */ - tlbwe r4, r6, PPC44x_TLB_XLAT /* Address translation : RPN */ - tlbwe r5, r6, PPC44x_TLB_ATTRIB /* Attributes */ - - addi r8, r8, 1 /* Increment PN */ - addi r6, r6, 1 /* Increment TLB Index */ - cmpwi r8, 8 /* Are we done ? */ - bne next_tlb - isync - - /* Jump to the new mapping 1:1 */ - li r9,0 - insrwi r9, r7, 1, 26 /* Set MSR[IS] = r7 */ - - bl 1f -1: mflr r8 - and r8, r8, r11 /* Get our offset within page */ - addi r8, r8, (2f-1b) - - and r5, r25, r10 /* Get our target PageNum */ - or r8, r8, r5 /* Target jump address */ - - mtspr SPRN_SRR0, r8 - mtspr SPRN_SRR1, r9 - rfi -2: - /* Invalidate the tmp entry we used */ - li r3, 0 - tlbwe r3, r24, PPC44x_TLB_PAGEID - sync - b ppc44x_map_done - -#ifdef CONFIG_PPC_47x - - /* 1:1 mapping for 47x */ - -setup_map_47x: - - /* - * Load the kernel pid (0) to PID and also to MMUCR[TID]. - * Also set the MSR IS->MMUCR STS - */ - li r3, 0 - mtspr SPRN_PID, r3 /* Set PID */ - mfmsr r4 /* Get MSR */ - andi. r4, r4, MSR_IS@l /* TS=1? */ - beq 1f /* If not, leave STS=0 */ - oris r3, r3, PPC47x_MMUCR_STS@h /* Set STS=1 */ -1: mtspr SPRN_MMUCR, r3 /* Put MMUCR */ - sync - - /* Find the entry we are running from */ - bl 2f -2: mflr r23 - tlbsx r23, 0, r23 - tlbre r24, r23, 0 /* TLB Word 0 */ - tlbre r25, r23, 1 /* TLB Word 1 */ - tlbre r26, r23, 2 /* TLB Word 2 */ - - - /* - * Invalidates all the tlb entries by writing to 256 RPNs(r4) - * of 4k page size in all 4 ways (0-3 in r3). 
- * This would invalidate the entire UTLB including the one we are - * running from. However the shadow TLB entries would help us - * to continue the execution, until we flush them (rfi/isync). - */ - addis r3, 0, 0x8000 /* specify the way */ - addi r4, 0, 0 /* TLB Word0 = (EPN=0, VALID = 0) */ - addi r5, 0, 0 - b clear_utlb_entry - - /* Align the loop to speed things up. from head_44x.S */ - .align 6 - -clear_utlb_entry: - - tlbwe r4, r3, 0 - tlbwe r5, r3, 1 - tlbwe r5, r3, 2 - addis r3, r3, 0x2000 /* Increment the way */ - cmpwi r3, 0 - bne clear_utlb_entry - addis r3, 0, 0x8000 - addis r4, r4, 0x100 /* Increment the EPN */ - cmpwi r4, 0 - bne clear_utlb_entry - - /* Create the entries in the other address space */ - mfmsr r5 - rlwinm r7, r5, 27, 31, 31 /* Get the TS (Bit 26) from MSR */ - xori r7, r7, 1 /* r7 = !TS */ - - insrwi r24, r7, 1, 21 /* Change the TS in the saved TLB word 0 */ - - /* - * write out the TLB entries for the tmp mapping - * Use way '0' so that we could easily invalidate it later. - */ - lis r3, 0x8000 /* Way '0' */ - - tlbwe r24, r3, 0 - tlbwe r25, r3, 1 - tlbwe r26, r3, 2 - - /* Update the msr to the new TS */ - insrwi r5, r7, 1, 26 - - bl 1f -1: mflr r6 - addi r6, r6, (2f-1b) - - mtspr SPRN_SRR0, r6 - mtspr SPRN_SRR1, r5 - rfi - - /* - * Now we are in the tmp address space. - * Create a 1:1 mapping for 0-2GiB in the original TS. - */ -2: - li r3, 0 - li r4, 0 /* TLB Word 0 */ - li r5, 0 /* TLB Word 1 */ - li r6, 0 - ori r6, r6, PPC47x_TLB2_S_RWX /* TLB word 2 */ - - li r8, 0 /* PageIndex */ - - xori r7, r7, 1 /* revert back to original TS */ - -write_utlb: - rotlwi r5, r8, 28 /* RPN = PageIndex * 256M */ - /* ERPN = 0 as we don't use memory above 2G */ - - mr r4, r5 /* EPN = RPN */ - ori r4, r4, (PPC47x_TLB0_VALID | PPC47x_TLB0_256M) - insrwi r4, r7, 1, 21 /* Insert the TS to Word 0 */ - - tlbwe r4, r3, 0 /* Write out the entries */ - tlbwe r5, r3, 1 - tlbwe r6, r3, 2 - addi r8, r8, 1 - cmpwi r8, 8 /* Have we completed ? 
*/ - bne write_utlb - - /* make sure we complete the TLB write up */ - isync - - /* - * Prepare to jump to the 1:1 mapping. - * 1) Extract page size of the tmp mapping - * DSIZ = TLB_Word0[22:27] - * 2) Calculate the physical address of the address - * to jump to. - */ - rlwinm r10, r24, 0, 22, 27 - - cmpwi r10, PPC47x_TLB0_4K - bne 0f - li r10, 0x1000 /* r10 = 4k */ - bl 1f - -0: - /* Defaults to 256M */ - lis r10, 0x1000 - - bl 1f -1: mflr r4 - addi r4, r4, (2f-1b) /* virtual address of 2f */ - - subi r11, r10, 1 /* offsetmask = Pagesize - 1 */ - not r10, r11 /* Pagemask = ~(offsetmask) */ - - and r5, r25, r10 /* Physical page */ - and r6, r4, r11 /* offset within the current page */ - - or r5, r5, r6 /* Physical address for 2f */ - - /* Switch the TS in MSR to the original one */ - mfmsr r8 - insrwi r8, r7, 1, 26 - - mtspr SPRN_SRR1, r8 - mtspr SPRN_SRR0, r5 - rfi - -2: - /* Invalidate the tmp mapping */ - lis r3, 0x8000 /* Way '0' */ - - clrrwi r24, r24, 12 /* Clear the valid bit */ - tlbwe r24, r3, 0 - tlbwe r25, r3, 1 - tlbwe r26, r3, 2 - - /* Make sure we complete the TLB write and flush the shadow TLB */ - isync - -#endif - -ppc44x_map_done: - - - /* Restore the parameters */ - mr r3, r29 - mr r4, r30 - mr r5, r31 - - li r0, 0 -#else - li r0, 0 - - /* - * Set Machine Status Register to a known status, - * switch the MMU off and jump to 1: in a single step. - */ - - mr r8, r0 - ori r8, r8, MSR_RI|MSR_ME - mtspr SPRN_SRR1, r8 - addi r8, r4, 1f - relocate_new_kernel - mtspr SPRN_SRR0, r8 - sync - rfi - -1: -#endif - /* from this point address translation is turned off */ - /* and interrupts are disabled */ - - /* set a new stack at the bottom of our page... */ - /* (not really needed now) */ - addi r1, r4, KEXEC_CONTROL_PAGE_SIZE - 8 /* for LR Save+Back Chain */ - stw r0, 0(r1) - - /* Do the copies */ - li r6, 0 /* checksum */ - mr r0, r3 - b 1f - -0: /* top, read another word for the indirection page */ - lwzu r0, 4(r3) - -1: - /* is it a destination page? 
(r8) */ - rlwinm. r7, r0, 0, 31, 31 /* IND_DESTINATION (1<<0) */ - beq 2f - - rlwinm r8, r0, 0, 0, 19 /* clear kexec flags, page align */ - b 0b - -2: /* is it an indirection page? (r3) */ - rlwinm. r7, r0, 0, 30, 30 /* IND_INDIRECTION (1<<1) */ - beq 2f - - rlwinm r3, r0, 0, 0, 19 /* clear kexec flags, page align */ - subi r3, r3, 4 - b 0b - -2: /* are we done? */ - rlwinm. r7, r0, 0, 29, 29 /* IND_DONE (1<<2) */ - beq 2f - b 3f - -2: /* is it a source page? (r9) */ - rlwinm. r7, r0, 0, 28, 28 /* IND_SOURCE (1<<3) */ - beq 0b - - rlwinm r9, r0, 0, 0, 19 /* clear kexec flags, page align */ - - li r7, PAGE_SIZE / 4 - mtctr r7 - subi r9, r9, 4 - subi r8, r8, 4 -9: - lwzu r0, 4(r9) /* do the copy */ - xor r6, r6, r0 - stwu r0, 4(r8) - dcbst 0, r8 - sync - icbi 0, r8 - bdnz 9b - - addi r9, r9, 4 - addi r8, r8, 4 - b 0b - -3: - - /* To be certain of avoiding problems with self-modifying code - * execute a serializing instruction here. - */ - isync - sync - - mfspr r3, SPRN_PIR /* current core we are running on */ - mr r4, r5 /* load physical address of chunk called */ - - /* jump to the entry point, usually the setup routine */ - mtlr r5 - blrl - -1: b 1b - -relocate_new_kernel_end: - - .globl relocate_new_kernel_size -relocate_new_kernel_size: - .long relocate_new_kernel_end - relocate_new_kernel -#endif diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S index b55a7b4cb543..1864605eca29 100644 --- a/arch/powerpc/kernel/misc_64.S +++ b/arch/powerpc/kernel/misc_64.S @@ -49,108 +49,6 @@ _GLOBAL(call_do_irq) mtlr r0 blr - .section ".toc","aw" -PPC64_CACHES: - .tc ppc64_caches[TC],ppc64_caches - .section ".text" - -/* - * Write any modified data cache blocks out to memory - * and invalidate the corresponding instruction cache blocks. 
- * - * flush_icache_range(unsigned long start, unsigned long stop) - * - * flush all bytes from start through stop-1 inclusive - */ - -_GLOBAL_TOC(flush_icache_range) -BEGIN_FTR_SECTION - PURGE_PREFETCHED_INS - blr -END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE) -/* - * Flush the data cache to memory - * - * Different systems have different cache line sizes - * and in some cases i-cache and d-cache line sizes differ from - * each other. - */ - ld r10,PPC64_CACHES@toc(r2) - lwz r7,DCACHEL1BLOCKSIZE(r10)/* Get cache block size */ - addi r5,r7,-1 - andc r6,r3,r5 /* round low to line bdy */ - subf r8,r6,r4 /* compute length */ - add r8,r8,r5 /* ensure we get enough */ - lwz r9,DCACHEL1LOGBLOCKSIZE(r10) /* Get log-2 of cache block size */ - srw. r8,r8,r9 /* compute line count */ - beqlr /* nothing to do? */ - mtctr r8 -1: dcbst 0,r6 - add r6,r6,r7 - bdnz 1b - sync - -/* Now invalidate the instruction cache */ - - lwz r7,ICACHEL1BLOCKSIZE(r10) /* Get Icache block size */ - addi r5,r7,-1 - andc r6,r3,r5 /* round low to line bdy */ - subf r8,r6,r4 /* compute length */ - add r8,r8,r5 - lwz r9,ICACHEL1LOGBLOCKSIZE(r10) /* Get log-2 of Icache block size */ - srw. r8,r8,r9 /* compute line count */ - beqlr /* nothing to do? */ - mtctr r8 -2: icbi 0,r6 - add r6,r6,r7 - bdnz 2b - isync - blr -_ASM_NOKPROBE_SYMBOL(flush_icache_range) -EXPORT_SYMBOL(flush_icache_range) - -/* - * Flush a particular page from the data cache to RAM. - * Note: this is necessary because the instruction cache does *not* - * snoop from the data cache. 
- * - * void __flush_dcache_icache(void *page) - */ -_GLOBAL(__flush_dcache_icache) -/* - * Flush the data cache to memory - * - * Different systems have different cache line sizes - */ - -BEGIN_FTR_SECTION - PURGE_PREFETCHED_INS - blr -END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE) - -/* Flush the dcache */ - ld r7,PPC64_CACHES@toc(r2) - clrrdi r3,r3,PAGE_SHIFT /* Page align */ - lwz r4,DCACHEL1BLOCKSPERPAGE(r7) /* Get # dcache blocks per page */ - lwz r5,DCACHEL1BLOCKSIZE(r7) /* Get dcache block size */ - mr r6,r3 - mtctr r4 -0: dcbst 0,r6 - add r6,r6,r5 - bdnz 0b - sync - -/* Now invalidate the icache */ - - lwz r4,ICACHEL1BLOCKSPERPAGE(r7) /* Get # icache blocks per page */ - lwz r5,ICACHEL1BLOCKSIZE(r7) /* Get icache block size */ - mtctr r4 -1: icbi 0,r3 - add r3,r3,r5 - bdnz 1b - isync - blr - _GLOBAL(__bswapdi2) EXPORT_SYMBOL(__bswapdi2) srdi r8,r3,32 @@ -432,18 +330,13 @@ kexec_create_tlb: rlwimi r9,r10,16,4,15 /* Setup MAS0 = TLBSEL | ESEL(r9) */ /* Set up a temp identity mapping v:0 to p:0 and return to it. */ -#if defined(CONFIG_SMP) || defined(CONFIG_PPC_E500MC) -#define M_IF_NEEDED MAS2_M -#else -#define M_IF_NEEDED 0 -#endif mtspr SPRN_MAS0,r9 lis r9,(MAS1_VALID|MAS1_IPROT)@h ori r9,r9,(MAS1_TSIZE(BOOK3E_PAGESZ_1GB))@l mtspr SPRN_MAS1,r9 - LOAD_REG_IMMEDIATE(r9, 0x0 | M_IF_NEEDED) + LOAD_REG_IMMEDIATE(r9, 0x0 | MAS2_M_IF_NEEDED) mtspr SPRN_MAS2,r9 LOAD_REG_IMMEDIATE(r9, 0x0 | MAS3_SR | MAS3_SW | MAS3_SX) diff --git a/arch/powerpc/kernel/note.S b/arch/powerpc/kernel/note.S new file mode 100644 index 000000000000..bcdad15395dd --- /dev/null +++ b/arch/powerpc/kernel/note.S @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * PowerPC ELF notes. + * + * Copyright 2019, IBM Corporation + */ + +#include <linux/elfnote.h> +#include <asm/elfnote.h> + +/* + * Ultravisor-capable bit (PowerNV only). + * + * Bit 0 indicates that the powerpc kernel binary knows how to run in an + * ultravisor-enabled system. 
+ * + * In an ultravisor-enabled system, some machine resources are now controlled + * by the ultravisor. If the kernel is not ultravisor-capable, but it ends up + * being run on a machine with ultravisor, the kernel will probably crash + * trying to access ultravisor resources. For instance, it may crash in early + * boot trying to set the partition table entry 0. + * + * In an ultravisor-enabled system, a bootloader could warn the user or prevent + * the kernel from being run if the PowerPC ultravisor capability doesn't exist + * or the Ultravisor-capable bit is not set. + */ +#ifdef CONFIG_PPC_POWERNV +#define PPCCAP_ULTRAVISOR_BIT (1 << 0) +#else +#define PPCCAP_ULTRAVISOR_BIT 0 +#endif + +/* + * Add the PowerPC Capabilities in the binary ELF note. It is a bitmap that + * can be used to advertise kernel capabilities to userland. + */ +#define PPC_CAPABILITIES_BITMAP (PPCCAP_ULTRAVISOR_BIT) + +ELFNOTE(PowerPC, PPC_ELFNOTE_CAPABILITIES, + .long PPC_CAPABILITIES_BITMAP) diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c index e3ad8aa4730d..949eceb254d8 100644 --- a/arch/powerpc/kernel/paca.c +++ b/arch/powerpc/kernel/paca.c @@ -14,6 +14,8 @@ #include <asm/sections.h> #include <asm/pgtable.h> #include <asm/kexec.h> +#include <asm/svm.h> +#include <asm/ultravisor.h> #include "setup.h" @@ -52,6 +54,43 @@ static void *__init alloc_paca_data(unsigned long size, unsigned long align, #ifdef CONFIG_PPC_PSERIES +#define LPPACA_SIZE 0x400 + +static void *__init alloc_shared_lppaca(unsigned long size, unsigned long align, + unsigned long limit, int cpu) +{ + size_t shared_lppaca_total_size = PAGE_ALIGN(nr_cpu_ids * LPPACA_SIZE); + static unsigned long shared_lppaca_size; + static void *shared_lppaca; + void *ptr; + + if (!shared_lppaca) { + memblock_set_bottom_up(true); + + shared_lppaca = + memblock_alloc_try_nid(shared_lppaca_total_size, + PAGE_SIZE, MEMBLOCK_LOW_LIMIT, + limit, NUMA_NO_NODE); + if (!shared_lppaca) + panic("cannot allocate shared data"); 
+ + memblock_set_bottom_up(false); + uv_share_page(PHYS_PFN(__pa(shared_lppaca)), + shared_lppaca_total_size >> PAGE_SHIFT); + } + + ptr = shared_lppaca + shared_lppaca_size; + shared_lppaca_size += size; + + /* + * This is very early in boot, so no harm done if the kernel crashes at + * this point. + */ + BUG_ON(shared_lppaca_size >= shared_lppaca_total_size); + + return ptr; +} + /* * See asm/lppaca.h for more detail. * @@ -65,7 +104,7 @@ static inline void init_lppaca(struct lppaca *lppaca) *lppaca = (struct lppaca) { .desc = cpu_to_be32(0xd397d781), /* "LpPa" */ - .size = cpu_to_be16(0x400), + .size = cpu_to_be16(LPPACA_SIZE), .fpregs_in_use = 1, .slb_count = cpu_to_be16(64), .vmxregs_in_use = 0, @@ -75,19 +114,22 @@ static inline void init_lppaca(struct lppaca *lppaca) static struct lppaca * __init new_lppaca(int cpu, unsigned long limit) { struct lppaca *lp; - size_t size = 0x400; - BUILD_BUG_ON(size < sizeof(struct lppaca)); + BUILD_BUG_ON(sizeof(struct lppaca) > LPPACA_SIZE); if (early_cpu_has_feature(CPU_FTR_HVMODE)) return NULL; - lp = alloc_paca_data(size, 0x400, limit, cpu); + if (is_secure_guest()) + lp = alloc_shared_lppaca(LPPACA_SIZE, 0x400, limit, cpu); + else + lp = alloc_paca_data(LPPACA_SIZE, 0x400, limit, cpu); + init_lppaca(lp); return lp; } -#endif /* CONFIG_PPC_BOOK3S */ +#endif /* CONFIG_PPC_PSERIES */ #ifdef CONFIG_PPC_BOOK3S_64 diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index f627e15bb43c..c6c03416a151 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -261,12 +261,6 @@ int pcibios_sriov_disable(struct pci_dev *pdev) #endif /* CONFIG_PCI_IOV */ -void pcibios_bus_add_device(struct pci_dev *pdev) -{ - if (ppc_md.pcibios_bus_add_device) - ppc_md.pcibios_bus_add_device(pdev); -} - static resource_size_t pcibios_io_size(const struct pci_controller *hose) { #ifdef CONFIG_PPC64 @@ -964,7 +958,7 @@ void pcibios_setup_bus_self(struct pci_bus *bus) 
phb->controller_ops.dma_bus_setup(bus); } -static void pcibios_setup_device(struct pci_dev *dev) +void pcibios_bus_add_device(struct pci_dev *dev) { struct pci_controller *phb; /* Fixup NUMA node as it may not be setup yet by the generic @@ -985,17 +979,13 @@ static void pcibios_setup_device(struct pci_dev *dev) pci_read_irq_line(dev); if (ppc_md.pci_irq_fixup) ppc_md.pci_irq_fixup(dev); + + if (ppc_md.pcibios_bus_add_device) + ppc_md.pcibios_bus_add_device(dev); } int pcibios_add_device(struct pci_dev *dev) { - /* - * We can only call pcibios_setup_device() after bus setup is complete, - * since some of the platform specific DMA setup code depends on it. - */ - if (dev->bus->is_added) - pcibios_setup_device(dev); - #ifdef CONFIG_PCI_IOV if (ppc_md.pcibios_fixup_sriov) ppc_md.pcibios_fixup_sriov(dev); @@ -1004,24 +994,6 @@ int pcibios_add_device(struct pci_dev *dev) return 0; } -void pcibios_setup_bus_devices(struct pci_bus *bus) -{ - struct pci_dev *dev; - - pr_debug("PCI: Fixup bus devices %d (%s)\n", - bus->number, bus->self ? 
pci_name(bus->self) : "PHB"); - - list_for_each_entry(dev, &bus->devices, bus_list) { - /* Cardbus can call us to add new devices to a bus, so ignore - * those who are already fully discovered - */ - if (pci_dev_is_added(dev)) - continue; - - pcibios_setup_device(dev); - } -} - void pcibios_set_master(struct pci_dev *dev) { /* No special bus mastering setup handling */ @@ -1037,19 +1009,9 @@ void pcibios_fixup_bus(struct pci_bus *bus) /* Now fixup the bus bus */ pcibios_setup_bus_self(bus); - - /* Now fixup devices on that bus */ - pcibios_setup_bus_devices(bus); } EXPORT_SYMBOL(pcibios_fixup_bus); -void pci_fixup_cardbus(struct pci_bus *bus) -{ - /* Now fixup devices on that bus */ - pcibios_setup_bus_devices(bus); -} - - static int skip_isa_ioresource_align(struct pci_dev *dev) { if (pci_has_flag(PCI_CAN_SKIP_ISA_ALIGN) && @@ -1379,10 +1341,6 @@ void __init pcibios_resource_survey(void) pr_debug("PCI: Assigning unassigned resources...\n"); pci_assign_unassigned_resources(); } - - /* Call machine dependent fixup */ - if (ppc_md.pcibios_fixup) - ppc_md.pcibios_fixup(); } /* This is used by the PCI hotplug driver to allocate resource diff --git a/arch/powerpc/kernel/pci-hotplug.c b/arch/powerpc/kernel/pci-hotplug.c index 0b0cf8168b47..d6a67f814983 100644 --- a/arch/powerpc/kernel/pci-hotplug.c +++ b/arch/powerpc/kernel/pci-hotplug.c @@ -55,11 +55,18 @@ EXPORT_SYMBOL_GPL(pci_find_bus_by_node); void pcibios_release_device(struct pci_dev *dev) { struct pci_controller *phb = pci_bus_to_host(dev->bus); + struct pci_dn *pdn = pci_get_pdn(dev); eeh_remove_device(dev); if (phb->controller_ops.release_device) phb->controller_ops.release_device(dev); + + /* free()ing the pci_dn has been deferred to us, do it now */ + if (pdn && (pdn->flags & PCI_DN_FLAG_DEAD)) { + pci_dbg(dev, "freeing dead pdn\n"); + kfree(pdn); + } } /** @@ -127,7 +134,6 @@ void pci_hp_add_devices(struct pci_bus *bus) */ slotno = PCI_SLOT(PCI_DN(dn->child)->devfn); pci_scan_slot(bus, PCI_DEVFN(slotno, 0)); 
- pcibios_setup_bus_devices(bus); max = bus->busn_res.start; /* * Scan bridges that are already configured. We don't touch diff --git a/arch/powerpc/kernel/pci_32.c b/arch/powerpc/kernel/pci_32.c index 50942a1d1a5f..b49e1060a3bf 100644 --- a/arch/powerpc/kernel/pci_32.c +++ b/arch/powerpc/kernel/pci_32.c @@ -263,6 +263,10 @@ static int __init pcibios_init(void) /* Call common code to handle resource allocation */ pcibios_resource_survey(); + /* Call machine dependent fixup */ + if (ppc_md.pcibios_fixup) + ppc_md.pcibios_fixup(); + /* Call machine dependent post-init code */ if (ppc_md.pcibios_after_init) ppc_md.pcibios_after_init(); diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c index b7030b1189d0..f83d1f69b1dd 100644 --- a/arch/powerpc/kernel/pci_64.c +++ b/arch/powerpc/kernel/pci_64.c @@ -54,14 +54,20 @@ static int __init pcibios_init(void) pci_add_flags(PCI_ENABLE_PROC_DOMAINS | PCI_COMPAT_DOMAIN_0); /* Scan all of the recorded PCI controllers. */ - list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { + list_for_each_entry_safe(hose, tmp, &hose_list, list_node) pcibios_scan_phb(hose); - pci_bus_add_devices(hose->bus); - } /* Call common code to handle resource allocation */ pcibios_resource_survey(); + /* Add devices. 
*/ + list_for_each_entry_safe(hose, tmp, &hose_list, list_node) + pci_bus_add_devices(hose->bus); + + /* Call machine dependent fixup */ + if (ppc_md.pcibios_fixup) + ppc_md.pcibios_fixup(); + printk(KERN_DEBUG "PCI: Probing PCI hardware done\n"); return 0; diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c index c4c8c237a106..4e654df55969 100644 --- a/arch/powerpc/kernel/pci_dn.c +++ b/arch/powerpc/kernel/pci_dn.c @@ -125,7 +125,7 @@ struct pci_dn *pci_get_pdn(struct pci_dev *pdev) } #ifdef CONFIG_PCI_IOV -static struct pci_dn *add_one_dev_pci_data(struct pci_dn *parent, +static struct pci_dn *add_one_sriov_vf_pdn(struct pci_dn *parent, int vf_index, int busno, int devfn) { @@ -151,17 +151,15 @@ static struct pci_dn *add_one_dev_pci_data(struct pci_dn *parent, return pdn; } -#endif -struct pci_dn *add_dev_pci_data(struct pci_dev *pdev) +struct pci_dn *add_sriov_vf_pdns(struct pci_dev *pdev) { -#ifdef CONFIG_PCI_IOV struct pci_dn *parent, *pdn; int i; /* Only support IOV for now */ - if (!pdev->is_physfn) - return pci_get_pdn(pdev); + if (WARN_ON(!pdev->is_physfn)) + return NULL; /* Check if VFs have been populated */ pdn = pci_get_pdn(pdev); @@ -176,7 +174,7 @@ struct pci_dn *add_dev_pci_data(struct pci_dev *pdev) for (i = 0; i < pci_sriov_get_totalvfs(pdev); i++) { struct eeh_dev *edev __maybe_unused; - pdn = add_one_dev_pci_data(parent, i, + pdn = add_one_sriov_vf_pdn(parent, i, pci_iov_virtfn_bus(pdev, i), pci_iov_virtfn_devfn(pdev, i)); if (!pdn) { @@ -192,31 +190,17 @@ struct pci_dn *add_dev_pci_data(struct pci_dev *pdev) edev->physfn = pdev; #endif /* CONFIG_EEH */ } -#endif /* CONFIG_PCI_IOV */ - return pci_get_pdn(pdev); } -void remove_dev_pci_data(struct pci_dev *pdev) +void remove_sriov_vf_pdns(struct pci_dev *pdev) { -#ifdef CONFIG_PCI_IOV struct pci_dn *parent; struct pci_dn *pdn, *tmp; int i; - /* - * VF and VF PE are created/released dynamically, so we need to - * bind/unbind them. 
Otherwise the VF and VF PE would be mismatched - * when re-enabling SR-IOV. - */ - if (pdev->is_virtfn) { - pdn = pci_get_pdn(pdev); - pdn->pe_number = IODA_INVALID_PE; - return; - } - /* Only support IOV PF for now */ - if (!pdev->is_physfn) + if (WARN_ON(!pdev->is_physfn)) return; /* Check if VFs have been populated */ @@ -244,9 +228,22 @@ void remove_dev_pci_data(struct pci_dev *pdev) continue; #ifdef CONFIG_EEH - /* Release EEH device for the VF */ + /* + * Release EEH state for this VF. The PCI core + * has already torn down the pci_dev for this VF, but + * we're responsible to removing the eeh_dev since it + * has the same lifetime as the pci_dn that spawned it. + */ edev = pdn_to_eeh_dev(pdn); if (edev) { + /* + * We allocate pci_dn's for the totalvfs count, + * but only only the vfs that were activated + * have a configured PE. + */ + if (edev->pe) + eeh_rmv_from_parent_pe(edev); + pdn->edev = NULL; kfree(edev); } @@ -258,8 +255,8 @@ void remove_dev_pci_data(struct pci_dev *pdev) kfree(pdn); } } -#endif /* CONFIG_PCI_IOV */ } +#endif /* CONFIG_PCI_IOV */ struct pci_dn *pci_add_device_node_info(struct pci_controller *hose, struct device_node *dn) @@ -323,6 +320,7 @@ void pci_remove_device_node_info(struct device_node *dn) { struct pci_dn *pdn = dn ? PCI_DN(dn) : NULL; struct device_node *parent; + struct pci_dev *pdev; #ifdef CONFIG_EEH struct eeh_dev *edev = pdn_to_eeh_dev(pdn); @@ -336,12 +334,28 @@ void pci_remove_device_node_info(struct device_node *dn) WARN_ON(!list_empty(&pdn->child_list)); list_del(&pdn->list); + /* Drop the parent pci_dn's ref to our backing dt node */ parent = of_get_parent(dn); if (parent) of_node_put(parent); - dn->data = NULL; - kfree(pdn); + /* + * At this point we *might* still have a pci_dev that was + * instantiated from this pci_dn. So defer free()ing it until + * the pci_dev's release function is called. 
+ */ + pdev = pci_get_domain_bus_and_slot(pdn->phb->global_number, + pdn->busno, pdn->devfn); + if (pdev) { + /* NB: pdev has a ref to dn */ + pci_dbg(pdev, "marked pdn (from %pOF) as dead\n", dn); + pdn->flags |= PCI_DN_FLAG_DEAD; + } else { + dn->data = NULL; + kfree(pdn); + } + + pci_dev_put(pdev); } EXPORT_SYMBOL_GPL(pci_remove_device_node_info); diff --git a/arch/powerpc/kernel/pci_of_scan.c b/arch/powerpc/kernel/pci_of_scan.c index 409c6c1beabf..c3024f104765 100644 --- a/arch/powerpc/kernel/pci_of_scan.c +++ b/arch/powerpc/kernel/pci_of_scan.c @@ -34,31 +34,75 @@ static u32 get_int_prop(struct device_node *np, const char *name, u32 def) * pci_parse_of_flags - Parse the flags cell of a device tree PCI address * @addr0: value of 1st cell of a device tree PCI address. * @bridge: Set this flag if the address is from a bridge 'ranges' property + * + * PCI Bus Binding to IEEE Std 1275-1994 + * + * Bit# 33222222 22221111 11111100 00000000 + * 10987654 32109876 54321098 76543210 + * phys.hi cell: npt000ss bbbbbbbb dddddfff rrrrrrrr + * phys.mid cell: hhhhhhhh hhhhhhhh hhhhhhhh hhhhhhhh + * phys.lo cell: llllllll llllllll llllllll llllllll + * + * where: + * n is 0 if the address is relocatable, 1 otherwise + * p is 1 if the addressable region is "prefetchable", 0 otherwise + * t is 1 if the address is aliased (for non-relocatable I/O), + * below 1 MB (for Memory),or below 64 KB (for relocatable I/O). 
+ * ss is the space code, denoting the address space: + * 00 denotes Configuration Space + * 01 denotes I/O Space + * 10 denotes 32-bit-address Memory Space + * 11 denotes 64-bit-address Memory Space + * bbbbbbbb is the 8-bit Bus Number + * ddddd is the 5-bit Device Number + * fff is the 3-bit Function Number + * rrrrrrrr is the 8-bit Register Number */ +#define OF_PCI_ADDR0_SPACE(ss) (((ss)&3)<<24) +#define OF_PCI_ADDR0_SPACE_CFG OF_PCI_ADDR0_SPACE(0) +#define OF_PCI_ADDR0_SPACE_IO OF_PCI_ADDR0_SPACE(1) +#define OF_PCI_ADDR0_SPACE_MMIO32 OF_PCI_ADDR0_SPACE(2) +#define OF_PCI_ADDR0_SPACE_MMIO64 OF_PCI_ADDR0_SPACE(3) +#define OF_PCI_ADDR0_SPACE_MASK OF_PCI_ADDR0_SPACE(3) +#define OF_PCI_ADDR0_RELOC (1UL<<31) +#define OF_PCI_ADDR0_PREFETCH (1UL<<30) +#define OF_PCI_ADDR0_ALIAS (1UL<<29) +#define OF_PCI_ADDR0_BUS 0x00FF0000UL +#define OF_PCI_ADDR0_DEV 0x0000F800UL +#define OF_PCI_ADDR0_FN 0x00000700UL +#define OF_PCI_ADDR0_BARREG 0x000000FFUL + unsigned int pci_parse_of_flags(u32 addr0, int bridge) { - unsigned int flags = 0; + unsigned int flags = 0, as = addr0 & OF_PCI_ADDR0_SPACE_MASK; - if (addr0 & 0x02000000) { + if (as == OF_PCI_ADDR0_SPACE_MMIO32 || as == OF_PCI_ADDR0_SPACE_MMIO64) { flags = IORESOURCE_MEM | PCI_BASE_ADDRESS_SPACE_MEMORY; - flags |= (addr0 >> 22) & PCI_BASE_ADDRESS_MEM_TYPE_64; - if (flags & PCI_BASE_ADDRESS_MEM_TYPE_64) - flags |= IORESOURCE_MEM_64; - flags |= (addr0 >> 28) & PCI_BASE_ADDRESS_MEM_TYPE_1M; - if (addr0 & 0x40000000) - flags |= IORESOURCE_PREFETCH - | PCI_BASE_ADDRESS_MEM_PREFETCH; + + if (as == OF_PCI_ADDR0_SPACE_MMIO64) + flags |= PCI_BASE_ADDRESS_MEM_TYPE_64 | IORESOURCE_MEM_64; + + if (addr0 & OF_PCI_ADDR0_ALIAS) + flags |= PCI_BASE_ADDRESS_MEM_TYPE_1M; + + if (addr0 & OF_PCI_ADDR0_PREFETCH) + flags |= IORESOURCE_PREFETCH | + PCI_BASE_ADDRESS_MEM_PREFETCH; + /* Note: We don't know whether the ROM has been left enabled * by the firmware or not. 
We mark it as disabled (ie, we do * not set the IORESOURCE_ROM_ENABLE flag) for now rather than * do a config space read, it will be force-enabled if needed */ - if (!bridge && (addr0 & 0xff) == 0x30) + if (!bridge && (addr0 & OF_PCI_ADDR0_BARREG) == PCI_ROM_ADDRESS) flags |= IORESOURCE_READONLY; - } else if (addr0 & 0x01000000) + + } else if (as == OF_PCI_ADDR0_SPACE_IO) flags = IORESOURCE_IO | PCI_BASE_ADDRESS_SPACE_IO; + if (flags) flags |= IORESOURCE_SIZEALIGN; + return flags; } @@ -370,7 +414,6 @@ static void __of_scan_bus(struct device_node *node, struct pci_bus *bus, */ if (!rescan_existing) pcibios_setup_bus_self(bus); - pcibios_setup_bus_devices(bus); /* Now scan child busses */ for_each_pci_bridge(dev, bus) diff --git a/arch/powerpc/kernel/proc_powerpc.c b/arch/powerpc/kernel/proc_powerpc.c index be3758d54e59..877817471e3c 100644 --- a/arch/powerpc/kernel/proc_powerpc.c +++ b/arch/powerpc/kernel/proc_powerpc.c @@ -39,10 +39,10 @@ static int page_map_mmap( struct file *file, struct vm_area_struct *vma ) return 0; } -static const struct file_operations page_map_fops = { - .llseek = page_map_seek, - .read = page_map_read, - .mmap = page_map_mmap +static const struct proc_ops page_map_proc_ops = { + .proc_lseek = page_map_seek, + .proc_read = page_map_read, + .proc_mmap = page_map_mmap, }; @@ -51,7 +51,7 @@ static int __init proc_ppc64_init(void) struct proc_dir_entry *pde; pde = proc_create_data("powerpc/systemcfg", S_IFREG | 0444, NULL, - &page_map_fops, vdso_data); + &page_map_proc_ops, vdso_data); if (!pde) return 1; proc_set_size(pde, PAGE_SIZE); diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 8fc4de0d22b4..fad50db9dcf2 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -101,21 +101,8 @@ static void check_if_tm_restore_required(struct task_struct *tsk) } } -static bool tm_active_with_fp(struct task_struct *tsk) -{ - return MSR_TM_ACTIVE(tsk->thread.regs->msr) && - (tsk->thread.ckpt_regs.msr 
& MSR_FP); -} - -static bool tm_active_with_altivec(struct task_struct *tsk) -{ - return MSR_TM_ACTIVE(tsk->thread.regs->msr) && - (tsk->thread.ckpt_regs.msr & MSR_VEC); -} #else static inline void check_if_tm_restore_required(struct task_struct *tsk) { } -static inline bool tm_active_with_fp(struct task_struct *tsk) { return false; } -static inline bool tm_active_with_altivec(struct task_struct *tsk) { return false; } #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ bool strict_msr_control; @@ -252,7 +239,7 @@ EXPORT_SYMBOL(enable_kernel_fp); static int restore_fp(struct task_struct *tsk) { - if (tsk->thread.load_fp || tm_active_with_fp(tsk)) { + if (tsk->thread.load_fp) { load_fp_state(&current->thread.fp_state); current->thread.load_fp++; return 1; @@ -334,8 +321,7 @@ EXPORT_SYMBOL_GPL(flush_altivec_to_thread); static int restore_altivec(struct task_struct *tsk) { - if (cpu_has_feature(CPU_FTR_ALTIVEC) && - (tsk->thread.load_vec || tm_active_with_altivec(tsk))) { + if (cpu_has_feature(CPU_FTR_ALTIVEC) && (tsk->thread.load_vec)) { load_vr_state(&tsk->thread.vr_state); tsk->thread.used_vr = 1; tsk->thread.load_vec++; @@ -497,13 +483,14 @@ void giveup_all(struct task_struct *tsk) if (!tsk->thread.regs) return; + check_if_tm_restore_required(tsk); + usermsr = tsk->thread.regs->msr; if ((usermsr & msr_all_available) == 0) return; msr_check_and_set(msr_all_available); - check_if_tm_restore_required(tsk); WARN_ON((usermsr & MSR_VSX) && !((usermsr & MSR_FP) && (usermsr & MSR_VEC))); @@ -728,6 +715,8 @@ static void set_debug_reg_defaults(struct thread_struct *thread) { thread->hw_brk.address = 0; thread->hw_brk.type = 0; + thread->hw_brk.len = 0; + thread->hw_brk.hw_len = 0; if (ppc_breakpoint_available()) set_breakpoint(&thread->hw_brk); } @@ -751,28 +740,6 @@ static inline int __set_dabr(unsigned long dabr, unsigned long dabrx) mtspr(SPRN_DABRX, dabrx); return 0; } -#elif defined(CONFIG_PPC_8xx) -static inline int __set_dabr(unsigned long dabr, unsigned long dabrx) -{ - unsigned 
long addr = dabr & ~HW_BRK_TYPE_DABR; - unsigned long lctrl1 = 0x90000000; /* compare type: equal on E & F */ - unsigned long lctrl2 = 0x8e000002; /* watchpoint 1 on cmp E | F */ - - if ((dabr & HW_BRK_TYPE_RDWR) == HW_BRK_TYPE_READ) - lctrl1 |= 0xa0000; - else if ((dabr & HW_BRK_TYPE_RDWR) == HW_BRK_TYPE_WRITE) - lctrl1 |= 0xf0000; - else if ((dabr & HW_BRK_TYPE_RDWR) == 0) - lctrl2 = 0; - - mtspr(SPRN_LCTRL2, 0); - mtspr(SPRN_CMPE, addr); - mtspr(SPRN_CMPF, addr + 4); - mtspr(SPRN_LCTRL1, lctrl1); - mtspr(SPRN_LCTRL2, lctrl2); - - return 0; -} #else static inline int __set_dabr(unsigned long dabr, unsigned long dabrx) { @@ -793,6 +760,39 @@ static inline int set_dabr(struct arch_hw_breakpoint *brk) return __set_dabr(dabr, dabrx); } +static inline int set_breakpoint_8xx(struct arch_hw_breakpoint *brk) +{ + unsigned long lctrl1 = LCTRL1_CTE_GT | LCTRL1_CTF_LT | LCTRL1_CRWE_RW | + LCTRL1_CRWF_RW; + unsigned long lctrl2 = LCTRL2_LW0EN | LCTRL2_LW0LADC | LCTRL2_SLW0EN; + unsigned long start_addr = brk->address & ~HW_BREAKPOINT_ALIGN; + unsigned long end_addr = (brk->address + brk->len - 1) | HW_BREAKPOINT_ALIGN; + + if (start_addr == 0) + lctrl2 |= LCTRL2_LW0LA_F; + else if (end_addr == ~0U) + lctrl2 |= LCTRL2_LW0LA_E; + else + lctrl2 |= LCTRL2_LW0LA_EandF; + + mtspr(SPRN_LCTRL2, 0); + + if ((brk->type & HW_BRK_TYPE_RDWR) == 0) + return 0; + + if ((brk->type & HW_BRK_TYPE_RDWR) == HW_BRK_TYPE_READ) + lctrl1 |= LCTRL1_CRWE_RO | LCTRL1_CRWF_RO; + if ((brk->type & HW_BRK_TYPE_RDWR) == HW_BRK_TYPE_WRITE) + lctrl1 |= LCTRL1_CRWE_WO | LCTRL1_CRWF_WO; + + mtspr(SPRN_CMPE, start_addr - 1); + mtspr(SPRN_CMPF, end_addr + 1); + mtspr(SPRN_LCTRL1, lctrl1); + mtspr(SPRN_LCTRL2, lctrl2); + + return 0; +} + void __set_breakpoint(struct arch_hw_breakpoint *brk) { memcpy(this_cpu_ptr(&current_brk), brk, sizeof(*brk)); @@ -800,6 +800,8 @@ void __set_breakpoint(struct arch_hw_breakpoint *brk) if (dawr_enabled()) // Power8 or later set_dawr(brk); + else if (IS_ENABLED(CONFIG_PPC_8xx)) + 
set_breakpoint_8xx(brk); else if (!cpu_has_feature(CPU_FTR_ARCH_207S)) // Power7 or earlier set_dabr(brk); @@ -829,6 +831,7 @@ static inline bool hw_brk_match(struct arch_hw_breakpoint *a, return false; if (a->len != b->len) return false; + /* no need to check hw_len. it's calculated from address and len */ return true; } @@ -1274,16 +1277,6 @@ void show_user_instructions(struct pt_regs *regs) pc = regs->nip - (NR_INSN_TO_PRINT * 3 / 4 * sizeof(int)); - /* - * Make sure the NIP points at userspace, not kernel text/data or - * elsewhere. - */ - if (!__access_ok(pc, NR_INSN_TO_PRINT * sizeof(int), USER_DS)) { - pr_info("%s[%d]: Bad NIP, not dumping instructions.\n", - current->comm, current->pid); - return; - } - seq_buf_init(&s, buf, sizeof(buf)); while (n) { @@ -1294,7 +1287,7 @@ void show_user_instructions(struct pt_regs *regs) for (i = 0; i < 8 && n; i++, n--, pc += sizeof(int)) { int instr; - if (probe_kernel_address((const void *)pc, instr)) { + if (probe_user_read(&instr, (void __user *)pc, sizeof(instr))) { seq_buf_printf(&s, "XXXXXXXX "); continue; } @@ -1600,8 +1593,9 @@ static void setup_ksp_vsid(struct task_struct *p, unsigned long sp) /* * Copy architecture-specific thread state */ -int copy_thread(unsigned long clone_flags, unsigned long usp, - unsigned long kthread_arg, struct task_struct *p) +int copy_thread_tls(unsigned long clone_flags, unsigned long usp, + unsigned long kthread_arg, struct task_struct *p, + unsigned long tls) { struct pt_regs *childregs, *kregs; extern void ret_from_fork(void); @@ -1642,10 +1636,10 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, if (clone_flags & CLONE_SETTLS) { #ifdef CONFIG_PPC64 if (!is_32bit_task()) - childregs->gpr[13] = childregs->gpr[6]; + childregs->gpr[13] = tls; else #endif - childregs->gpr[2] = childregs->gpr[6]; + childregs->gpr[2] = tls; } f = ret_from_fork; @@ -2046,10 +2040,8 @@ void show_stack(struct task_struct *tsk, unsigned long *stack) int count = 0; int firstframe = 1; #ifdef 
CONFIG_FUNCTION_GRAPH_TRACER - struct ftrace_ret_stack *ret_stack; - extern void return_to_handler(void); - unsigned long rth = (unsigned long)return_to_handler; - int curr_frame = 0; + unsigned long ret_addr; + int ftrace_idx = 0; #endif if (tsk == NULL) @@ -2078,15 +2070,10 @@ void show_stack(struct task_struct *tsk, unsigned long *stack) if (!firstframe || ip != lr) { printk("["REG"] ["REG"] %pS", sp, ip, (void *)ip); #ifdef CONFIG_FUNCTION_GRAPH_TRACER - if ((ip == rth) && curr_frame >= 0) { - ret_stack = ftrace_graph_get_ret_stack(current, - curr_frame++); - if (ret_stack) - pr_cont(" (%pS)", - (void *)ret_stack->ret); - else - curr_frame = -1; - } + ret_addr = ftrace_graph_ret_addr(current, + &ftrace_idx, ip, stack); + if (ret_addr != ip) + pr_cont(" (%pS)", (void *)ret_addr); #endif if (firstframe) pr_cont(" (unreliable)"); diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index 7159e791a70d..6620f37abe73 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -55,6 +55,7 @@ #include <asm/firmware.h> #include <asm/dt_cpu_ftrs.h> #include <asm/drmem.h> +#include <asm/ultravisor.h> #include <mm/mmu_decl.h> @@ -702,9 +703,12 @@ void __init early_init_devtree(void *params) #ifdef CONFIG_PPC_POWERNV /* Some machines might need OPAL info for debugging, grab it now. 
*/ of_scan_flat_dt(early_init_dt_scan_opal, NULL); + + /* Scan tree for ultravisor feature */ + of_scan_flat_dt(early_init_dt_scan_ultravisor, NULL); #endif -#ifdef CONFIG_FA_DUMP +#if defined(CONFIG_FA_DUMP) || defined(CONFIG_PRESERVE_FA_DUMP) /* scan tree to see if dump is active during last boot */ of_scan_flat_dt(early_init_dt_scan_fw_dump, NULL); #endif @@ -731,7 +735,7 @@ void __init early_init_devtree(void *params) if (PHYSICAL_START > MEMORY_START) memblock_reserve(MEMORY_START, 0x8000); reserve_kdump_trampoline(); -#ifdef CONFIG_FA_DUMP +#if defined(CONFIG_FA_DUMP) || defined(CONFIG_PRESERVE_FA_DUMP) /* * If we fail to reserve memory for firmware-assisted dump then * fallback to kexec based kdump. diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index 514707ef6779..577345382b23 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -40,6 +40,7 @@ #include <asm/sections.h> #include <asm/machdep.h> #include <asm/asm-prototypes.h> +#include <asm/ultravisor-api.h> #include <linux/linux_logo.h> @@ -94,7 +95,7 @@ static int of_workarounds __prombss; #define PROM_BUG() do { \ prom_printf("kernel BUG at %s line 0x%x!\n", \ __FILE__, __LINE__); \ - __asm__ __volatile__(".long " BUG_ILLEGAL_INSTR); \ + __builtin_trap(); \ } while (0) #ifdef DEBUG_PROM @@ -171,6 +172,10 @@ static bool __prombss prom_radix_disable; static bool __prombss prom_xive_disable; #endif +#ifdef CONFIG_PPC_SVM +static bool __prombss prom_svm_enable; +#endif + struct platform_support { bool hash_mmu; bool radix_mmu; @@ -298,16 +303,24 @@ static char __init *prom_strstr(const char *s1, const char *s2) return NULL; } -static size_t __init prom_strlcpy(char *dest, const char *src, size_t size) +static size_t __init prom_strlcat(char *dest, const char *src, size_t count) { - size_t ret = prom_strlen(src); + size_t dsize = prom_strlen(dest); + size_t len = prom_strlen(src); + size_t res = dsize + len; + + /* This would be a bug */ + if 
(dsize >= count) + return count; + + dest += dsize; + count -= dsize; + if (len >= count) + len = count-1; + memcpy(dest, src, len); + dest[len] = 0; + return res; - if (size) { - size_t len = (ret >= size) ? size - 1 : ret; - memcpy(dest, src, len); - dest[len] = '\0'; - } - return ret; } #ifdef CONFIG_PPC_PSERIES @@ -759,10 +772,14 @@ static void __init early_cmdline_parse(void) prom_cmd_line[0] = 0; p = prom_cmd_line; - if ((long)prom.chosen > 0) + + if (!IS_ENABLED(CONFIG_CMDLINE_FORCE) && (long)prom.chosen > 0) l = prom_getprop(prom.chosen, "bootargs", p, COMMAND_LINE_SIZE-1); - if (IS_ENABLED(CONFIG_CMDLINE_BOOL) && (l <= 0 || p[0] == '\0')) /* dbl check */ - prom_strlcpy(prom_cmd_line, CONFIG_CMDLINE, sizeof(prom_cmd_line)); + + if (IS_ENABLED(CONFIG_CMDLINE_EXTEND) || l <= 0 || p[0] == '\0') + prom_strlcat(prom_cmd_line, " " CONFIG_CMDLINE, + sizeof(prom_cmd_line)); + prom_printf("command line: %s\n", prom_cmd_line); #ifdef CONFIG_PPC64 @@ -812,6 +829,17 @@ static void __init early_cmdline_parse(void) prom_debug("XIVE disabled from cmdline\n"); } #endif /* CONFIG_PPC_PSERIES */ + +#ifdef CONFIG_PPC_SVM + opt = prom_strstr(prom_cmd_line, "svm="); + if (opt) { + bool val; + + opt += sizeof("svm=") - 1; + if (!prom_strtobool(opt, &val)) + prom_svm_enable = val; + } +#endif /* CONFIG_PPC_SVM */ } #ifdef CONFIG_PPC_PSERIES @@ -1037,7 +1065,7 @@ static const struct ibm_arch_vec ibm_architecture_vec_template __initconst = { .reserved2 = 0, .reserved3 = 0, .subprocessors = 1, - .byte22 = OV5_FEAT(OV5_DRMEM_V2), + .byte22 = OV5_FEAT(OV5_DRMEM_V2) | OV5_FEAT(OV5_DRC_INFO), .intarch = 0, .mmu = 0, .hash_ext = 0, @@ -1712,6 +1740,43 @@ static void __init prom_close_stdin(void) } } +#ifdef CONFIG_PPC_SVM +static int prom_rtas_hcall(uint64_t args) +{ + register uint64_t arg1 asm("r3") = H_RTAS; + register uint64_t arg2 asm("r4") = args; + + asm volatile("sc 1\n" : "=r" (arg1) : + "r" (arg1), + "r" (arg2) :); + return arg1; +} + +static struct rtas_args __prombss 
os_term_args; + +static void __init prom_rtas_os_term(char *str) +{ + phandle rtas_node; + __be32 val; + u32 token; + + prom_debug("%s: start...\n", __func__); + rtas_node = call_prom("finddevice", 1, 1, ADDR("/rtas")); + prom_debug("rtas_node: %x\n", rtas_node); + if (!PHANDLE_VALID(rtas_node)) + return; + + val = 0; + prom_getprop(rtas_node, "ibm,os-term", &val, sizeof(val)); + token = be32_to_cpu(val); + prom_debug("ibm,os-term: %x\n", token); + if (token == 0) + prom_panic("Could not get token for ibm,os-term\n"); + os_term_args.token = cpu_to_be32(token); + prom_rtas_hcall((uint64_t)&os_term_args); +} +#endif /* CONFIG_PPC_SVM */ + /* * Allocate room for and instantiate RTAS */ @@ -3168,6 +3233,59 @@ static void unreloc_toc(void) #endif #endif +#ifdef CONFIG_PPC_SVM +/* + * Perform the Enter Secure Mode ultracall. + */ +static int enter_secure_mode(unsigned long kbase, unsigned long fdt) +{ + register unsigned long r3 asm("r3") = UV_ESM; + register unsigned long r4 asm("r4") = kbase; + register unsigned long r5 asm("r5") = fdt; + + asm volatile("sc 2" : "+r"(r3) : "r"(r4), "r"(r5)); + + return r3; +} + +/* + * Call the Ultravisor to transfer us to secure memory if we have an ESM blob. + */ +static void setup_secure_guest(unsigned long kbase, unsigned long fdt) +{ + int ret; + + if (!prom_svm_enable) + return; + + /* Switch to secure mode. */ + prom_printf("Switching to secure mode.\n"); + + /* + * The ultravisor will do an integrity check of the kernel image but we + * relocated it so the check will fail. Restore the original image by + * relocating it back to the kernel virtual base address. + */ + if (IS_ENABLED(CONFIG_RELOCATABLE)) + relocate(KERNELBASE); + + ret = enter_secure_mode(kbase, fdt); + + /* Relocate the kernel again. 
*/ + if (IS_ENABLED(CONFIG_RELOCATABLE)) + relocate(kbase); + + if (ret != U_SUCCESS) { + prom_printf("Returned %d from switching to secure mode.\n", ret); + prom_rtas_os_term("Switch to secure mode failed.\n"); + } +} +#else +static void setup_secure_guest(unsigned long kbase, unsigned long fdt) +{ +} +#endif /* CONFIG_PPC_SVM */ + /* * We enter here early on, when the Open Firmware prom is still * handling exceptions and the MMU hash table for us. @@ -3366,6 +3484,9 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4, unreloc_toc(); #endif + /* Move to secure memory if we're supposed to be secure guests. */ + setup_secure_guest(kbase, hdr); + __start(hdr, kbase, 0, 0, 0, 0, 0); return 0; diff --git a/arch/powerpc/kernel/prom_init_check.sh b/arch/powerpc/kernel/prom_init_check.sh index 160bef0d553d..b183ab9c5107 100644 --- a/arch/powerpc/kernel/prom_init_check.sh +++ b/arch/powerpc/kernel/prom_init_check.sh @@ -26,14 +26,15 @@ _end enter_prom $MEM_FUNCS reloc_offset __secondary_hold __secondary_hold_acknowledge __secondary_hold_spinloop __start logo_linux_clut224 btext_prepare_BAT reloc_got2 kernstart_addr memstart_addr linux_banner _stext -__prom_init_toc_start __prom_init_toc_end btext_setup_display TOC." +__prom_init_toc_start __prom_init_toc_end btext_setup_display TOC. 
+relocate" NM="$1" OBJ="$2" ERROR=0 -function check_section() +check_section() { file=$1 section=$2 diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c index 8c92febf5f44..25c0424e8868 100644 --- a/arch/powerpc/kernel/ptrace.c +++ b/arch/powerpc/kernel/ptrace.c @@ -2425,7 +2425,8 @@ static int ptrace_set_debugreg(struct task_struct *task, unsigned long addr, return -EIO; hw_brk.address = data & (~HW_BRK_TYPE_DABR); hw_brk.type = (data & HW_BRK_TYPE_DABR) | HW_BRK_TYPE_PRIV_ALL; - hw_brk.len = 8; + hw_brk.len = DABR_MAX_LEN; + hw_brk.hw_len = DABR_MAX_LEN; set_bp = (data) && (hw_brk.type & HW_BRK_TYPE_RDWR); #ifdef CONFIG_HAVE_HW_BREAKPOINT bp = thread->ptrace_bps[0]; @@ -2439,6 +2440,7 @@ static int ptrace_set_debugreg(struct task_struct *task, unsigned long addr, if (bp) { attr = bp->attr; attr.bp_addr = hw_brk.address; + attr.bp_len = DABR_MAX_LEN; arch_bp_generic_fields(hw_brk.type, &attr.bp_type); /* Enable breakpoint */ @@ -2456,7 +2458,7 @@ static int ptrace_set_debugreg(struct task_struct *task, unsigned long addr, /* Create a new breakpoint request if one doesn't exist already */ hw_breakpoint_init(&attr); attr.bp_addr = hw_brk.address; - attr.bp_len = 8; + attr.bp_len = DABR_MAX_LEN; arch_bp_generic_fields(hw_brk.type, &attr.bp_type); @@ -2880,18 +2882,14 @@ static long ppc_set_hwdebug(struct task_struct *child, if ((unsigned long)bp_info->addr >= TASK_SIZE) return -EIO; - brk.address = bp_info->addr & ~7UL; + brk.address = bp_info->addr & ~HW_BREAKPOINT_ALIGN; brk.type = HW_BRK_TYPE_TRANSLATE; - brk.len = 8; + brk.len = DABR_MAX_LEN; if (bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_READ) brk.type |= HW_BRK_TYPE_READ; if (bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_WRITE) brk.type |= HW_BRK_TYPE_WRITE; #ifdef CONFIG_HAVE_HW_BREAKPOINT - /* - * Check if the request is for 'range' breakpoints. We can - * support it if range < 8 bytes. 
- */ if (bp_info->addr_mode == PPC_BREAKPOINT_MODE_RANGE_INCLUSIVE) len = bp_info->addr2 - bp_info->addr; else if (bp_info->addr_mode == PPC_BREAKPOINT_MODE_EXACT) @@ -2904,7 +2902,7 @@ static long ppc_set_hwdebug(struct task_struct *child, /* Create a new breakpoint request if one doesn't exist already */ hw_breakpoint_init(&attr); - attr.bp_addr = (unsigned long)bp_info->addr & ~HW_BREAKPOINT_ALIGN; + attr.bp_addr = (unsigned long)bp_info->addr; attr.bp_len = len; arch_bp_generic_fields(brk.type, &attr.bp_type); @@ -3361,6 +3359,12 @@ void do_syscall_trace_leave(struct pt_regs *regs) user_enter(); } +void __init pt_regs_check(void); + +/* + * Dummy function, its purpose is to break the build if struct pt_regs and + * struct user_pt_regs don't match. + */ void __init pt_regs_check(void) { BUILD_BUG_ON(offsetof(struct pt_regs, gpr) != @@ -3398,4 +3402,67 @@ void __init pt_regs_check(void) offsetof(struct user_pt_regs, result)); BUILD_BUG_ON(sizeof(struct user_pt_regs) > sizeof(struct pt_regs)); + + // Now check that the pt_regs offsets match the uapi #defines + #define CHECK_REG(_pt, _reg) \ + BUILD_BUG_ON(_pt != (offsetof(struct user_pt_regs, _reg) / \ + sizeof(unsigned long))); + + CHECK_REG(PT_R0, gpr[0]); + CHECK_REG(PT_R1, gpr[1]); + CHECK_REG(PT_R2, gpr[2]); + CHECK_REG(PT_R3, gpr[3]); + CHECK_REG(PT_R4, gpr[4]); + CHECK_REG(PT_R5, gpr[5]); + CHECK_REG(PT_R6, gpr[6]); + CHECK_REG(PT_R7, gpr[7]); + CHECK_REG(PT_R8, gpr[8]); + CHECK_REG(PT_R9, gpr[9]); + CHECK_REG(PT_R10, gpr[10]); + CHECK_REG(PT_R11, gpr[11]); + CHECK_REG(PT_R12, gpr[12]); + CHECK_REG(PT_R13, gpr[13]); + CHECK_REG(PT_R14, gpr[14]); + CHECK_REG(PT_R15, gpr[15]); + CHECK_REG(PT_R16, gpr[16]); + CHECK_REG(PT_R17, gpr[17]); + CHECK_REG(PT_R18, gpr[18]); + CHECK_REG(PT_R19, gpr[19]); + CHECK_REG(PT_R20, gpr[20]); + CHECK_REG(PT_R21, gpr[21]); + CHECK_REG(PT_R22, gpr[22]); + CHECK_REG(PT_R23, gpr[23]); + CHECK_REG(PT_R24, gpr[24]); + CHECK_REG(PT_R25, gpr[25]); + CHECK_REG(PT_R26, gpr[26]); + 
CHECK_REG(PT_R27, gpr[27]); + CHECK_REG(PT_R28, gpr[28]); + CHECK_REG(PT_R29, gpr[29]); + CHECK_REG(PT_R30, gpr[30]); + CHECK_REG(PT_R31, gpr[31]); + CHECK_REG(PT_NIP, nip); + CHECK_REG(PT_MSR, msr); + CHECK_REG(PT_ORIG_R3, orig_gpr3); + CHECK_REG(PT_CTR, ctr); + CHECK_REG(PT_LNK, link); + CHECK_REG(PT_XER, xer); + CHECK_REG(PT_CCR, ccr); +#ifdef CONFIG_PPC64 + CHECK_REG(PT_SOFTE, softe); +#else + CHECK_REG(PT_MQ, mq); +#endif + CHECK_REG(PT_TRAP, trap); + CHECK_REG(PT_DAR, dar); + CHECK_REG(PT_DSISR, dsisr); + CHECK_REG(PT_RESULT, result); + #undef CHECK_REG + + BUILD_BUG_ON(PT_REGS_COUNT != sizeof(struct user_pt_regs) / sizeof(unsigned long)); + + /* + * PT_DSCR isn't a real reg, but it's important that it doesn't overlap the + * real registers. + */ + BUILD_BUG_ON(PT_DSCR < sizeof(struct user_pt_regs) / sizeof(unsigned long)); } diff --git a/arch/powerpc/kernel/rtas-proc.c b/arch/powerpc/kernel/rtas-proc.c index 487dcd8da4de..2d33f342a293 100644 --- a/arch/powerpc/kernel/rtas-proc.c +++ b/arch/powerpc/kernel/rtas-proc.c @@ -159,12 +159,12 @@ static int poweron_open(struct inode *inode, struct file *file) return single_open(file, ppc_rtas_poweron_show, NULL); } -static const struct file_operations ppc_rtas_poweron_operations = { - .open = poweron_open, - .read = seq_read, - .llseek = seq_lseek, - .write = ppc_rtas_poweron_write, - .release = single_release, +static const struct proc_ops ppc_rtas_poweron_proc_ops = { + .proc_open = poweron_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_write = ppc_rtas_poweron_write, + .proc_release = single_release, }; static int progress_open(struct inode *inode, struct file *file) @@ -172,12 +172,12 @@ static int progress_open(struct inode *inode, struct file *file) return single_open(file, ppc_rtas_progress_show, NULL); } -static const struct file_operations ppc_rtas_progress_operations = { - .open = progress_open, - .read = seq_read, - .llseek = seq_lseek, - .write = ppc_rtas_progress_write, - .release = 
single_release, +static const struct proc_ops ppc_rtas_progress_proc_ops = { + .proc_open = progress_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_write = ppc_rtas_progress_write, + .proc_release = single_release, }; static int clock_open(struct inode *inode, struct file *file) @@ -185,12 +185,12 @@ static int clock_open(struct inode *inode, struct file *file) return single_open(file, ppc_rtas_clock_show, NULL); } -static const struct file_operations ppc_rtas_clock_operations = { - .open = clock_open, - .read = seq_read, - .llseek = seq_lseek, - .write = ppc_rtas_clock_write, - .release = single_release, +static const struct proc_ops ppc_rtas_clock_proc_ops = { + .proc_open = clock_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_write = ppc_rtas_clock_write, + .proc_release = single_release, }; static int tone_freq_open(struct inode *inode, struct file *file) @@ -198,12 +198,12 @@ static int tone_freq_open(struct inode *inode, struct file *file) return single_open(file, ppc_rtas_tone_freq_show, NULL); } -static const struct file_operations ppc_rtas_tone_freq_operations = { - .open = tone_freq_open, - .read = seq_read, - .llseek = seq_lseek, - .write = ppc_rtas_tone_freq_write, - .release = single_release, +static const struct proc_ops ppc_rtas_tone_freq_proc_ops = { + .proc_open = tone_freq_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_write = ppc_rtas_tone_freq_write, + .proc_release = single_release, }; static int tone_volume_open(struct inode *inode, struct file *file) @@ -211,12 +211,12 @@ static int tone_volume_open(struct inode *inode, struct file *file) return single_open(file, ppc_rtas_tone_volume_show, NULL); } -static const struct file_operations ppc_rtas_tone_volume_operations = { - .open = tone_volume_open, - .read = seq_read, - .llseek = seq_lseek, - .write = ppc_rtas_tone_volume_write, - .release = single_release, +static const struct proc_ops ppc_rtas_tone_volume_proc_ops = { + .proc_open = 
tone_volume_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_write = ppc_rtas_tone_volume_write, + .proc_release = single_release, }; static int ppc_rtas_find_all_sensors(void); @@ -238,17 +238,17 @@ static int __init proc_rtas_init(void) return -ENODEV; proc_create("powerpc/rtas/progress", 0644, NULL, - &ppc_rtas_progress_operations); + &ppc_rtas_progress_proc_ops); proc_create("powerpc/rtas/clock", 0644, NULL, - &ppc_rtas_clock_operations); + &ppc_rtas_clock_proc_ops); proc_create("powerpc/rtas/poweron", 0644, NULL, - &ppc_rtas_poweron_operations); + &ppc_rtas_poweron_proc_ops); proc_create_single("powerpc/rtas/sensors", 0444, NULL, ppc_rtas_sensors_show); proc_create("powerpc/rtas/frequency", 0644, NULL, - &ppc_rtas_tone_freq_operations); + &ppc_rtas_tone_freq_proc_ops); proc_create("powerpc/rtas/volume", 0644, NULL, - &ppc_rtas_tone_volume_operations); + &ppc_rtas_tone_volume_proc_ops); proc_create_single("powerpc/rtas/rmo_buffer", 0400, NULL, ppc_rtas_rmo_buf_show); return 0; diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index 5faf0a64c92b..c5fa251b8950 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -16,6 +16,7 @@ #include <linux/capability.h> #include <linux/delay.h> #include <linux/cpu.h> +#include <linux/sched.h> #include <linux/smp.h> #include <linux/completion.h> #include <linux/cpumask.h> @@ -871,15 +872,17 @@ static int rtas_cpu_state_change_mask(enum rtas_cpu_state state, return 0; for_each_cpu(cpu, cpus) { + struct device *dev = get_cpu_device(cpu); + switch (state) { case DOWN: - cpuret = cpu_down(cpu); + cpuret = device_offline(dev); break; case UP: - cpuret = cpu_up(cpu); + cpuret = device_online(dev); break; } - if (cpuret) { + if (cpuret < 0) { pr_debug("%s: cpu_%s for cpu#%d returned %d.\n", __func__, ((state == UP) ? 
"up" : "down"), @@ -896,6 +899,7 @@ static int rtas_cpu_state_change_mask(enum rtas_cpu_state state, cpumask_clear_cpu(cpu, cpus); } } + cond_resched(); } return ret; @@ -922,13 +926,11 @@ int rtas_online_cpus_mask(cpumask_var_t cpus) return ret; } -EXPORT_SYMBOL(rtas_online_cpus_mask); int rtas_offline_cpus_mask(cpumask_var_t cpus) { return rtas_cpu_state_change_mask(DOWN, cpus); } -EXPORT_SYMBOL(rtas_offline_cpus_mask); int rtas_ibm_suspend_me(u64 handle) { @@ -968,6 +970,8 @@ int rtas_ibm_suspend_me(u64 handle) data.token = rtas_token("ibm,suspend-me"); data.complete = &done; + lock_device_hotplug(); + /* All present CPUs must be online */ cpumask_andnot(offline_mask, cpu_present_mask, cpu_online_mask); cpuret = rtas_online_cpus_mask(offline_mask); @@ -1006,6 +1010,7 @@ out_hotplug_enable: __func__); out: + unlock_device_hotplug(); free_cpumask_var(offline_mask); return atomic_read(&data.error); } diff --git a/arch/powerpc/kernel/rtas_flash.c b/arch/powerpc/kernel/rtas_flash.c index 84f794782c62..a99179d83538 100644 --- a/arch/powerpc/kernel/rtas_flash.c +++ b/arch/powerpc/kernel/rtas_flash.c @@ -655,7 +655,7 @@ struct rtas_flash_file { const char *filename; const char *rtas_call_name; int *status; - const struct file_operations fops; + const struct proc_ops ops; }; static const struct rtas_flash_file rtas_flash_files[] = { @@ -663,36 +663,36 @@ static const struct rtas_flash_file rtas_flash_files[] = { .filename = "powerpc/rtas/" FIRMWARE_FLASH_NAME, .rtas_call_name = "ibm,update-flash-64-and-reboot", .status = &rtas_update_flash_data.status, - .fops.read = rtas_flash_read_msg, - .fops.write = rtas_flash_write, - .fops.release = rtas_flash_release, - .fops.llseek = default_llseek, + .ops.proc_read = rtas_flash_read_msg, + .ops.proc_write = rtas_flash_write, + .ops.proc_release = rtas_flash_release, + .ops.proc_lseek = default_llseek, }, { .filename = "powerpc/rtas/" FIRMWARE_UPDATE_NAME, .rtas_call_name = "ibm,update-flash-64-and-reboot", .status = 
&rtas_update_flash_data.status, - .fops.read = rtas_flash_read_num, - .fops.write = rtas_flash_write, - .fops.release = rtas_flash_release, - .fops.llseek = default_llseek, + .ops.proc_read = rtas_flash_read_num, + .ops.proc_write = rtas_flash_write, + .ops.proc_release = rtas_flash_release, + .ops.proc_lseek = default_llseek, }, { .filename = "powerpc/rtas/" VALIDATE_FLASH_NAME, .rtas_call_name = "ibm,validate-flash-image", .status = &rtas_validate_flash_data.status, - .fops.read = validate_flash_read, - .fops.write = validate_flash_write, - .fops.release = validate_flash_release, - .fops.llseek = default_llseek, + .ops.proc_read = validate_flash_read, + .ops.proc_write = validate_flash_write, + .ops.proc_release = validate_flash_release, + .ops.proc_lseek = default_llseek, }, { .filename = "powerpc/rtas/" MANAGE_FLASH_NAME, .rtas_call_name = "ibm,manage-flash-image", .status = &rtas_manage_flash_data.status, - .fops.read = manage_flash_read, - .fops.write = manage_flash_write, - .fops.llseek = default_llseek, + .ops.proc_read = manage_flash_read, + .ops.proc_write = manage_flash_write, + .ops.proc_lseek = default_llseek, } }; @@ -723,7 +723,7 @@ static int __init rtas_flash_init(void) const struct rtas_flash_file *f = &rtas_flash_files[i]; int token; - if (!proc_create(f->filename, 0600, NULL, &f->fops)) + if (!proc_create(f->filename, 0600, NULL, &f->ops)) goto enomem; /* diff --git a/arch/powerpc/kernel/rtasd.c b/arch/powerpc/kernel/rtasd.c index 8d02e047f96a..89b798f8f656 100644 --- a/arch/powerpc/kernel/rtasd.c +++ b/arch/powerpc/kernel/rtasd.c @@ -385,12 +385,12 @@ static __poll_t rtas_log_poll(struct file *file, poll_table * wait) return 0; } -static const struct file_operations proc_rtas_log_operations = { - .read = rtas_log_read, - .poll = rtas_log_poll, - .open = rtas_log_open, - .release = rtas_log_release, - .llseek = noop_llseek, +static const struct proc_ops rtas_log_proc_ops = { + .proc_read = rtas_log_read, + .proc_poll = rtas_log_poll, + 
.proc_open = rtas_log_open, + .proc_release = rtas_log_release, + .proc_lseek = noop_llseek, }; static int enable_surveillance(int timeout) @@ -572,7 +572,7 @@ static int __init rtas_init(void) return -ENODEV; entry = proc_create("powerpc/rtas/error_log", 0400, NULL, - &proc_rtas_log_operations); + &rtas_log_proc_ops); if (!entry) printk(KERN_ERR "Failed to create error_log proc entry\n"); diff --git a/arch/powerpc/kernel/secure_boot.c b/arch/powerpc/kernel/secure_boot.c new file mode 100644 index 000000000000..4b982324d368 --- /dev/null +++ b/arch/powerpc/kernel/secure_boot.c @@ -0,0 +1,50 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2019 IBM Corporation + * Author: Nayna Jain + */ +#include <linux/types.h> +#include <linux/of.h> +#include <asm/secure_boot.h> + +static struct device_node *get_ppc_fw_sb_node(void) +{ + static const struct of_device_id ids[] = { + { .compatible = "ibm,secureboot", }, + { .compatible = "ibm,secureboot-v1", }, + { .compatible = "ibm,secureboot-v2", }, + {}, + }; + + return of_find_matching_node(NULL, ids); +} + +bool is_ppc_secureboot_enabled(void) +{ + struct device_node *node; + bool enabled = false; + + node = get_ppc_fw_sb_node(); + enabled = of_property_read_bool(node, "os-secureboot-enforcing"); + + of_node_put(node); + + pr_info("Secure boot mode %s\n", enabled ? "enabled" : "disabled"); + + return enabled; +} + +bool is_ppc_trustedboot_enabled(void) +{ + struct device_node *node; + bool enabled = false; + + node = get_ppc_fw_sb_node(); + enabled = of_property_read_bool(node, "trusted-enabled"); + + of_node_put(node); + + pr_info("Trusted boot mode %s\n", enabled ? 
"enabled" : "disabled"); + + return enabled; +} diff --git a/arch/powerpc/kernel/security.c b/arch/powerpc/kernel/security.c index e1c9cf079503..bd70f5be1c27 100644 --- a/arch/powerpc/kernel/security.c +++ b/arch/powerpc/kernel/security.c @@ -16,7 +16,7 @@ #include <asm/setup.h> -unsigned long powerpc_security_features __read_mostly = SEC_FTR_DEFAULT; +u64 powerpc_security_features __read_mostly = SEC_FTR_DEFAULT; enum count_cache_flush_type { COUNT_CACHE_FLUSH_NONE = 0x1, @@ -24,11 +24,12 @@ enum count_cache_flush_type { COUNT_CACHE_FLUSH_HW = 0x4, }; static enum count_cache_flush_type count_cache_flush_type = COUNT_CACHE_FLUSH_NONE; +static bool link_stack_flush_enabled; bool barrier_nospec_enabled; static bool no_nospec; static bool btb_flush_enabled; -#ifdef CONFIG_PPC_FSL_BOOK3E +#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_BOOK3S_64) static bool no_spectrev2; #endif @@ -94,13 +95,14 @@ static int barrier_nospec_get(void *data, u64 *val) return 0; } -DEFINE_SIMPLE_ATTRIBUTE(fops_barrier_nospec, - barrier_nospec_get, barrier_nospec_set, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_barrier_nospec, barrier_nospec_get, + barrier_nospec_set, "%llu\n"); static __init int barrier_nospec_debugfs_init(void) { - debugfs_create_file("barrier_nospec", 0600, powerpc_debugfs_root, NULL, - &fops_barrier_nospec); + debugfs_create_file_unsafe("barrier_nospec", 0600, + powerpc_debugfs_root, NULL, + &fops_barrier_nospec); return 0; } device_initcall(barrier_nospec_debugfs_init); @@ -108,13 +110,13 @@ device_initcall(barrier_nospec_debugfs_init); static __init int security_feature_debugfs_init(void) { debugfs_create_x64("security_features", 0400, powerpc_debugfs_root, - (u64 *)&powerpc_security_features); + &powerpc_security_features); return 0; } device_initcall(security_feature_debugfs_init); #endif /* CONFIG_DEBUG_FS */ -#ifdef CONFIG_PPC_FSL_BOOK3E +#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_BOOK3S_64) static int __init handle_nospectre_v2(char *p) { 
no_spectrev2 = true; @@ -122,6 +124,9 @@ static int __init handle_nospectre_v2(char *p) return 0; } early_param("nospectre_v2", handle_nospectre_v2); +#endif /* CONFIG_PPC_FSL_BOOK3E || CONFIG_PPC_BOOK3S_64 */ + +#ifdef CONFIG_PPC_FSL_BOOK3E void setup_spectre_v2(void) { if (no_spectrev2 || cpu_mitigations_off()) @@ -138,32 +143,33 @@ ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, cha thread_priv = security_ftr_enabled(SEC_FTR_L1D_THREAD_PRIV); - if (rfi_flush || thread_priv) { + if (rfi_flush) { struct seq_buf s; seq_buf_init(&s, buf, PAGE_SIZE - 1); - seq_buf_printf(&s, "Mitigation: "); - - if (rfi_flush) - seq_buf_printf(&s, "RFI Flush"); - - if (rfi_flush && thread_priv) - seq_buf_printf(&s, ", "); - + seq_buf_printf(&s, "Mitigation: RFI Flush"); if (thread_priv) - seq_buf_printf(&s, "L1D private per thread"); + seq_buf_printf(&s, ", L1D private per thread"); seq_buf_printf(&s, "\n"); return s.len; } + if (thread_priv) + return sprintf(buf, "Vulnerable: L1D private per thread\n"); + if (!security_ftr_enabled(SEC_FTR_L1D_FLUSH_HV) && !security_ftr_enabled(SEC_FTR_L1D_FLUSH_PR)) return sprintf(buf, "Not affected\n"); return sprintf(buf, "Vulnerable\n"); } + +ssize_t cpu_show_l1tf(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_meltdown(dev, attr, buf); +} #endif ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute *attr, char *buf) @@ -209,11 +215,19 @@ ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, c if (ccd) seq_buf_printf(&s, "Indirect branch cache disabled"); + + if (link_stack_flush_enabled) + seq_buf_printf(&s, ", Software link stack flush"); + } else if (count_cache_flush_type != COUNT_CACHE_FLUSH_NONE) { seq_buf_printf(&s, "Mitigation: Software count cache flush"); if (count_cache_flush_type == COUNT_CACHE_FLUSH_HW) seq_buf_printf(&s, " (hardware accelerated)"); + + if (link_stack_flush_enabled) + seq_buf_printf(&s, ", Software link stack 
flush"); + } else if (btb_flush_enabled) { seq_buf_printf(&s, "Mitigation: Branch predictor state flush"); } else { @@ -364,28 +378,61 @@ static int stf_barrier_get(void *data, u64 *val) return 0; } -DEFINE_SIMPLE_ATTRIBUTE(fops_stf_barrier, stf_barrier_get, stf_barrier_set, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_stf_barrier, stf_barrier_get, stf_barrier_set, + "%llu\n"); static __init int stf_barrier_debugfs_init(void) { - debugfs_create_file("stf_barrier", 0600, powerpc_debugfs_root, NULL, &fops_stf_barrier); + debugfs_create_file_unsafe("stf_barrier", 0600, powerpc_debugfs_root, + NULL, &fops_stf_barrier); return 0; } device_initcall(stf_barrier_debugfs_init); #endif /* CONFIG_DEBUG_FS */ +static void no_count_cache_flush(void) +{ + count_cache_flush_type = COUNT_CACHE_FLUSH_NONE; + pr_info("count-cache-flush: software flush disabled.\n"); +} + static void toggle_count_cache_flush(bool enable) { - if (!enable || !security_ftr_enabled(SEC_FTR_FLUSH_COUNT_CACHE)) { + if (!security_ftr_enabled(SEC_FTR_FLUSH_COUNT_CACHE) && + !security_ftr_enabled(SEC_FTR_FLUSH_LINK_STACK)) + enable = false; + + if (!enable) { patch_instruction_site(&patch__call_flush_count_cache, PPC_INST_NOP); - count_cache_flush_type = COUNT_CACHE_FLUSH_NONE; - pr_info("count-cache-flush: software flush disabled.\n"); +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + patch_instruction_site(&patch__call_kvm_flush_link_stack, PPC_INST_NOP); +#endif + pr_info("link-stack-flush: software flush disabled.\n"); + link_stack_flush_enabled = false; + no_count_cache_flush(); return; } + // This enables the branch from _switch to flush_count_cache patch_branch_site(&patch__call_flush_count_cache, (u64)&flush_count_cache, BRANCH_SET_LINK); +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + // This enables the branch from guest_exit_cont to kvm_flush_link_stack + patch_branch_site(&patch__call_kvm_flush_link_stack, + (u64)&kvm_flush_link_stack, BRANCH_SET_LINK); +#endif + + pr_info("link-stack-flush: software flush 
enabled.\n"); + link_stack_flush_enabled = true; + + // If we just need to flush the link stack, patch an early return + if (!security_ftr_enabled(SEC_FTR_FLUSH_COUNT_CACHE)) { + patch_instruction_site(&patch__flush_link_stack_return, PPC_INST_BLR); + no_count_cache_flush(); + return; + } + if (!security_ftr_enabled(SEC_FTR_BCCTR_FLUSH_ASSIST)) { count_cache_flush_type = COUNT_CACHE_FLUSH_SW; pr_info("count-cache-flush: full software flush sequence enabled.\n"); @@ -399,7 +446,26 @@ static void toggle_count_cache_flush(bool enable) void setup_count_cache_flush(void) { - toggle_count_cache_flush(true); + bool enable = true; + + if (no_spectrev2 || cpu_mitigations_off()) { + if (security_ftr_enabled(SEC_FTR_BCCTRL_SERIALISED) || + security_ftr_enabled(SEC_FTR_COUNT_CACHE_DISABLED)) + pr_warn("Spectre v2 mitigations not fully under software control, can't disable\n"); + + enable = false; + } + + /* + * There's no firmware feature flag/hypervisor bit to tell us we need to + * flush the link stack on context switch. So we set it here if we see + * either of the Spectre v2 mitigations that aim to protect userspace. 
+ */ + if (security_ftr_enabled(SEC_FTR_COUNT_CACHE_DISABLED) || + security_ftr_enabled(SEC_FTR_FLUSH_COUNT_CACHE)) + security_ftr_set(SEC_FTR_FLUSH_LINK_STACK); + + toggle_count_cache_flush(enable); } #ifdef CONFIG_DEBUG_FS @@ -429,13 +495,14 @@ static int count_cache_flush_get(void *data, u64 *val) return 0; } -DEFINE_SIMPLE_ATTRIBUTE(fops_count_cache_flush, count_cache_flush_get, - count_cache_flush_set, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_count_cache_flush, count_cache_flush_get, + count_cache_flush_set, "%llu\n"); static __init int count_cache_flush_debugfs_init(void) { - debugfs_create_file("count_cache_flush", 0600, powerpc_debugfs_root, - NULL, &fops_count_cache_flush); + debugfs_create_file_unsafe("count_cache_flush", 0600, + powerpc_debugfs_root, NULL, + &fops_count_cache_flush); return 0; } device_initcall(count_cache_flush_debugfs_init); diff --git a/arch/powerpc/kernel/secvar-ops.c b/arch/powerpc/kernel/secvar-ops.c new file mode 100644 index 000000000000..6a29777d6a2d --- /dev/null +++ b/arch/powerpc/kernel/secvar-ops.c @@ -0,0 +1,17 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2019 IBM Corporation + * Author: Nayna Jain + * + * This file initializes secvar operations for PowerPC Secureboot + */ + +#include <linux/cache.h> +#include <asm/secvar.h> + +const struct secvar_operations *secvar_ops __ro_after_init; + +void set_secvar_ops(const struct secvar_operations *ops) +{ + secvar_ops = ops; +} diff --git a/arch/powerpc/kernel/secvar-sysfs.c b/arch/powerpc/kernel/secvar-sysfs.c new file mode 100644 index 000000000000..a0a78aba2083 --- /dev/null +++ b/arch/powerpc/kernel/secvar-sysfs.c @@ -0,0 +1,248 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright (C) 2019 IBM Corporation <nayna@linux.ibm.com> + * + * This code exposes secure variables to user via sysfs + */ + +#define pr_fmt(fmt) "secvar-sysfs: "fmt + +#include <linux/slab.h> +#include <linux/compat.h> +#include <linux/string.h> +#include <linux/of.h> +#include 
<asm/secvar.h> + +#define NAME_MAX_SIZE 1024 + +static struct kobject *secvar_kobj; +static struct kset *secvar_kset; + +static ssize_t format_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + ssize_t rc = 0; + struct device_node *node; + const char *format; + + node = of_find_compatible_node(NULL, NULL, "ibm,secvar-backend"); + if (!of_device_is_available(node)) + return -ENODEV; + + rc = of_property_read_string(node, "format", &format); + if (rc) + return rc; + + rc = sprintf(buf, "%s\n", format); + + of_node_put(node); + + return rc; +} + + +static ssize_t size_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + uint64_t dsize; + int rc; + + rc = secvar_ops->get(kobj->name, strlen(kobj->name) + 1, NULL, &dsize); + if (rc) { + pr_err("Error retrieving %s variable size %d\n", kobj->name, + rc); + return rc; + } + + return sprintf(buf, "%llu\n", dsize); +} + +static ssize_t data_read(struct file *filep, struct kobject *kobj, + struct bin_attribute *attr, char *buf, loff_t off, + size_t count) +{ + uint64_t dsize; + char *data; + int rc; + + rc = secvar_ops->get(kobj->name, strlen(kobj->name) + 1, NULL, &dsize); + if (rc) { + pr_err("Error getting %s variable size %d\n", kobj->name, rc); + return rc; + } + pr_debug("dsize is %llu\n", dsize); + + data = kzalloc(dsize, GFP_KERNEL); + if (!data) + return -ENOMEM; + + rc = secvar_ops->get(kobj->name, strlen(kobj->name) + 1, data, &dsize); + if (rc) { + pr_err("Error getting %s variable %d\n", kobj->name, rc); + goto data_fail; + } + + rc = memory_read_from_buffer(buf, count, &off, data, dsize); + +data_fail: + kfree(data); + return rc; +} + +static ssize_t update_write(struct file *filep, struct kobject *kobj, + struct bin_attribute *attr, char *buf, loff_t off, + size_t count) +{ + int rc; + + pr_debug("count is %ld\n", count); + rc = secvar_ops->set(kobj->name, strlen(kobj->name) + 1, buf, count); + if (rc) { + pr_err("Error setting the %s variable %d\n", kobj->name, rc); 
+ return rc; + } + + return count; +} + +static struct kobj_attribute format_attr = __ATTR_RO(format); + +static struct kobj_attribute size_attr = __ATTR_RO(size); + +static struct bin_attribute data_attr = __BIN_ATTR_RO(data, 0); + +static struct bin_attribute update_attr = __BIN_ATTR_WO(update, 0); + +static struct bin_attribute *secvar_bin_attrs[] = { + &data_attr, + &update_attr, + NULL, +}; + +static struct attribute *secvar_attrs[] = { + &size_attr.attr, + NULL, +}; + +static const struct attribute_group secvar_attr_group = { + .attrs = secvar_attrs, + .bin_attrs = secvar_bin_attrs, +}; +__ATTRIBUTE_GROUPS(secvar_attr); + +static struct kobj_type secvar_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = secvar_attr_groups, +}; + +static int update_kobj_size(void) +{ + + struct device_node *node; + u64 varsize; + int rc = 0; + + node = of_find_compatible_node(NULL, NULL, "ibm,secvar-backend"); + if (!of_device_is_available(node)) { + rc = -ENODEV; + goto out; + } + + rc = of_property_read_u64(node, "max-var-size", &varsize); + if (rc) + goto out; + + data_attr.size = varsize; + update_attr.size = varsize; + +out: + of_node_put(node); + + return rc; +} + +static int secvar_sysfs_load(void) +{ + char *name; + uint64_t namesize = 0; + struct kobject *kobj; + int rc; + + name = kzalloc(NAME_MAX_SIZE, GFP_KERNEL); + if (!name) + return -ENOMEM; + + do { + rc = secvar_ops->get_next(name, &namesize, NAME_MAX_SIZE); + if (rc) { + if (rc != -ENOENT) + pr_err("error getting secvar from firmware %d\n", + rc); + break; + } + + kobj = kzalloc(sizeof(*kobj), GFP_KERNEL); + if (!kobj) { + rc = -ENOMEM; + break; + } + + kobject_init(kobj, &secvar_ktype); + + rc = kobject_add(kobj, &secvar_kset->kobj, "%s", name); + if (rc) { + pr_warn("kobject_add error %d for attribute: %s\n", rc, + name); + kobject_put(kobj); + kobj = NULL; + } + + if (kobj) + kobject_uevent(kobj, KOBJ_ADD); + + } while (!rc); + + kfree(name); + return rc; +} + +static int secvar_sysfs_init(void) 
+{ + int rc; + + if (!secvar_ops) { + pr_warn("secvar: failed to retrieve secvar operations.\n"); + return -ENODEV; + } + + secvar_kobj = kobject_create_and_add("secvar", firmware_kobj); + if (!secvar_kobj) { + pr_err("secvar: Failed to create firmware kobj\n"); + return -ENOMEM; + } + + rc = sysfs_create_file(secvar_kobj, &format_attr.attr); + if (rc) { + kobject_put(secvar_kobj); + return -ENOMEM; + } + + secvar_kset = kset_create_and_add("vars", NULL, secvar_kobj); + if (!secvar_kset) { + pr_err("secvar: sysfs kobject registration failed.\n"); + kobject_put(secvar_kobj); + return -ENOMEM; + } + + rc = update_kobj_size(); + if (rc) { + pr_err("Cannot read the size of the attribute\n"); + return rc; + } + + secvar_sysfs_load(); + + return 0; +} + +late_initcall(secvar_sysfs_init); diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 1f8db666468d..7f8c890360fe 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -715,8 +715,28 @@ static struct notifier_block ppc_panic_block = { .priority = INT_MIN /* may not return; must be done last */ }; +/* + * Dump out kernel offset information on panic. 
+ */ +static int dump_kernel_offset(struct notifier_block *self, unsigned long v, + void *p) +{ + pr_emerg("Kernel Offset: 0x%lx from 0x%lx\n", + kaslr_offset(), KERNELBASE); + + return 0; +} + +static struct notifier_block kernel_offset_notifier = { + .notifier_call = dump_kernel_offset +}; + void __init setup_panic(void) { + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE) && kaslr_offset() > 0) + atomic_notifier_chain_register(&panic_notifier_list, + &kernel_offset_notifier); + /* PPC64 always does a hard irq disable in its panic handler */ if (!IS_ENABLED(CONFIG_PPC64) && !ppc_md.panic) return; @@ -778,12 +798,6 @@ void ppc_printk_progress(char *s, unsigned short hex) pr_info("%s\n", s); } -void arch_setup_pdev_archdata(struct platform_device *pdev) -{ - pdev->archdata.dma_mask = DMA_BIT_MASK(32); - pdev->dev.dma_mask = &pdev->archdata.dma_mask; -} - static __init void print_system_info(void) { pr_info("-----------------------------------------------------\n"); @@ -806,9 +820,15 @@ static __init void print_system_info(void) pr_info("mmu_features = 0x%08x\n", cur_cpu_spec->mmu_features); #ifdef CONFIG_PPC64 pr_info("firmware_features = 0x%016lx\n", powerpc_firmware_features); +#ifdef CONFIG_PPC_BOOK3S + pr_info("vmalloc start = 0x%lx\n", KERN_VIRT_START); + pr_info("IO start = 0x%lx\n", KERN_IO_START); + pr_info("vmemmap start = 0x%lx\n", (unsigned long)vmemmap); +#endif #endif - print_system_hash_info(); + if (!early_radix_enabled()) + print_system_hash_info(); if (PHYSICAL_START > 0) pr_info("physical_start = 0x%llx\n", @@ -929,9 +949,6 @@ void __init setup_arch(char **cmdline_p) early_memtest(min_low_pfn << PAGE_SHIFT, max_low_pfn << PAGE_SHIFT); - if (IS_ENABLED(CONFIG_DUMMY_CONSOLE)) - conswitchp = &dummy_con; - if (ppc_md.setup_arch) ppc_md.setup_arch(); diff --git a/arch/powerpc/kernel/setup.h b/arch/powerpc/kernel/setup.h index c82577c4b15d..2dd0d9cb5a20 100644 --- a/arch/powerpc/kernel/setup.h +++ b/arch/powerpc/kernel/setup.h @@ -35,7 +35,7 @@ void 
exc_lvl_early_init(void); static inline void exc_lvl_early_init(void) { }; #endif -#ifdef CONFIG_PPC64 +#if defined(CONFIG_PPC64) || defined(CONFIG_VMAP_STACK) void emergency_stack_init(void); #else static inline void emergency_stack_init(void) { }; diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c index 94517e4a2723..5b49b26eb154 100644 --- a/arch/powerpc/kernel/setup_32.c +++ b/arch/powerpc/kernel/setup_32.c @@ -44,6 +44,7 @@ #include <asm/asm-prototypes.h> #include <asm/kdump.h> #include <asm/feature-fixups.h> +#include <asm/early_ioremap.h> #include "setup.h" @@ -80,6 +81,8 @@ notrace void __init machine_init(u64 dt_ptr) /* Configure static keys first, now that we're relocated. */ setup_feature_keys(); + early_ioremap_setup(); + /* Enable early debugging if any specified (see udbg.h) */ udbg_early_init(); @@ -137,7 +140,7 @@ arch_initcall(ppc_init); static void *__init alloc_stack(void) { - void *ptr = memblock_alloc(THREAD_SIZE, THREAD_SIZE); + void *ptr = memblock_alloc(THREAD_SIZE, THREAD_ALIGN); if (!ptr) panic("cannot allocate %d bytes for stack at %pS\n", @@ -150,6 +153,9 @@ void __init irqstack_early_init(void) { unsigned int i; + if (IS_ENABLED(CONFIG_VMAP_STACK)) + return; + /* interrupt stacks must be in lowmem, we get that for free on ppc32 * as the memblock is limited to lowmem by default */ for_each_possible_cpu(i) { @@ -158,6 +164,18 @@ void __init irqstack_early_init(void) } } +#ifdef CONFIG_VMAP_STACK +void *emergency_ctx[NR_CPUS] __ro_after_init; + +void __init emergency_stack_init(void) +{ + unsigned int i; + + for_each_possible_cpu(i) + emergency_ctx[i] = alloc_stack(); +} +#endif + #if defined(CONFIG_BOOKE) || defined(CONFIG_40x) void __init exc_lvl_early_init(void) { @@ -206,6 +224,6 @@ __init void initialize_cache_info(void) dcache_bsize = cur_cpu_spec->dcache_bsize; icache_bsize = cur_cpu_spec->icache_bsize; ucache_bsize = 0; - if (cpu_has_feature(CPU_FTR_UNIFIED_ID_CACHE)) + if 
(IS_ENABLED(CONFIG_PPC_BOOK3S_601) || IS_ENABLED(CONFIG_E200)) ucache_bsize = icache_bsize = dcache_bsize; } diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 44b4c432a273..e05e6dd67ae6 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -65,15 +65,10 @@ #include <asm/hw_irq.h> #include <asm/feature-fixups.h> #include <asm/kup.h> +#include <asm/early_ioremap.h> #include "setup.h" -#ifdef DEBUG -#define DBG(fmt...) udbg_printf(fmt) -#else -#define DBG(fmt...) -#endif - int spinning_secondaries; u64 ppc64_pft_size; @@ -305,7 +300,7 @@ void __init early_setup(unsigned long dt_ptr) /* Enable early debugging if any specified (see udbg.h) */ udbg_early_init(); - DBG(" -> early_setup(), dt_ptr: 0x%lx\n", dt_ptr); + udbg_printf(" -> %s(), dt_ptr: 0x%lx\n", __func__, dt_ptr); /* * Do early initialization using the flattened device @@ -338,6 +333,8 @@ void __init early_setup(unsigned long dt_ptr) apply_feature_fixups(); setup_feature_keys(); + early_ioremap_setup(); + /* Initialize the hash table or TLB handling */ early_init_mmu(); @@ -362,11 +359,11 @@ void __init early_setup(unsigned long dt_ptr) */ this_cpu_enable_ftrace(); - DBG(" <- early_setup()\n"); + udbg_printf(" <- %s()\n", __func__); #ifdef CONFIG_PPC_EARLY_DEBUG_BOOTX /* - * This needs to be done *last* (after the above DBG() even) + * This needs to be done *last* (after the above udbg_printf() even) * * Right after we return from this function, we turn on the MMU * which means the real-mode access trick that btext does will @@ -436,8 +433,6 @@ void smp_release_cpus(void) if (!use_spinloop()) return; - DBG(" -> smp_release_cpus()\n"); - /* All secondary cpus are spinning on a common spinloop, release them * all now so they can start to spin on their individual paca * spinloops. 
For non SMP kernels, the secondary cpus never get out @@ -456,9 +451,7 @@ void smp_release_cpus(void) break; udelay(1); } - DBG("spinning_secondaries = %d\n", spinning_secondaries); - - DBG(" <- smp_release_cpus()\n"); + pr_debug("spinning_secondaries = %d\n", spinning_secondaries); } #endif /* CONFIG_SMP || CONFIG_KEXEC_CORE */ @@ -551,8 +544,6 @@ void __init initialize_cache_info(void) struct device_node *cpu = NULL, *l2, *l3 = NULL; u32 pvr; - DBG(" -> initialize_cache_info()\n"); - /* * All shipping POWER8 machines have a firmware bug that * puts incorrect information in the device-tree. This will @@ -576,10 +567,10 @@ void __init initialize_cache_info(void) */ if (cpu) { if (!parse_cache_info(cpu, false, &ppc64_caches.l1d)) - DBG("Argh, can't find dcache properties !\n"); + pr_warn("Argh, can't find dcache properties !\n"); if (!parse_cache_info(cpu, true, &ppc64_caches.l1i)) - DBG("Argh, can't find icache properties !\n"); + pr_warn("Argh, can't find icache properties !\n"); /* * Try to find the L2 and L3 if any. Assume they are @@ -604,8 +595,6 @@ void __init initialize_cache_info(void) cur_cpu_spec->dcache_bsize = dcache_bsize; cur_cpu_spec->icache_bsize = icache_bsize; - - DBG(" <- initialize_cache_info()\n"); } /* @@ -644,7 +633,7 @@ static void *__init alloc_stack(unsigned long limit, int cpu) BUILD_BUG_ON(STACK_INT_FRAME_SIZE % 16); - ptr = memblock_alloc_try_nid(THREAD_SIZE, THREAD_SIZE, + ptr = memblock_alloc_try_nid(THREAD_SIZE, THREAD_ALIGN, MEMBLOCK_LOW_LIMIT, limit, early_cpu_to_node(cpu)); if (!ptr) diff --git a/arch/powerpc/kernel/stacktrace.c b/arch/powerpc/kernel/stacktrace.c index 1e2276963f6d..e2a46cfed5fd 100644 --- a/arch/powerpc/kernel/stacktrace.c +++ b/arch/powerpc/kernel/stacktrace.c @@ -182,7 +182,7 @@ static int __save_stack_trace_tsk_reliable(struct task_struct *tsk, * FIXME: IMHO these tests do not belong in * arch-dependent code, they are generic. 
*/ - ip = ftrace_graph_ret_addr(tsk, &graph_idx, ip, NULL); + ip = ftrace_graph_ret_addr(tsk, &graph_idx, ip, stack); #ifdef CONFIG_KPROBES /* * Mark stacktraces with kretprobed functions on them diff --git a/arch/powerpc/kernel/syscalls.c b/arch/powerpc/kernel/syscalls.c index 3bfb3888e897..078608ec2e92 100644 --- a/arch/powerpc/kernel/syscalls.c +++ b/arch/powerpc/kernel/syscalls.c @@ -79,7 +79,7 @@ SYSCALL_DEFINE6(mmap, unsigned long, addr, size_t, len, * sys_select() with the appropriate args. -- Cort */ int -ppc_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct timeval __user *tvp) +ppc_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct __kernel_old_timeval __user *tvp) { if ( (unsigned long)n >= 4096 ) { @@ -89,7 +89,7 @@ ppc_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, s || __get_user(inp, ((fd_set __user * __user *)(buffer+1))) || __get_user(outp, ((fd_set __user * __user *)(buffer+2))) || __get_user(exp, ((fd_set __user * __user *)(buffer+3))) - || __get_user(tvp, ((struct timeval __user * __user *)(buffer+4)))) + || __get_user(tvp, ((struct __kernel_old_timeval __user * __user *)(buffer+4)))) return -EFAULT; } return sys_select(n, inp, outp, exp, tvp); diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index 43f736ed47f2..35b61bfc1b1a 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -517,3 +517,5 @@ 433 common fspick sys_fspick 434 common pidfd_open sys_pidfd_open 435 nospu clone3 ppc_clone3 +437 common openat2 sys_openat2 +438 common pidfd_getfd sys_pidfd_getfd diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c index e2147d7c9e72..80a676da11cb 100644 --- a/arch/powerpc/kernel/sysfs.c +++ b/arch/powerpc/kernel/sysfs.c @@ -19,6 +19,7 @@ #include <asm/smp.h> #include <asm/pmc.h> #include <asm/firmware.h> +#include <asm/svm.h> #include 
"cacheinfo.h" #include "setup.h" @@ -715,6 +716,23 @@ static struct device_attribute pa6t_attrs[] = { #endif /* HAS_PPC_PMC_PA6T */ #endif /* HAS_PPC_PMC_CLASSIC */ +#ifdef CONFIG_PPC_SVM +static ssize_t show_svm(struct device *dev, struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", is_secure_guest()); +} +static DEVICE_ATTR(svm, 0444, show_svm, NULL); + +static void create_svm_file(void) +{ + device_create_file(cpu_subsys.dev_root, &dev_attr_svm); +} +#else +static void create_svm_file(void) +{ +} +#endif /* CONFIG_PPC_SVM */ + static int register_cpu_online(unsigned int cpu) { struct cpu *c = &per_cpu(cpu_devices, cpu); @@ -1058,6 +1076,8 @@ static int __init topology_init(void) sysfs_create_dscr_default(); #endif /* CONFIG_PPC64 */ + create_svm_file(); + return 0; } subsys_initcall(topology_init); diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 694522308cd5..1168e8b37e30 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -232,7 +232,7 @@ static u64 scan_dispatch_log(u64 stop_tb) * Accumulate stolen time by scanning the dispatch trace log. * Called on entry from user mode. 
*/ -void accumulate_stolen_time(void) +void notrace accumulate_stolen_time(void) { u64 sst, ust; unsigned long save_irq_soft_mask = irq_soft_mask_return(); @@ -338,7 +338,7 @@ static unsigned long vtime_delta(struct task_struct *tsk, return stime; } -void vtime_account_system(struct task_struct *tsk) +void vtime_account_kernel(struct task_struct *tsk) { unsigned long stime, stime_scaled, steal_time; struct cpu_accounting_data *acct = get_accounting(tsk); @@ -366,7 +366,7 @@ void vtime_account_system(struct task_struct *tsk) #endif } } -EXPORT_SYMBOL_GPL(vtime_account_system); +EXPORT_SYMBOL_GPL(vtime_account_kernel); void vtime_account_idle(struct task_struct *tsk) { @@ -395,7 +395,7 @@ static void vtime_flush_scaled(struct task_struct *tsk, /* * Account the whole cputime accumulated in the paca * Must be called with interrupts disabled. - * Assumes that vtime_account_system/idle() has been called + * Assumes that vtime_account_kernel/idle() has been called * recently (i.e. since the last entry from usermode) so that * get_paca()->user_time_scaled is up to date. 
*/ @@ -885,7 +885,7 @@ static notrace u64 timebase_read(struct clocksource *cs) void update_vsyscall(struct timekeeper *tk) { - struct timespec xt; + struct timespec64 xt; struct clocksource *clock = tk->tkr_mono.clock; u32 mult = tk->tkr_mono.mult; u32 shift = tk->tkr_mono.shift; @@ -957,8 +957,10 @@ void update_vsyscall(struct timekeeper *tk) vdso_data->tb_to_xs = new_tb_to_xs; vdso_data->wtom_clock_sec = tk->wall_to_monotonic.tv_sec; vdso_data->wtom_clock_nsec = tk->wall_to_monotonic.tv_nsec; - vdso_data->stamp_xtime = xt; + vdso_data->stamp_xtime_sec = xt.tv_sec; + vdso_data->stamp_xtime_nsec = xt.tv_nsec; vdso_data->stamp_sec_fraction = frac_sec; + vdso_data->hrtimer_res = hrtimer_resolution; smp_wmb(); ++(vdso_data->tb_update_count); } diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c index be1ca98fce5c..7ea0ca044b65 100644 --- a/arch/powerpc/kernel/trace/ftrace.c +++ b/arch/powerpc/kernel/trace/ftrace.c @@ -944,7 +944,8 @@ int ftrace_disable_ftrace_graph_caller(void) * Hook the return address and push it in the stack of return addrs * in current thread info. Return the address we want to divert to. 
*/ -unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip) +unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip, + unsigned long sp) { unsigned long return_hooker; @@ -956,7 +957,7 @@ unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip) return_hooker = ppc_function_entry(return_to_handler); - if (!function_graph_enter(parent, ip, 0, NULL)) + if (!function_graph_enter(parent, ip, 0, (unsigned long *)sp)) parent = return_hooker; out: return parent; diff --git a/arch/powerpc/kernel/trace/ftrace_32.S b/arch/powerpc/kernel/trace/ftrace_32.S index 183f608efb81..e023ae59c429 100644 --- a/arch/powerpc/kernel/trace/ftrace_32.S +++ b/arch/powerpc/kernel/trace/ftrace_32.S @@ -50,6 +50,7 @@ _GLOBAL(ftrace_stub) #ifdef CONFIG_FUNCTION_GRAPH_TRACER _GLOBAL(ftrace_graph_caller) + addi r5, r1, 48 /* load r4 with local address */ lwz r4, 44(r1) subi r4, r4, MCOUNT_INSN_SIZE diff --git a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S index 74acbf16a666..f9fd5f743eba 100644 --- a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S +++ b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S @@ -294,6 +294,7 @@ _GLOBAL(ftrace_graph_caller) std r2, 24(r1) ld r2, PACATOC(r13) /* get kernel TOC in r2 */ + addi r5, r1, 112 mfctr r4 /* ftrace_caller has moved local addr here */ std r4, 40(r1) mflr r3 /* ftrace_caller has restored LR from stack */ diff --git a/arch/powerpc/kernel/trace/ftrace_64_pg.S b/arch/powerpc/kernel/trace/ftrace_64_pg.S index e41a7d13c99c..6708e24db0ab 100644 --- a/arch/powerpc/kernel/trace/ftrace_64_pg.S +++ b/arch/powerpc/kernel/trace/ftrace_64_pg.S @@ -41,6 +41,7 @@ _GLOBAL(ftrace_stub) #ifdef CONFIG_FUNCTION_GRAPH_TRACER _GLOBAL(ftrace_graph_caller) + addi r5, r1, 112 /* load r4 with local address */ ld r4, 128(r1) subi r4, r4, MCOUNT_INSN_SIZE diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 11caa0291254..82a3438300fd 100644 --- 
a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -250,15 +250,22 @@ static void oops_end(unsigned long flags, struct pt_regs *regs, } NOKPROBE_SYMBOL(oops_end); +static char *get_mmu_str(void) +{ + if (early_radix_enabled()) + return " MMU=Radix"; + if (early_mmu_has_feature(MMU_FTR_HPTE_TABLE)) + return " MMU=Hash"; + return ""; +} + static int __die(const char *str, struct pt_regs *regs, long err) { printk("Oops: %s, sig: %ld [#%d]\n", str, err, ++die_counter); - printk("%s PAGE_SIZE=%luK%s%s%s%s%s%s%s %s\n", + printk("%s PAGE_SIZE=%luK%s%s%s%s%s%s %s\n", IS_ENABLED(CONFIG_CPU_LITTLE_ENDIAN) ? "LE" : "BE", - PAGE_SIZE / 1024, - early_radix_enabled() ? " MMU=Radix" : "", - early_mmu_has_feature(MMU_FTR_HPTE_TABLE) ? " MMU=Hash" : "", + PAGE_SIZE / 1024, get_mmu_str(), IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "", IS_ENABLED(CONFIG_SMP) ? " SMP" : "", IS_ENABLED(CONFIG_SMP) ? (" NR_CPUS=" __stringify(NR_CPUS)) : "", @@ -472,6 +479,7 @@ void system_reset_exception(struct pt_regs *regs) if (debugger(regs)) goto out; + kmsg_dump(KMSG_DUMP_OOPS); /* * A system reset is a request to dump, so we always send * it through the crashdump code (if fadump or kdump are @@ -1629,6 +1637,15 @@ void StackOverflow(struct pt_regs *regs) panic("kernel stack overflow"); } +void stack_overflow_exception(struct pt_regs *regs) +{ + enum ctx_state prev_state = exception_enter(); + + die("Kernel stack overflow", regs, SIGSEGV); + + exception_exit(prev_state); +} + void kernel_fp_unavailable_exception(struct pt_regs *regs) { enum ctx_state prev_state = exception_enter(); diff --git a/arch/powerpc/kernel/ucall.S b/arch/powerpc/kernel/ucall.S new file mode 100644 index 000000000000..07296bc39166 --- /dev/null +++ b/arch/powerpc/kernel/ucall.S @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Generic code to perform an ultravisor call. + * + * Copyright 2019, IBM Corporation. 
+ * + */ +#include <asm/ppc_asm.h> +#include <asm/export.h> + +_GLOBAL(ucall_norets) +EXPORT_SYMBOL_GPL(ucall_norets) + sc 2 /* Invoke the ultravisor */ + blr /* Return r3 = status */ diff --git a/arch/powerpc/kernel/udbg.c b/arch/powerpc/kernel/udbg.c index a384e7c8b01c..01595e8cafe7 100644 --- a/arch/powerpc/kernel/udbg.c +++ b/arch/powerpc/kernel/udbg.c @@ -120,13 +120,15 @@ int udbg_write(const char *s, int n) #define UDBG_BUFSIZE 256 void udbg_printf(const char *fmt, ...) { - char buf[UDBG_BUFSIZE]; - va_list args; + if (udbg_putc) { + char buf[UDBG_BUFSIZE]; + va_list args; - va_start(args, fmt); - vsnprintf(buf, UDBG_BUFSIZE, fmt, args); - udbg_puts(buf); - va_end(args); + va_start(args, fmt); + vsnprintf(buf, UDBG_BUFSIZE, fmt, args); + udbg_puts(buf); + va_end(args); + } } void __init udbg_progress(char *s, unsigned short hex) diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c index d60598113a9f..b9a108411c0d 100644 --- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -94,28 +94,6 @@ static struct vdso_patch_def vdso_patches[] = { CPU_FTR_COHERENT_ICACHE, CPU_FTR_COHERENT_ICACHE, "__kernel_sync_dicache", "__kernel_sync_dicache_p5" }, -#ifdef CONFIG_PPC32 - { - CPU_FTR_USE_RTC, CPU_FTR_USE_RTC, - "__kernel_gettimeofday", NULL - }, - { - CPU_FTR_USE_RTC, CPU_FTR_USE_RTC, - "__kernel_clock_gettime", NULL - }, - { - CPU_FTR_USE_RTC, CPU_FTR_USE_RTC, - "__kernel_clock_getres", NULL - }, - { - CPU_FTR_USE_RTC, CPU_FTR_USE_RTC, - "__kernel_get_tbfreq", NULL - }, - { - CPU_FTR_USE_RTC, CPU_FTR_USE_RTC, - "__kernel_time", NULL - }, -#endif }; /* @@ -750,11 +728,6 @@ static int __init vdso_init(void) */ vdso64_pages = (&vdso64_end - &vdso64_start) >> PAGE_SHIFT; DBG("vdso64_kbase: %p, 0x%x pages\n", vdso64_kbase, vdso64_pages); -#else - vdso_data->dcache_block_size = L1_CACHE_BYTES; - vdso_data->dcache_log_block_size = L1_CACHE_SHIFT; - vdso_data->icache_block_size = L1_CACHE_BYTES; - vdso_data->icache_log_block_size = 
L1_CACHE_SHIFT; #endif /* CONFIG_PPC64 */ diff --git a/arch/powerpc/kernel/vdso32/Makefile b/arch/powerpc/kernel/vdso32/Makefile index 06f54d947057..e147bbdc12cd 100644 --- a/arch/powerpc/kernel/vdso32/Makefile +++ b/arch/powerpc/kernel/vdso32/Makefile @@ -2,9 +2,7 @@ # List of files in the vdso, has to be asm only for now -obj-vdso32-$(CONFIG_PPC64) = getcpu.o -obj-vdso32 = sigtramp.o gettimeofday.o datapage.o cacheflush.o note.o \ - $(obj-vdso32-y) +obj-vdso32 = sigtramp.o gettimeofday.o datapage.o cacheflush.o note.o getcpu.o # Build rules diff --git a/arch/powerpc/kernel/vdso32/cacheflush.S b/arch/powerpc/kernel/vdso32/cacheflush.S index 7f882e7b9f43..3440ddf21c8b 100644 --- a/arch/powerpc/kernel/vdso32/cacheflush.S +++ b/arch/powerpc/kernel/vdso32/cacheflush.S @@ -8,7 +8,9 @@ #include <asm/processor.h> #include <asm/ppc_asm.h> #include <asm/vdso.h> +#include <asm/vdso_datapage.h> #include <asm/asm-offsets.h> +#include <asm/cache.h> .text @@ -22,42 +24,62 @@ */ V_FUNCTION_BEGIN(__kernel_sync_dicache) .cfi_startproc +#ifdef CONFIG_PPC64 mflr r12 .cfi_register lr,r12 - mr r11,r3 - bl __get_datapage@local + get_datapage r10, r0 mtlr r12 - mr r10,r3 +#endif +#ifdef CONFIG_PPC64 lwz r7,CFG_DCACHE_BLOCKSZ(r10) addi r5,r7,-1 - andc r6,r11,r5 /* round low to line bdy */ +#else + li r5, L1_CACHE_BYTES - 1 +#endif + andc r6,r3,r5 /* round low to line bdy */ subf r8,r6,r4 /* compute length */ add r8,r8,r5 /* ensure we get enough */ +#ifdef CONFIG_PPC64 lwz r9,CFG_DCACHE_LOGBLOCKSZ(r10) srw. r8,r8,r9 /* compute line count */ +#else + srwi. r8, r8, L1_CACHE_SHIFT + mr r7, r6 +#endif crclr cr0*4+so beqlr /* nothing to do? 
*/ mtctr r8 1: dcbst 0,r6 +#ifdef CONFIG_PPC64 add r6,r6,r7 +#else + addi r6, r6, L1_CACHE_BYTES +#endif bdnz 1b sync /* Now invalidate the instruction cache */ +#ifdef CONFIG_PPC64 lwz r7,CFG_ICACHE_BLOCKSZ(r10) addi r5,r7,-1 - andc r6,r11,r5 /* round low to line bdy */ + andc r6,r3,r5 /* round low to line bdy */ subf r8,r6,r4 /* compute length */ add r8,r8,r5 lwz r9,CFG_ICACHE_LOGBLOCKSZ(r10) srw. r8,r8,r9 /* compute line count */ crclr cr0*4+so beqlr /* nothing to do? */ +#endif mtctr r8 +#ifdef CONFIG_PPC64 2: icbi 0,r6 add r6,r6,r7 +#else +2: icbi 0, r7 + addi r7, r7, L1_CACHE_BYTES +#endif bdnz 2b isync li r3,0 diff --git a/arch/powerpc/kernel/vdso32/datapage.S b/arch/powerpc/kernel/vdso32/datapage.S index 6984125b9fc0..217bb630f8f9 100644 --- a/arch/powerpc/kernel/vdso32/datapage.S +++ b/arch/powerpc/kernel/vdso32/datapage.S @@ -10,35 +10,13 @@ #include <asm/asm-offsets.h> #include <asm/unistd.h> #include <asm/vdso.h> +#include <asm/vdso_datapage.h> .text .global __kernel_datapage_offset; __kernel_datapage_offset: .long 0 -V_FUNCTION_BEGIN(__get_datapage) - .cfi_startproc - /* We don't want that exposed or overridable as we want other objects - * to be able to bl directly to here - */ - .protected __get_datapage - .hidden __get_datapage - - mflr r0 - .cfi_register lr,r0 - - bcl 20,31,data_page_branch -data_page_branch: - mflr r3 - mtlr r0 - addi r3, r3, __kernel_datapage_offset-data_page_branch - lwz r0,0(r3) - .cfi_restore lr - add r3,r0,r3 - blr - .cfi_endproc -V_FUNCTION_END(__get_datapage) - /* * void *__kernel_get_syscall_map(unsigned int *syscall_count) ; * @@ -52,11 +30,10 @@ V_FUNCTION_BEGIN(__kernel_get_syscall_map) .cfi_startproc mflr r12 .cfi_register lr,r12 - mr r4,r3 - bl __get_datapage@local + mr. 
r4,r3 + get_datapage r3, r0 mtlr r12 addi r3,r3,CFG_SYSCALL_MAP32 - cmpli cr0,r4,0 beqlr li r0,NR_syscalls stw r0,0(r4) @@ -70,11 +47,12 @@ V_FUNCTION_END(__kernel_get_syscall_map) * * returns the timebase frequency in HZ */ +#ifndef CONFIG_PPC_BOOK3S_601 V_FUNCTION_BEGIN(__kernel_get_tbfreq) .cfi_startproc mflr r12 .cfi_register lr,r12 - bl __get_datapage@local + get_datapage r3, r0 lwz r4,(CFG_TB_TICKS_PER_SEC + 4)(r3) lwz r3,CFG_TB_TICKS_PER_SEC(r3) mtlr r12 @@ -82,3 +60,4 @@ V_FUNCTION_BEGIN(__kernel_get_tbfreq) blr .cfi_endproc V_FUNCTION_END(__kernel_get_tbfreq) +#endif diff --git a/arch/powerpc/kernel/vdso32/getcpu.S b/arch/powerpc/kernel/vdso32/getcpu.S index 63e914539e1a..ff5e214fec41 100644 --- a/arch/powerpc/kernel/vdso32/getcpu.S +++ b/arch/powerpc/kernel/vdso32/getcpu.S @@ -15,6 +15,7 @@ * int __kernel_getcpu(unsigned *cpu, unsigned *node); * */ +#if defined(CONFIG_PPC64) V_FUNCTION_BEGIN(__kernel_getcpu) .cfi_startproc mfspr r5,SPRN_SPRG_VDSO_READ @@ -24,10 +25,26 @@ V_FUNCTION_BEGIN(__kernel_getcpu) rlwinm r7,r5,16,31-15,31-0 beq cr0,1f stw r6,0(r3) -1: beq cr1,2f - stw r7,0(r4) -2: crclr cr0*4+so +1: crclr cr0*4+so li r3,0 /* always success */ + beqlr cr1 + stw r7,0(r4) + blr + .cfi_endproc +V_FUNCTION_END(__kernel_getcpu) +#elif !defined(CONFIG_SMP) +V_FUNCTION_BEGIN(__kernel_getcpu) + .cfi_startproc + cmpwi cr0, r3, 0 + cmpwi cr1, r4, 0 + li r5, 0 + beq cr0, 1f + stw r5, 0(r3) +1: li r3, 0 /* always success */ + crclr cr0*4+so + beqlr cr1 + stw r5, 0(r4) blr .cfi_endproc V_FUNCTION_END(__kernel_getcpu) +#endif diff --git a/arch/powerpc/kernel/vdso32/gettimeofday.S b/arch/powerpc/kernel/vdso32/gettimeofday.S index becd9f8767ed..a3951567118a 100644 --- a/arch/powerpc/kernel/vdso32/gettimeofday.S +++ b/arch/powerpc/kernel/vdso32/gettimeofday.S @@ -9,16 +9,15 @@ #include <asm/processor.h> #include <asm/ppc_asm.h> #include <asm/vdso.h> +#include <asm/vdso_datapage.h> #include <asm/asm-offsets.h> #include <asm/unistd.h> /* Offset for the low 32-bit part 
of a field of long type */ #ifdef CONFIG_PPC64 #define LOPART 4 -#define TSPEC_TV_SEC TSPC64_TV_SEC+LOPART #else #define LOPART 0 -#define TSPEC_TV_SEC TSPC32_TV_SEC #endif .text @@ -33,28 +32,26 @@ V_FUNCTION_BEGIN(__kernel_gettimeofday) mflr r12 .cfi_register lr,r12 - mr r10,r3 /* r10 saves tv */ + mr. r10,r3 /* r10 saves tv */ mr r11,r4 /* r11 saves tz */ - bl __get_datapage@local /* get data page */ - mr r9, r3 /* datapage ptr in r9 */ - cmplwi r10,0 /* check if tv is NULL */ + get_datapage r9, r0 beq 3f - lis r7,1000000@ha /* load up USEC_PER_SEC */ - addi r7,r7,1000000@l /* so we get microseconds in r4 */ + LOAD_REG_IMMEDIATE(r7, 1000000) /* load up USEC_PER_SEC */ bl __do_get_tspec@local /* get sec/usec from tb & kernel */ stw r3,TVAL32_TV_SEC(r10) stw r4,TVAL32_TV_USEC(r10) 3: cmplwi r11,0 /* check if tz is NULL */ - beq 1f + mtlr r12 + crclr cr0*4+so + li r3,0 + beqlr + lwz r4,CFG_TZ_MINUTEWEST(r9)/* fill tz */ lwz r5,CFG_TZ_DSTTIME(r9) stw r4,TZONE_TZ_MINWEST(r11) stw r5,TZONE_TZ_DSTTIME(r11) -1: mtlr r12 - crclr cr0*4+so - li r3,0 blr .cfi_endproc V_FUNCTION_END(__kernel_gettimeofday) @@ -71,17 +68,23 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime) cmpli cr0,r3,CLOCK_REALTIME cmpli cr1,r3,CLOCK_MONOTONIC cror cr0*4+eq,cr0*4+eq,cr1*4+eq - bne cr0,99f + + cmpli cr5,r3,CLOCK_REALTIME_COARSE + cmpli cr6,r3,CLOCK_MONOTONIC_COARSE + cror cr5*4+eq,cr5*4+eq,cr6*4+eq + + cror cr0*4+eq,cr0*4+eq,cr5*4+eq + bne cr0, .Lgettime_fallback mflr r12 /* r12 saves lr */ .cfi_register lr,r12 mr r11,r4 /* r11 saves tp */ - bl __get_datapage@local /* get data page */ - mr r9,r3 /* datapage ptr in r9 */ - lis r7,NSEC_PER_SEC@h /* want nanoseconds */ - ori r7,r7,NSEC_PER_SEC@l -50: bl __do_get_tspec@local /* get sec/nsec from tb & kernel */ - bne cr1,80f /* not monotonic -> all done */ + get_datapage r9, r0 + LOAD_REG_IMMEDIATE(r7, NSEC_PER_SEC) /* load up NSEC_PER_SEC */ + beq cr5, .Lcoarse_clocks +.Lprecise_clocks: + bl __do_get_tspec@local /* get sec/nsec from tb & kernel */ + 
bne cr1, .Lfinish /* not monotonic -> all done */ /* * CLOCK_MONOTONIC @@ -105,12 +108,53 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime) add r9,r9,r0 lwz r0,(CFG_TB_UPDATE_COUNT+LOPART)(r9) cmpl cr0,r8,r0 /* check if updated */ - bne- 50b + bne- .Lprecise_clocks + b .Lfinish_monotonic + + /* + * For coarse clocks we get data directly from the vdso data page, so + * we don't need to call __do_get_tspec, but we still need to do the + * counter trick. + */ +.Lcoarse_clocks: + lwz r8,(CFG_TB_UPDATE_COUNT+LOPART)(r9) + andi. r0,r8,1 /* pending update ? loop */ + bne- .Lcoarse_clocks + add r9,r9,r0 /* r0 is already 0 */ + + /* + * CLOCK_REALTIME_COARSE, below values are needed for MONOTONIC_COARSE + * too + */ + lwz r3,STAMP_XTIME_SEC+LOPART(r9) + lwz r4,STAMP_XTIME_NSEC+LOPART(r9) + bne cr6,1f + + /* CLOCK_MONOTONIC_COARSE */ + lwz r5,(WTOM_CLOCK_SEC+LOPART)(r9) + lwz r6,WTOM_CLOCK_NSEC(r9) + + /* check if counter has updated */ + or r0,r6,r5 +1: or r0,r0,r3 + or r0,r0,r4 + xor r0,r0,r0 + add r3,r3,r0 + lwz r0,CFG_TB_UPDATE_COUNT+LOPART(r9) + cmpl cr0,r0,r8 /* check if updated */ + bne- .Lcoarse_clocks + + /* Counter has not updated, so continue calculating proper values for + * sec and nsec if monotonic coarse, or just return with the proper + * values for realtime. + */ + bne cr6, .Lfinish /* Calculate and store result. Note that this mimics the C code, * which may cause funny results if nsec goes negative... is that * possible at all ? 
*/ +.Lfinish_monotonic: add r3,r3,r5 add r4,r4,r6 cmpw cr0,r4,r7 @@ -118,11 +162,12 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime) blt 1f subf r4,r7,r4 addi r3,r3,1 -1: bge cr1,80f +1: bge cr1, .Lfinish addi r3,r3,-1 add r4,r4,r7 -80: stw r3,TSPC32_TV_SEC(r11) +.Lfinish: + stw r3,TSPC32_TV_SEC(r11) stw r4,TSPC32_TV_NSEC(r11) mtlr r12 @@ -133,7 +178,7 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime) /* * syscall fallback */ -99: +.Lgettime_fallback: li r0,__NR_clock_gettime .cfi_restore lr sc @@ -151,27 +196,33 @@ V_FUNCTION_END(__kernel_clock_gettime) V_FUNCTION_BEGIN(__kernel_clock_getres) .cfi_startproc /* Check for supported clock IDs */ - cmpwi cr0,r3,CLOCK_REALTIME - cmpwi cr1,r3,CLOCK_MONOTONIC - cror cr0*4+eq,cr0*4+eq,cr1*4+eq - bne cr0,99f + cmplwi cr0, r3, CLOCK_MAX + cmpwi cr1, r3, CLOCK_REALTIME_COARSE + cmpwi cr7, r3, CLOCK_MONOTONIC_COARSE + bgt cr0, 99f + LOAD_REG_IMMEDIATE(r5, KTIME_LOW_RES) + beq cr1, 1f + beq cr7, 1f - li r3,0 + mflr r12 + .cfi_register lr,r12 + get_datapage r3, r0 + lwz r5, CLOCK_HRTIMER_RES(r3) + mtlr r12 +1: li r3,0 cmpli cr0,r4,0 crclr cr0*4+so beqlr - lis r5,CLOCK_REALTIME_RES@h - ori r5,r5,CLOCK_REALTIME_RES@l stw r3,TSPC32_TV_SEC(r4) stw r5,TSPC32_TV_NSEC(r4) blr /* - * syscall fallback + * invalid clock */ 99: - li r0,__NR_clock_getres - sc + li r3, EINVAL + crset so blr .cfi_endproc V_FUNCTION_END(__kernel_clock_getres) @@ -189,16 +240,15 @@ V_FUNCTION_BEGIN(__kernel_time) .cfi_register lr,r12 mr r11,r3 /* r11 holds t */ - bl __get_datapage@local - mr r9, r3 /* datapage ptr in r9 */ + get_datapage r9, r0 - lwz r3,STAMP_XTIME+TSPEC_TV_SEC(r9) + lwz r3,STAMP_XTIME_SEC+LOPART(r9) cmplwi r11,0 /* check if t is NULL */ - beq 2f - stw r3,0(r11) /* store result at *t */ -2: mtlr r12 + mtlr r12 crclr cr0*4+so + beqlr + stw r3,0(r11) /* store result at *t */ blr .cfi_endproc V_FUNCTION_END(__kernel_time) @@ -268,7 +318,7 @@ __do_get_tspec: * as a 32.32 fixed-point number in r3 and r4. * Load & add the xtime stamp. 
*/ - lwz r5,STAMP_XTIME+TSPEC_TV_SEC(r9) + lwz r5,STAMP_XTIME_SEC+LOPART(r9) lwz r6,STAMP_SEC_FRAC(r9) addc r4,r4,r6 adde r3,r3,r5 diff --git a/arch/powerpc/kernel/vdso32/vdso32.lds.S b/arch/powerpc/kernel/vdso32/vdso32.lds.S index 099a6db14e67..5206c2eb2a1d 100644 --- a/arch/powerpc/kernel/vdso32/vdso32.lds.S +++ b/arch/powerpc/kernel/vdso32/vdso32.lds.S @@ -144,18 +144,20 @@ VERSION __kernel_datapage_offset; __kernel_get_syscall_map; +#ifndef CONFIG_PPC_BOOK3S_601 __kernel_gettimeofday; __kernel_clock_gettime; __kernel_clock_getres; + __kernel_time; __kernel_get_tbfreq; +#endif __kernel_sync_dicache; __kernel_sync_dicache_p5; __kernel_sigtramp32; __kernel_sigtramp_rt32; -#ifdef CONFIG_PPC64 +#if defined(CONFIG_PPC64) || !defined(CONFIG_SMP) __kernel_getcpu; #endif - __kernel_time; local: *; }; diff --git a/arch/powerpc/kernel/vdso64/cacheflush.S b/arch/powerpc/kernel/vdso64/cacheflush.S index 3f92561a64c4..526f5ba2593e 100644 --- a/arch/powerpc/kernel/vdso64/cacheflush.S +++ b/arch/powerpc/kernel/vdso64/cacheflush.S @@ -35,7 +35,7 @@ V_FUNCTION_BEGIN(__kernel_sync_dicache) subf r8,r6,r4 /* compute length */ add r8,r8,r5 /* ensure we get enough */ lwz r9,CFG_DCACHE_LOGBLOCKSZ(r10) - srw. r8,r8,r9 /* compute line count */ + srd. r8,r8,r9 /* compute line count */ crclr cr0*4+so beqlr /* nothing to do? */ mtctr r8 @@ -52,7 +52,7 @@ V_FUNCTION_BEGIN(__kernel_sync_dicache) subf r8,r6,r4 /* compute length */ add r8,r8,r5 lwz r9,CFG_ICACHE_LOGBLOCKSZ(r10) - srw. r8,r8,r9 /* compute line count */ + srd. r8,r8,r9 /* compute line count */ crclr cr0*4+so beqlr /* nothing to do? 
*/ mtctr r8 diff --git a/arch/powerpc/kernel/vdso64/gettimeofday.S b/arch/powerpc/kernel/vdso64/gettimeofday.S index 07bfe33fe874..1c9a04703250 100644 --- a/arch/powerpc/kernel/vdso64/gettimeofday.S +++ b/arch/powerpc/kernel/vdso64/gettimeofday.S @@ -116,8 +116,8 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime) * CLOCK_REALTIME_COARSE, below values are needed for MONOTONIC_COARSE * too */ - ld r4,STAMP_XTIME+TSPC64_TV_SEC(r3) - ld r5,STAMP_XTIME+TSPC64_TV_NSEC(r3) + ld r4,STAMP_XTIME_SEC(r3) + ld r5,STAMP_XTIME_NSEC(r3) bne cr6,75f /* CLOCK_MONOTONIC_COARSE */ @@ -186,12 +186,15 @@ V_FUNCTION_BEGIN(__kernel_clock_getres) cror cr0*4+eq,cr0*4+eq,cr1*4+eq bne cr0,99f + mflr r12 + .cfi_register lr,r12 + bl V_LOCAL_FUNC(__get_datapage) + lwz r5, CLOCK_HRTIMER_RES(r3) + mtlr r12 li r3,0 cmpldi cr0,r4,0 crclr cr0*4+so beqlr - lis r5,CLOCK_REALTIME_RES@h - ori r5,r5,CLOCK_REALTIME_RES@l std r3,TSPC64_TV_SEC(r4) std r5,TSPC64_TV_NSEC(r4) blr @@ -220,7 +223,7 @@ V_FUNCTION_BEGIN(__kernel_time) mr r11,r3 /* r11 holds t */ bl V_LOCAL_FUNC(__get_datapage) - ld r4,STAMP_XTIME+TSPC64_TV_SEC(r3) + ld r4,STAMP_XTIME_SEC(r3) cmpldi r11,0 /* check if t is NULL */ beq 2f @@ -265,7 +268,7 @@ V_FUNCTION_BEGIN(__do_get_tspec) mulhdu r6,r6,r5 /* in units of 2^-32 seconds */ /* Add stamp since epoch */ - ld r4,STAMP_XTIME+TSPC64_TV_SEC(r3) + ld r4,STAMP_XTIME_SEC(r3) lwz r5,STAMP_SEC_FRAC(r3) or r0,r4,r5 or r0,r0,r6 diff --git a/arch/powerpc/kernel/vector.S b/arch/powerpc/kernel/vector.S index 8eb867dbad5f..25c14a0981bf 100644 --- a/arch/powerpc/kernel/vector.S +++ b/arch/powerpc/kernel/vector.S @@ -67,6 +67,9 @@ _GLOBAL(load_up_altivec) #ifdef CONFIG_PPC32 mfspr r5,SPRN_SPRG_THREAD /* current task's THREAD (phys) */ oris r9,r9,MSR_VEC@h +#ifdef CONFIG_VMAP_STACK + tovirt(r5, r5) +#endif #else ld r4,PACACURRENT(r13) addi r5,r4,THREAD /* Get THREAD */ diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index 060a1acd7c6d..b4c89a1acebb 100644 --- 
a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -6,6 +6,8 @@ #endif #define BSS_FIRST_SECTIONS *(.bss.prominit) +#define EMITS_PT_NOTE +#define RO_EXCEPTION_TABLE_ALIGN 0 #include <asm/page.h> #include <asm-generic/vmlinux.lds.h> @@ -18,22 +20,8 @@ ENTRY(_stext) PHDRS { - kernel PT_LOAD FLAGS(7); /* RWX */ - notes PT_NOTE FLAGS(0); - dummy PT_NOTE FLAGS(0); - - /* binutils < 2.18 has a bug that makes it misbehave when taking an - ELF file with all segments at load address 0 as input. This - happens when running "strip" on vmlinux, because of the AT() magic - in this linker script. People using GCC >= 4.2 won't run into - this problem, because the "build-id" support will put some data - into the "notes" segment (at a non-zero load address). - - To work around this, we force some data into both the "dummy" - segment and the kernel segment, so the dummy segment will get a - non-zero load address. It's not enough to always create the - "notes" segment, since if nothing gets assigned to it, its load - address will be zero. */ + text PT_LOAD FLAGS(7); /* RWX */ + note PT_NOTE FLAGS(0); } #ifdef CONFIG_PPC64 @@ -77,7 +65,7 @@ SECTIONS #else /* !CONFIG_PPC64 */ HEAD_TEXT #endif - } :kernel + } :text __head_end = .; @@ -126,7 +114,7 @@ SECTIONS __got2_end = .; #endif /* CONFIG_PPC32 */ - } :kernel + } :text . = ALIGN(ETEXT_ALIGN_SIZE); _etext = .; @@ -175,17 +163,6 @@ SECTIONS __stop__btb_flush_fixup = .; } #endif - EXCEPTION_TABLE(0) - - NOTES :kernel :notes - - /* The dummy segment contents for the bug workaround mentioned above - near PHDRS. 
*/ - .dummy : AT(ADDR(.dummy) - LOAD_OFFSET) { - LONG(0) - LONG(0) - LONG(0) - } :kernel :dummy /* * Init sections discarded at runtime @@ -200,7 +177,7 @@ SECTIONS #ifdef CONFIG_PPC64 *(.tramp.ftrace.init); #endif - } :kernel + } :text /* .exit.text is discarded at runtime, not link time, * to deal with references from __bug_table @@ -346,7 +323,7 @@ SECTIONS #endif /* The initial task and kernel stack */ - INIT_TASK_DATA_SECTION(THREAD_SIZE) + INIT_TASK_DATA_SECTION(THREAD_ALIGN) .data..page_aligned : AT(ADDR(.data..page_aligned) - LOAD_OFFSET) { PAGE_ALIGNED_DATA(PAGE_SIZE) |